[
  {
    "path": ".gitignore",
    "content": "# Ignore Git here\n.git\n\n# But not these files...\n# !.gitignore\n\ncheckpoints/test_local/opt.txt\nPWCNet/pwc_net.pth.tar\nMegaDepth/checkpoints/*\nmodel_weights/*\nMiddleBurySet/*\n\n.nfs*\n\n# Created by .ignore support plugin (hsz.mobi)\n### Python template\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nenv/\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\n*.egg-info/\n.installed.cfg\n*.egg\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*,cover\n.hypothesis/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# IPython Notebook\n.ipynb_checkpoints\n\n# pyenv\n.python-version\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# dotenv\n.env\n\n# virtualenv\nvenv/\nENV/\n\n# Spyder project settings\n.spyderproject\n\n# Rope project settings\n.ropeproject\n### VirtualEnv template\n# Virtualenv\n# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/\n.Python\n[Bb]in\n[Ii]nclude\n[Ll]ib\n[Ll]ib64\n[Ll]ocal\n[Ss]cripts\npyvenv.cfg\n.venv\npip-selfcheck.json\n### JetBrains template\n# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm\n# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839\n\n# User-specific stuff:\n.idea/workspace.xml\n.idea/tasks.xml\n.idea/dictionaries\n.idea/vcs.xml\n.idea/jsLibraryMappings.xml\n\n# Sensitive or high-churn 
files:\n.idea/dataSources.ids\n.idea/dataSources.xml\n.idea/dataSources.local.xml\n.idea/sqlDataSources.xml\n.idea/dynamic.xml\n.idea/uiDesigner.xml\n\n# Gradle:\n.idea/gradle.xml\n.idea/libraries\n\n# Mongo Explorer plugin:\n.idea/mongoSettings.xml\n\n.idea/\n\n## File-based project format:\n*.iws\n\n## Plugin-specific files:\n\n# IntelliJ\n/out/\n\n# mpeltonen/sbt-idea plugin\n.idea_modules/\n\n# JIRA plugin\natlassian-ide-plugin.xml\n\n# Crashlytics plugin (for Android Studio and IntelliJ)\ncom_crashlytics_export_strings.xml\ncrashlytics.properties\ncrashlytics-build.properties\nfabric.properties\n\n"
  },
  {
    "path": "AverageMeter.py",
    "content": "\n\nclass AverageMeter(object):\n    \"\"\"Computes and stores the average and current value\"\"\"\n    def __init__(self):\n        self.reset()\n\n    def reset(self):\n        self.val = 0\n        self.avg = 0\n        self.sum = 0\n        self.count = 0\n\n    def update(self, val, n=1):\n        self.val = val\n        self.sum += val * n\n        self.count += n\n        self.avg = self.sum / self.count\n"
  },
  {
    "path": "Colab_DAIN.ipynb",
    "content": "{\n  \"nbformat\": 4,\n  \"nbformat_minor\": 0,\n  \"metadata\": {\n    \"colab\": {\n      \"name\": \"Colab_DAIN_new.ipynb\",\n      \"private_outputs\": true,\n      \"provenance\": [],\n      \"collapsed_sections\": [],\n      \"toc_visible\": true\n    },\n    \"kernelspec\": {\n      \"name\": \"python3\",\n      \"display_name\": \"Python 3\"\n    },\n    \"accelerator\": \"GPU\"\n  },\n  \"cells\": [\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"1pIo4r_Y8cMo\"\n      },\n      \"source\": [\n        \"# DAIN Colab\"\n      ]\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"iGPHW5SOpPe3\"\n      },\n      \"source\": [\n        \"*DAIN Colab, v1.6.0*\\n\",\n        \"\\n\",\n        \"Based on the [original Colab file](https://github.com/baowenbo/DAIN/issues/44) by btahir. \\n\",\n        \"\\n\",\n        \"Enhancements by [Styler00Dollar](https://github.com/styler00dollar) aka \\\"sudo rm -rf / --no-preserve-root#8353\\\" on discord and [Alpha](https://github.com/AlphaGit), (Alpha#6137 on Discord). Please do not run this command in your linux terminal. It's rather meant as a joke.\\n\",\n        \"\\n\",\n        \"[Styler00Dollar's fork](https://github.com/styler00dollar/DAIN) / [Alpha's fork](https://github.com/AlphaGit/DAIN)\\n\",\n        \"\\n\",\n        \"A simple guide:\\n\",\n        \"- Upload this ` .ipynb`  file to your Google Colab.\\n\",\n        \"- Create a folder inside of Google Drive named \\\"DAIN\\\"\\n\",\n        \"- Change the configurations in the next cell\\n\",\n        \"- Run cells one by one\\n\",\n        \"\\n\",\n        \"Stuff that should be improved:\\n\",\n        \"- Alpha channel will be removed automatically and won't be added back. 
Anything related to alpha will be converted to black.\\n\",\n        \"- Adding configuration to select speed\\n\",\n        \"- Detect scenes to avoid interpolating scene-changes\\n\",\n        \"- Auto-resume\\n\",\n        \"- Copy `start_frame` - `end_frame` audio from original input to final output\\n\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"enKoi0TR2fOD\",\n        \"cellView\": \"form\"\n      },\n      \"source\": [\n        \"################# Required Configurations ############################\\n\",\n        \"\\n\",\n        \"#@markdown # Required Configuration\\n\",\n        \"#@markdown Use the values in here to configure what you'd like DAIN to do.\\n\",\n        \"\\n\",\n        \"#@markdown ## Input file\\n\",\n        \"#@markdown Path (relative to the root of your Google Drive) to the input file. For instance, if you save your `example.mkv` file in your Google Drive, inside a `videos` folder, the path would be: `videos/example.mkv`. Currenly videos and gifs are supported.\\n\",\n        \"INPUT_FILEPATH = \\\"DAIN/input.mp4\\\" #@param{type:\\\"string\\\"}\\n\",\n        \"\\n\",\n        \"#@markdown ## Output file\\n\",\n        \"#@markdown Output file path: path (relative to the root of your Google Drive) for the output file. It will also determine the filetype in the destination. `.mp4` is recommended for video input, `.gif` for gif inputs.\\n\",\n        \"OUTPUT_FILE_PATH = \\\"DAIN/output.mp4\\\" #@param{type:\\\"string\\\"}\\n\",\n        \"\\n\",\n        \"################# Optional configurations ############################\\n\",\n        \"\\n\",\n        \"#@markdown # Optional Configuration\\n\",\n        \"#@markdown Parameters below can be left with their defaults, but feel free to adapt them to your needs.\\n\",\n        \"\\n\",\n        \"#@markdown ## Target FPS\\n\",\n        \"#@markdown  how many frames per second should the result have. 
This will determine how many intermediate images are interpolated.\\n\",\n        \"TARGET_FPS = 60 #@param{type:\\\"number\\\"}\\n\",\n        \"\\n\",\n        \"#@markdown ## Frame input directory\\n\",\n        \"#@markdown A path, relative to your GDrive root, where you already have the list of frames in the format 00001.png, 00002.png, etc.\\n\",\n        \"FRAME_INPUT_DIR = '/content/DAIN/input_frames' #@param{type:\\\"string\\\"}\\n\",\n        \"\\n\",\n        \"#@markdown ## Frame output directory\\n\",\n        \"#@markdown A path, relative to your GDrive root, where you want the generated frame.\\n\",\n        \"FRAME_OUTPUT_DIR = '/content/DAIN/output_frames' #@param{type:\\\"string\\\"}\\n\",\n        \"\\n\",\n        \"#@markdown ## Start Frame\\n\",\n        \"#@markdown First frame to consider from the video when processing.\\n\",\n        \"START_FRAME = 1 #@param{type:\\\"number\\\"}\\n\",\n        \"\\n\",\n        \"#@markdown ## End Frame\\n\",\n        \"#@markdown Last frame to consider from the video when processing. To use the whole video use `-1`.\\n\",\n        \"END_FRAME = -1 #@param{type:\\\"number\\\"}\\n\",\n        \"\\n\",\n        \"#@markdown ## Seamless playback\\n\",\n        \"#@markdown Creates a seamless loop by using the first frame as last one as well. Set this to True this if loop is intended.\\n\",\n        \"SEAMLESS = False #@param{type:\\\"boolean\\\"}\\n\",\n        \"\\n\",\n        \"#@markdown ## Auto-remove PNG directory\\n\",\n        \"#@markdown Auto-delete output PNG dir after ffmpeg video creation. 
Set this to `False` if you want to keep the PNG files.\\n\",\n        \"AUTO_REMOVE = True #@param{type:\\\"boolean\\\"}\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"N9cGwalNeyk9\",\n        \"cellView\": \"form\"\n      },\n      \"source\": [\n        \"#@title Connect Google Drive\\n\",\n        \"from google.colab import drive\\n\",\n        \"drive.mount('/content/gdrive')\\n\",\n        \"print('Google Drive connected.')\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"irzjv1x4e3S4\",\n        \"cellView\": \"form\"\n      },\n      \"source\": [\n        \"#@title Check your current GPU\\n\",\n        \"# If you are lucky, you get 16GB VRAM. If you are not lucky, you get less. VRAM is important. The more VRAM, the higher the maximum resolution will go.\\n\",\n        \"\\n\",\n        \"# 16GB: Can handle 720p. 1080p will procude an out-of-memory error. \\n\",\n        \"# 8GB: Can handle 480p. 720p will produce an out-of-memory error.\\n\",\n        \"\\n\",\n        \"!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"markdown\",\n      \"metadata\": {\n        \"id\": \"UYHTTP91oMvh\"\n      },\n      \"source\": [\n        \"# Install dependencies.\\n\",\n        \"\\n\",\n        \"This next step may take somewhere between 15-20 minutes. Run this only once at startup.\\n\",\n        \"\\n\",\n        \"Look for the \\\"Finished installing dependencies\\\"  message.\"\n      ]\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"e5AHGetTRacZ\",\n        \"cellView\": \"form\"\n      },\n      \"source\": [\n        \"#@title Setup everything. This takes a while. 
Just wait ~20 minutes in total.\\n\",\n        \"\\n\",\n        \"# Install old pytorch to avoid faulty output\\n\",\n        \"%cd /content/\\n\",\n        \"!wget -c https://repo.anaconda.com/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh\\n\",\n        \"!chmod +x Miniconda3-4.5.4-Linux-x86_64.sh\\n\",\n        \"!bash ./Miniconda3-4.5.4-Linux-x86_64.sh -b -f -p /usr/local\\n\",\n        \"!conda install pytorch==1.1 cudatoolkit torchvision -c pytorch -y\\n\",\n        \"!conda install ipykernel -y\\n\",\n        \"\\n\",\n        \"!pip install scipy==1.1.0\\n\",\n        \"!pip install imageio\\n\",\n        \"!CUDA_VISIBLE_DEVICES=0\\n\",\n        \"!sudo apt-get install imagemagick imagemagick-doc\\n\",\n        \"print(\\\"Finished installing dependencies.\\\")\\n\",\n        \"\\n\",\n        \"# Clone DAIN sources\\n\",\n        \"%cd /content\\n\",\n        \"!git clone -b master --depth 1 https://github.com/baowenbo/DAIN /content/DAIN\\n\",\n        \"%cd /content/DAIN\\n\",\n        \"!git log -1\\n\",\n        \"\\n\",\n        \"# Building DAIN\\n\",\n        \"%cd /content/DAIN/my_package/\\n\",\n        \"!./build.sh\\n\",\n        \"print(\\\"Building #1 done.\\\")\\n\",\n        \"\\n\",\n        \"# Building DAIN PyTorch correlation package.\\n\",\n        \"%cd /content/DAIN/PWCNet/correlation_package_pytorch1_0\\n\",\n        \"!./build.sh\\n\",\n        \"print(\\\"Building #2 done.\\\")\\n\",\n        \"\\n\",\n        \"# Downloading pre-trained model\\n\",\n        \"%cd /content/DAIN\\n\",\n        \"!mkdir model_weights\\n\",\n        \"!wget -O model_weights/best.pth http://vllab1.ucmerced.edu/~wenbobao/DAIN/best.pth\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"zm5kn6vTncL4\",\n        \"cellView\": \"form\"\n      },\n      \"source\": [\n        \"#@title Detecting FPS of input file.\\n\",\n        \"%shell yes | cp -f 
/content/gdrive/My\\\\ Drive/{INPUT_FILEPATH} /content/DAIN/\\n\",\n        \"\\n\",\n        \"import os\\n\",\n        \"filename = os.path.basename(INPUT_FILEPATH)\\n\",\n        \"\\n\",\n        \"import cv2\\n\",\n        \"cap = cv2.VideoCapture(f'/content/DAIN/{filename}')\\n\",\n        \"\\n\",\n        \"fps = cap.get(cv2.CAP_PROP_FPS)\\n\",\n        \"print(f\\\"Input file has {fps} fps\\\")\\n\",\n        \"\\n\",\n        \"if(fps/TARGET_FPS>0.5):\\n\",\n        \"  print(\\\"Define a higher fps, because there is not enough time for new frames. (Old FPS)/(New FPS) should be lower than 0.5. Interpolation will fail if you try.\\\")\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"9YNva-GuKq4Y\",\n        \"cellView\": \"form\"\n      },\n      \"source\": [\n        \"#@title ffmpeg extract - Generating individual frame PNGs from the source file.\\n\",\n        \"%shell rm -rf '{FRAME_INPUT_DIR}'\\n\",\n        \"%shell mkdir -p '{FRAME_INPUT_DIR}'\\n\",\n        \"\\n\",\n        \"if (END_FRAME==-1):\\n\",\n        \"  %shell ffmpeg -i '/content/DAIN/{filename}' -vf 'select=gte(n\\\\,{START_FRAME}),setpts=PTS-STARTPTS' '{FRAME_INPUT_DIR}/%05d.png'\\n\",\n        \"else:\\n\",\n        \"  %shell ffmpeg -i '/content/DAIN/{filename}' -vf 'select=between(n\\\\,{START_FRAME}\\\\,{END_FRAME}),setpts=PTS-STARTPTS' '{FRAME_INPUT_DIR}/%05d.png'\\n\",\n        \"\\n\",\n        \"from IPython.display import clear_output\\n\",\n        \"clear_output()\\n\",\n        \"\\n\",\n        \"png_generated_count_command_result = %shell ls '{FRAME_INPUT_DIR}' | wc -l\\n\",\n        \"frame_count = int(png_generated_count_command_result.output.strip())\\n\",\n        \"\\n\",\n        \"import shutil\\n\",\n        \"if SEAMLESS:\\n\",\n        \"  frame_count += 1\\n\",\n        \"  first_frame = f\\\"{FRAME_INPUT_DIR}/00001.png\\\"\\n\",\n        \"  
new_last_frame = f\\\"{FRAME_INPUT_DIR}/{frame_count.zfill(5)}.png\\\"\\n\",\n        \"  shutil.copyfile(first_frame, new_last_frame)\\n\",\n        \"\\n\",\n        \"print(f\\\"{frame_count} frame PNGs generated.\\\")\\n\",\n        \"\\n\",\n        \"#Checking if PNGs do have alpha\\n\",\n        \"import subprocess as sp\\n\",\n        \"%cd {FRAME_INPUT_DIR}\\n\",\n        \"channels = sp.getoutput('identify -format %[channels] 00001.png')\\n\",\n        \"print (f\\\"{channels} detected\\\")\\n\",\n        \"\\n\",\n        \"# Removing alpha if detected\\n\",\n        \"if \\\"a\\\" in channels:\\n\",\n        \"  print(\\\"Alpha channel detected and will be removed.\\\")\\n\",\n        \"  print(sp.getoutput('find . -name \\\"*.png\\\" -exec convert \\\"{}\\\" -alpha off PNG24:\\\"{}\\\" \\\\;'))\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"W3rrE7L824gL\",\n        \"cellView\": \"form\"\n      },\n      \"source\": [\n        \"#@title Interpolation\\n\",\n        \"%shell mkdir -p '{FRAME_OUTPUT_DIR}'\\n\",\n        \"%cd /content/DAIN\\n\",\n        \"\\n\",\n        \"!python -W ignore colab_interpolate.py --netName DAIN_slowmotion --time_step {fps/TARGET_FPS} --start_frame 1 --end_frame {frame_count} --frame_input_dir '{FRAME_INPUT_DIR}' --frame_output_dir '{FRAME_OUTPUT_DIR}'\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"TKREDli2IDMV\",\n        \"cellView\": \"form\"\n      },\n      \"source\": [\n        \"#@title Create output video\\n\",\n        \"%cd {FRAME_OUTPUT_DIR}\\n\",\n        \"%shell ffmpeg -y -r {TARGET_FPS} -f image2 -pattern_type glob -i '*.png' '/content/gdrive/My Drive/{OUTPUT_FILE_PATH}'\\n\",\n        \"\\n\",\n        \"if(AUTO_REMOVE):\\n\",\n        \"  !rm -rf {FRAME_OUTPUT_DIR}/*\\n\",\n        
\"\\n\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"UF5TEo5N374o\",\n        \"cellView\": \"form\"\n      },\n      \"source\": [\n        \"#@title [Experimental] Create video with sound\\n\",\n        \"# Only run this, if the original had sound.\\n\",\n        \"%cd {FRAME_OUTPUT_DIR}\\n\",\n        \"%shell ffmpeg -i '/content/DAIN/{filename}' -acodec copy output-audio.aac\\n\",\n        \"%shell ffmpeg -y -r {TARGET_FPS} -f image2 -pattern_type glob -i '*.png' -i output-audio.aac -shortest '/content/gdrive/My Drive/{OUTPUT_FILE_PATH}'\\n\",\n        \"\\n\",\n        \"if (AUTO_REMOVE):\\n\",\n        \"  !rm -rf {FRAME_OUTPUT_DIR}/*\\n\",\n        \"  !rm -rf output-audio.aac\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    }\n  ]\n}\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2019 Wenbo Bao\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "MegaDepth/LICENSE",
    "content": "MIT License\n\nCopyright (c) 2018 Zhengqi Li\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "MegaDepth/MegaDepth_model.py",
    "content": "import torch\nimport sys\nfrom torch.autograd import Variable\nimport numpy as np\nfrom .options.train_options import TrainOptions\nfrom .models.models import create_model\n__all__ = ['HourGlass']\n\n\n\ndef HourGlass(pretrained=None):\n    \"\"\"Constructs a ResNet-18 model.\n\n    Args:\n        pretrained (bool): If True, returns a model pre-trained on ImageNet\n    \"\"\"\n\n    opt = TrainOptions().parse()  # set CUDA_VISIBLE_DEVICES before import torch\n    model = create_model(opt,pretrained)\n    #netG is the real nn.Module\n    return model.netG\n"
  },
  {
    "path": "MegaDepth/README.md",
    "content": "# MegaDepth: Learning Single-View Depth Prediction from Internet Photos\n\nThis is a code of the algorithm described in \"MegaDepth: Learning Single-View Depth Prediction from Internet Photos, Z. Li and N. Snavely, CVPR 2018\". The code skeleton is based on \"https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix\". If you use our code or models for academic purposes, please consider citing:\n\n    @inproceedings{MDLi18,\n\t  \ttitle={MegaDepth: Learning Single-View Depth Prediction from Internet Photos},\n\t  \tauthor={Zhengqi Li and Noah Snavely},\n\t  \tbooktitle={Computer Vision and Pattern Recognition (CVPR)},\n\t  \tyear={2018}\n\t}\n\n#### Examples of single-view depth predictions on the photos we randomly downloaded from Internet:\n<img src=\"https://github.com/lixx2938/MegaDepth/blob/master/demo.jpg\" width=\"300\"/> <img src=\"https://github.com/lixx2938/MegaDepth/blob/master/demo.png\" width=\"300\"/>\n<img src=\"https://github.com/lixx2938/MegaDepth/blob/master/demo_img/demo_2.jpg\" width=\"300\"/> <img src=\"https://github.com/lixx2938/MegaDepth/blob/master/demo_img/demo_2.png\" width=\"300\"/>\n<img src=\"https://github.com/lixx2938/MegaDepth/blob/master/demo_img/demo_3.jpg\" width=\"300\"/> <img src=\"https://github.com/lixx2938/MegaDepth/blob/master/demo_img/demo_3.png\" width=\"300\"/>\n<img src=\"https://github.com/lixx2938/MegaDepth/blob/master/demo_img/demo_4.jpg\" width=\"300\"/> <img src=\"https://github.com/lixx2938/MegaDepth/blob/master/demo_img/demo_4.png\" width=\"300\"/>\n\n#### Dependencies:\n* The code was written in Pytorch 0.2 and Python 2.7, but it should be easy to adapt it to Python 3 and latest Pytorch version if needed.\n* You might need skimage, h5py libraries installed for python before running the code.\n\n#### Single-view depth prediction on any Internet photo:\n* Download pretrained model from: http://www.cs.cornell.edu/projects/megadepth/dataset/models/best_generalization_net_G.pth and put it in 
\"checkpoints/test_local/best_generalization_net_G.pth\n* In python file \"models/HG_model.py\", in init function, change to \"model_parameters = self.load_network(model, 'G', 'best_generalization')\"\n* run demo code \n```bash\n    python demo.py\n```\nYou should see an inverse depth prediction saved as demo.png from an original photo demo.jpg. If you want to use RGB maps for visualization, like the figures in our paper, you have to install/run semantic segmentation from https://github.com/kazuto1011/pspnet-pytorch trained on ADE20K to mask out sky, because inconsistent depth prediction of unmasked sky will not make RGB visualization resonable.\n\n\n#### Evaluation on the MegaDepth test splits:\n* Download MegaDepth V1 dataset from project website: http://www.cs.cornell.edu/projects/megadepth/.\n* Download pretrained model (specific for MD dataset) from http://www.cs.cornell.edu/projects/megadepth/dataset/models/best_vanila_net_G.pth and put it in \"checkpoints/test_local/best_vanila_net_G.pth\" \n* Download test list files from http://www.cs.cornell.edu/projects/megadepth/dataset/data_lists/test_lists.tar.gz, it should include two folders corresponding to images with landscape and portrait orientations.\n* To compute scale invarance RMSE on MD testset, change the variable \"dataset_root\" in python file \"rmse_error_main.py\" to the root directory of MegaDepth_v1 folder, and change variable \"test_list_dir_l\" and \"test_list_dir_p\" to corresponding folder paths of test lists, and run:\n```bash\n    python rmse_error_main.py\n```\n* To compute Structure from Motion Disagreement Rate (SDR), change the variable \"dataset_root\" in python file \"rmse_error_main.py\" to the root directory of MegaDepth_v1 folder, and change variable \"test_list_dir_l\" and \"test_list_dir_p\" to corresponding folder paths of test lists, and run:\n```bash\n    python SDR_compute.py\n```\n* If you want to run our model on arbitrary Internet photos, please download pretrained model from 
http://www.cs.cornell.edu/projects/megadepth/dataset/models/best_generalization_net_G.pth, which has much better generalization ability (qualitatively speaking) to completely unknown scenes.\n\n"
  },
  {
    "path": "MegaDepth/SDR_compute.py",
    "content": "import time\nimport torch\nimport sys\n\nfrom options.train_options import TrainOptions\nopt = TrainOptions().parse()  # set CUDA_VISIBLE_DEVICES before import torch\nfrom data.data_loader import CreateDataLoader_TEST\nfrom models.models import create_model\n\ndataset_root = \"/phoenix/S6/zl548/\"\ntest_list_dir_l = dataset_root + '/MegaDpeth_code/test_list/landscape/'\ninput_height = 240\ninput_width = 320\ntest_data_loader_l = CreateDataLoader_TEST(dataset_root, test_list_dir_l, input_height, input_width)\ntest_dataset_l = test_data_loader_l.load_data()\ntest_dataset_size_l = len(test_data_loader_l)\nprint('========================= test L images = %d' % test_dataset_size_l)\n\ntest_list_dir_p = dataset_root + '/MegaDpeth_code/test_list/portrait/'\ninput_height = 320\ninput_width = 240\ntest_data_loader_p = CreateDataLoader_TEST(dataset_root, test_list_dir_p, input_height, input_width)\ntest_dataset_p = test_data_loader_p.load_data()\ntest_dataset_size_p = len(test_data_loader_p)\nprint('========================= test P images = %d' % test_dataset_size_p)\n\nmodel = create_model(opt)\n\nbatch_size = 32\ndiw_index = 0 \ntotal_steps = 0\nbest_loss = 100\n\nerror_list = [0 , 0, 0]\ntotal_list = [0 , 0, 0]\n\nlist_l = range(test_dataset_size_l)\nlist_p = range(test_dataset_size_p)\n\n\ndef test_SDR(model):\n    total_loss =0 \n    # count = 0\n    print(\"============================= TEST SDR============================\")\n    model.switch_to_eval()\n    diw_index = 0\n\n    for i, data in enumerate(test_dataset_l):\n        stacked_img = data['img_1']\n        targets = data['target_1']    \n        error, samples = model.evaluate_SDR(stacked_img, targets)\n\n        for j in range(0,3):\n            error_list[j] += error[j]\n            total_list[j] += samples[j]\n\n        print(\"EQUAL  \", error_list[0]/float(total_list[0]))\n        print(\"INEQUAL    \", error_list[1]/float(total_list[1]))\n        print(\"TOTAL    
\",error_list[2]/float(total_list[2]))\n\n    for i, data in enumerate(test_dataset_p):\n        stacked_img = data['img_1']\n        targets = data['target_1']    \n\n        error, samples = model.evaluate_SDR(stacked_img, targets)\n\n        for j in range(0,3):\n            error_list[j] += error[j]\n            total_list[j] += samples[j]\n\n        print(\"EQUAL  \", error_list[0]/float(total_list[0]))\n        print(\"INEQUAL    \", error_list[1]/float(total_list[1]))\n        print(\"TOTAL    \",error_list[2]/float(total_list[2]))\n\n\n    print(\"=========================================================SDR Summary =====================\")\n    print(\"Equal SDR:\\t\" , float(error_list[0])/ float(total_list[0]))\n    print(\"Unequal SDR:\\t\" , float(error_list[1])/ float(total_list[1]))\n    print(\"SDR:\\t\" , float(error_list[2])/ float(total_list[2]))\n\n\nprint(\"WE ARE TESTING SDR!!!!\")\ntest_SDR(model)\n"
  },
  {
    "path": "MegaDepth/__init__.py",
    "content": "from .MegaDepth_model import    *\n"
  },
  {
    "path": "MegaDepth/data/__init__.py",
    "content": ""
  },
  {
    "path": "MegaDepth/data/aligned_data_loader.py",
    "content": "import random\nimport numpy as np\nimport torch.utils.data\nfrom data.base_data_loader import BaseDataLoader\nfrom data.image_folder import ImageFolder\nfrom data.image_folder import ImageFolder_TEST\nfrom builtins import object\nimport sys\nimport h5py\n\n\nclass PairedData(object):\n    def __init__(self, data_loader, flip):\n        self.data_loader = data_loader\n        # self.fineSize = fineSize\n        # self.max_dataset_size = max_dataset_size\n        self.flip = flip\n        self.data_loader_iter = iter(self.data_loader)\n        self.iter = 0\n    \n\n    def __iter__(self):\n        self.data_loader_iter = iter(self.data_loader)\n        self.iter = 0\n        return self\n\n    def __next__(self):\n        self.iter += 1\n\n        final_img, target_1 = next(self.data_loader_iter)\n\n        return {'img_1': final_img, 'target_1': target_1}\n\n\nclass AlignedDataLoader(BaseDataLoader):\n    def __init__(self,_root, _list_dir, _input_height, _input_width, _is_flip, _shuffle):\n        transform = None\n        dataset = ImageFolder(root=_root, \\\n                list_dir =_list_dir, input_height = _input_height, input_width = _input_width, transform=transform, is_flip = _is_flip)\n\n        data_loader = torch.utils.data.DataLoader(dataset, batch_size= 16, shuffle= _shuffle, num_workers=int(3))\n\n        self.dataset = dataset\n        flip = False\n        self.paired_data = PairedData(data_loader, flip)\n\n    def name(self):\n        return 'RMSEDataLoader'\n\n    def load_data(self):\n        return self.paired_data\n\n    def __len__(self):\n        return len(self.dataset)\n\n\n\nclass AlignedDataLoader_TEST(BaseDataLoader):\n    def __init__(self,_root, _list_dir, _input_height, _input_width):\n\n        dataset = ImageFolder_TEST(root=_root, \\\n                list_dir =_list_dir, _input_height = _input_height, _input_width = _input_width)\n\n        data_loader = torch.utils.data.DataLoader(dataset, batch_size= 1, shuffle= 
False, num_workers=int(3))\n        self.dataset = dataset\n        flip = False\n        self.paired_data = PairedData(data_loader, flip)\n\n    def name(self):\n        return 'TestSDRDataLoader'\n\n    def load_data(self):\n        return self.paired_data\n\n\n    def __len__(self):\n        return len(self.dataset)\n"
  },
  {
    "path": "MegaDepth/data/base_data_loader.py",
    "content": "\nclass BaseDataLoader():\n    def __init__(self):\n        pass\n    \n    # def initialize(self):\n    #     # self.opt = opt\n    #     pass\n\n    def load_data():\n        return None\n\n        \n        \n"
  },
  {
    "path": "MegaDepth/data/data_loader.py",
    "content": "\ndef CreateDataLoader(_root, _list_dir, _input_height, _input_width, is_flip = True, shuffle =  True):\n    data_loader = None\n    from data.aligned_data_loader import AlignedDataLoader\n    data_loader = AlignedDataLoader(_root, _list_dir, _input_height, _input_width, is_flip, shuffle)\n    return data_loader\n\ndef CreateDataLoader_TEST(_root, _list_dir, _input_height, _input_width):\n    data_loader = None\n    from data.aligned_data_loader import AlignedDataLoader_TEST\n    data_loader = AlignedDataLoader_TEST(_root, _list_dir, _input_height, _input_width)\n\n    return data_loader\n"
  },
  {
    "path": "MegaDepth/data/image_folder.py",
    "content": "################################################################################\n# Code from\n# https://github.com/pytorch/vision/blob/master/torchvision/datasets/folder.py\n# Modified the original code so that it also loads images from the current\n# directory as well as the subdirectories\n################################################################################\nimport h5py\nimport torch.utils.data as data\nimport pickle\nimport numpy as np\nimport torch\nimport os, os.path\nimport math, random\nimport sys\nfrom skimage.transform import resize\nfrom skimage import io\n\n\n\ndef make_dataset(list_dir):\n    # subgroup_name1 = \"/dataset/image_list/\"\n    file_name = list_dir + \"imgs_MD.p\"\n    file_name_1 = open( file_name, \"rb\" )\n    images_list = pickle.load( file_name_1)\n    file_name_1.close()\n\n    file_name_t= list_dir + \"targets_MD.p\"\n    file_name_2 = open( file_name_t, \"rb\" )\n    targets_list = pickle.load(file_name_2)\n    file_name_2.close()\n    return images_list, targets_list\n\n# test for si-RMSE\nclass ImageFolder(data.Dataset):\n\n    def __init__(self, root, list_dir, input_height, input_width, transform=None, \n                 loader=None, is_flip = True):\n        # load image list from hdf5\n        img_list , targets_list = make_dataset(list_dir)\n        if len(img_list) == 0:\n            raise(RuntimeError(\"Found 0 images in: \" + root + \"\\n\"\n                               \"Supported image extensions are: \" + \",\".join(IMG_EXTENSIONS)))\n        # img_list_1, img_list_2 = selfshuffle_dataset(img_list)\n        self.root = root\n        self.list_dir = list_dir\n        self.img_list = img_list\n        self.targets_list = targets_list\n        self.transform = transform\n        # self.loader = loader\n        self.input_height = input_height\n        self.input_width = input_width\n        self.is_flip = is_flip\n\n\n    def load_MD(self, img_path, depth_path):\n\n        MD_img = 
np.float32(io.imread(img_path))/255.0\n\n        hdf5_file_read = h5py.File(depth_path,'r')\n        gt = hdf5_file_read.get('/depth')\n        gt = np.array(gt)\n\n        assert(gt.shape[0] == MD_img.shape[0])\n        assert(gt.shape[1] == MD_img.shape[1])\n\n        color_rgb = np.zeros((self.input_height,self.input_width,3))\n        MD_img = resize(MD_img, (self.input_height, self.input_width), order = 1)\n\n        if len(MD_img.shape) == 2:\n            color_rgb[:,:,0] = MD_img.copy()\n            color_rgb[:,:,1] = MD_img.copy()\n            color_rgb[:,:,2] = MD_img.copy()\n        else:\n            color_rgb = MD_img.copy()\n\n        if np.sum(gt > 1e-8) > 10:\n            gt[ gt > np.percentile(gt[gt > 1e-8], 98)] = 0\n            gt[ gt < np.percentile(gt[gt > 1e-8], 1)] = 0\n\n        max_depth = np.max(gt) + 1e-9\n        gt = gt/max_depth\n        gt = resize(gt, (self.input_height, self.input_width), order = 0)\n        gt = gt*max_depth\n\n        mask = np.float32(gt > 1e-8)\n\n        color_rgb = np.ascontiguousarray(color_rgb)\n        gt = np.ascontiguousarray(gt)\n        mask = np.ascontiguousarray(mask)\n\n        hdf5_file_read.close()\n\n        return color_rgb, gt, mask\n\n    def __getitem__(self, index):\n        # 00xx/1/\n        targets_1 = {}\n        # targets_1['L'] = []\n        targets_1['path'] = []\n\n        img_path_suff = self.img_list[index]\n        targets_path_suff = self.targets_list[index]\n\n        img_path = self.root + \"/MegaDepth_v1/\" + img_path_suff\n        depth_path = self.root + \"/MegaDepth_v1/\" + targets_path_suff\n\n        img, gt, mask = self.load_MD(img_path, depth_path)\n        \n        gt[mask < 0.1] = 1.0\n\n        targets_1['path'] = targets_path_suff\n        targets_1['gt_0'] = torch.from_numpy(gt).float()\n        targets_1['mask_0'] = torch.from_numpy(mask).float()\n\n        final_img = torch.from_numpy( np.transpose(img, (2,0,1)) ).contiguous().float()\n\n        return final_img, 
targets_1\n\n    def __len__(self):\n        return len(self.img_list)\n\n\n#  Test for SDR \nclass ImageFolder_TEST(data.Dataset):\n\n    def __init__(self, root, list_dir, _input_height, _input_width):\n        # load image list from hdf5\n        img_list , targets_list = make_dataset(list_dir)\n        if len(img_list) == 0:\n            raise(RuntimeError(\"Found 0 images in: \" + root + \"\\n\"\n                               \"Supported image extensions are: \" + \",\".join(IMG_EXTENSIONS)))\n        self.root = root\n        self.list_dir = list_dir\n        self.img_list = img_list\n        self.input_height = _input_height\n        self.input_width = _input_width\n        self.half_window = 1\n\n    def load_SfM_ORD(self, img_path, targets_path):\n\n        sfm_image = np.float32(io.imread(img_path))/255.0\n        resized_sfm_img = resize(sfm_image, (self.input_height, self.input_width), order = 1)\n\n        color_rgb = np.zeros((self.input_height, self.input_width,3))\n\n        if len(sfm_image.shape) == 2:\n            color_rgb[:,:,0] = resized_sfm_img.copy()\n            color_rgb[:,:,1] = resized_sfm_img.copy()\n            color_rgb[:,:,2] = resized_sfm_img.copy()\n        else:\n            color_rgb = resized_sfm_img.copy()\n\n        if color_rgb.shape[2] == 4:\n            return color_rgb, 0, 0 ,0, 0, 0\n\n        hdf5_file_read = h5py.File(targets_path,'r')\n        gt = hdf5_file_read.get('/SfM_features')\n        gt = np.array(gt)\n\n        y_A = np.round( gt[0,:] * float(self.input_height) )\n        x_A = np.round( gt[1,:] * float(self.input_width) )\n        y_B = np.round( gt[2,:] * float(self.input_height) )\n        x_B = np.round( gt[3,:] * float(self.input_width) )\n        ord_ = gt[4,:]\n\n        hdf5_file_read.close()\n\n        return color_rgb, y_A, x_A ,y_B, x_B, ord_\n\n    def __getitem__(self, index):\n        # 00xx/1/\n        targets_1 = {}\n        # targets_1['L'] = []\n        targets_1['path'] = []\n        
targets_1['sdr_xA'] = []\n        targets_1['sdr_yA'] = []\n        targets_1['sdr_xB'] = []\n        targets_1['sdr_yB'] = []\n        targets_1['sdr_gt'] = []\n\n        img_path_suff = self.img_list[index]\n        img_path = self.root + \"/MegaDepth_v1/\" + img_path_suff\n        folder_name = img_path_suff.split('/')[-4]\n        img_name = img_path_suff.split('/')[-1]\n        sparse_sift_path = self.root + \"/sparse_features/\" + folder_name + \"/\" + img_name + \".h5\"\n\n        # no sift features\n        if not os.path.isfile(sparse_sift_path) or not os.path.isfile(img_path):\n\n            img = np.zeros((self.input_height, self.input_width,3))\n            targets_1['has_SfM_feature'] = False\n\n        else:\n\n            img, y_A, x_A ,y_B, x_B, ordinal = self.load_SfM_ORD(img_path, sparse_sift_path)\n\n            targets_1['sdr_xA'].append(torch.from_numpy(x_A).long())\n            targets_1['sdr_yA'].append(torch.from_numpy(y_A).long())\n            targets_1['sdr_xB'].append(torch.from_numpy(x_B).long())\n            targets_1['sdr_yB'].append(torch.from_numpy(y_B).long())\n            targets_1['sdr_gt'].append(torch.from_numpy(ordinal).float())\n            targets_1['has_SfM_feature'] = True\n\n        final_img = torch.from_numpy( np.transpose(img, (2,0,1)) ).contiguous().float()\n\n\n        return final_img, targets_1\n\n\n\n    def __len__(self):\n        return len(self.img_list)\n\n\n\n"
  },
  {
    "path": "MegaDepth/models/HG_model.py",
    "content": "import numpy as np\nimport torch\nimport os\nfrom torch.autograd import Variable\nfrom .base_model import BaseModel\nimport sys\n# import pytorch_DIW_scratch\nimport MegaDepth.pytorch_DIW_scratch as pytorch_DIW_scratch\n\nclass HGModel(BaseModel):\n    def name(self):\n        return 'HGModel'\n\n    def __init__(self, opt,pretrained=None):\n        BaseModel.initialize(self, opt)\n\n        # print(\"===========================================LOADING Hourglass NETWORK====================================================\")\n        model = pytorch_DIW_scratch.pytorch_DIW_scratch\n        # model_temp = model\n        # model= torch.nn.parallel.DataParallel(model, device_ids = [0,1])\n        # model_parameters = self.load_network(model, 'G', 'best_vanila')\n        if pretrained is None:\n            # model_parameters = self.load_network(model, 'G', 'best_generalization')\n            #\n            # model.load_state_dict(model_parameters)\n            # self.netG = model.cuda()\n            self.netG    = model\n            # print(\"No weights loaded for Hourglass Network\")\n        else:\n            pretrained_dict = torch.load(pretrained)\n\n            model_dict = model.state_dict()\n            # print(len(pretrained_dict))\n            # print(len(model_dict))\n            # 1. filter out unnecessary keys\n            # the saved model contains a 'module.' prefix for the data.parallel reason\n            pretrained_dict = {k[7:]: v for k, v in pretrained_dict.items()}  # and not k[:10]== 'rectifyNet'}\n            # print(str(len(pretrained_dict)) + \" are updated\")\n            # 2. overwrite entries in the existing state dict\n            model_dict.update(pretrained_dict)\n            # 3. 
load the new state dict\n            model.load_state_dict(model_dict)\n            pretrained_dict = None\n            self.netG = model\n\n\n\n    def batch_classify(self, z_A_arr, z_B_arr, ground_truth ):\n        threashold = 1.1\n        depth_ratio = torch.div(z_A_arr, z_B_arr)\n\n        depth_ratio = depth_ratio.cpu()\n\n        estimated_labels = torch.zeros(depth_ratio.size(0))\n\n        estimated_labels[depth_ratio > (threashold)] = 1\n        estimated_labels[depth_ratio < (1/threashold)] = -1\n\n        diff = estimated_labels - ground_truth\n        diff[diff != 0] = 1\n\n        # error \n        inequal_error_count = diff[ground_truth != 0]\n        inequal_error_count =  torch.sum(inequal_error_count)\n\n        error_count = torch.sum(diff) #diff[diff !=0]\n        # error_count = error_count.size(0)\n\n        equal_error_count = error_count - inequal_error_count\n\n\n        # total \n        total_count = depth_ratio.size(0)\n        ground_truth[ground_truth !=0 ] = 1\n\n        inequal_count_total = torch.sum(ground_truth)\n        equal_total_count = total_count - inequal_count_total\n\n\n        error_list = [equal_error_count, inequal_error_count, error_count]\n        count_list = [equal_total_count, inequal_count_total, total_count]\n\n        return error_list, count_list \n\n\n    def computeSDR(self, prediction_d, targets):\n        #  for each image \n        total_error = [0,0,0]\n        total_samples = [0,0,0]\n\n        for i in range(0, prediction_d.size(0)):\n\n            if targets['has_SfM_feature'][i] == False:\n                continue\n            \n            x_A_arr = targets[\"sdr_xA\"][i].squeeze(0)\n            x_B_arr = targets[\"sdr_xB\"][i].squeeze(0)\n            y_A_arr = targets[\"sdr_yA\"][i].squeeze(0)\n            y_B_arr = targets[\"sdr_yB\"][i].squeeze(0)\n\n            predict_depth = torch.exp(prediction_d[i,:,:])\n            predict_depth = predict_depth.squeeze(0)\n            ground_truth = 
targets[\"sdr_gt\"][i]\n\n            # print(x_A_arr.size())\n            # print(y_A_arr.size())\n\n            z_A_arr = torch.gather( torch.index_select(predict_depth, 1 ,x_A_arr.cuda()) , 0, y_A_arr.view(1, -1).cuda())# predict_depth:index(2, x_A_arr):gather(1, y_A_arr:view(1, -1))\n            z_B_arr = torch.gather( torch.index_select(predict_depth, 1 ,x_B_arr.cuda()) , 0, y_B_arr.view(1, -1).cuda())\n\n            z_A_arr = z_A_arr.squeeze(0)\n            z_B_arr = z_B_arr.squeeze(0)\n\n            error_list, count_list  = self.batch_classify(z_A_arr, z_B_arr,ground_truth)\n\n            for j in range(0,3):\n                total_error[j] += error_list[j]\n                total_samples[j] += count_list[j]\n\n        return  total_error, total_samples\n\n\n    def evaluate_SDR(self, input_, targets):\n        input_images = Variable(input_.cuda() )\n        prediction_d = self.netG.forward(input_images) \n\n        total_error, total_samples = self.computeSDR(prediction_d.data, targets)\n\n        return total_error, total_samples\n\n    def rmse_Loss(self, log_prediction_d, mask, log_gt):\n        N = torch.sum(mask)\n        log_d_diff = log_prediction_d - log_gt\n        log_d_diff = torch.mul(log_d_diff, mask)\n        s1 = torch.sum( torch.pow(log_d_diff,2) )/N \n\n        s2 = torch.pow(torch.sum(log_d_diff),2)/(N*N)  \n        data_loss = s1 - s2\n\n        data_loss = torch.sqrt(data_loss)\n\n        return data_loss\n\n    def evaluate_RMSE(self, input_images, prediction_d, targets):\n        count = 0            \n        total_loss = Variable(torch.cuda.FloatTensor(1))\n        total_loss[0] = 0\n        mask_0 = Variable(targets['mask_0'].cuda(), requires_grad = False)\n        d_gt_0 = torch.log(Variable(targets['gt_0'].cuda(), requires_grad = False))\n\n        for i in range(0, mask_0.size(0)):\n \n            total_loss +=  self.rmse_Loss(prediction_d[i,:,:], mask_0[i,:,:], d_gt_0[i,:,:])\n            count += 1\n\n        return 
total_loss.data[0], count\n\n\n    def evaluate_sc_inv(self, input_, targets):\n        input_images = Variable(input_.cuda() )\n        prediction_d = self.netG.forward(input_images) \n        rmse_loss , count= self.evaluate_RMSE(input_images, prediction_d, targets)\n\n        return rmse_loss, count\n\n\n    def switch_to_train(self):\n        self.netG.train()\n\n    def switch_to_eval(self):\n        self.netG.eval()\n\n"
  },
  {
    "path": "MegaDepth/models/__init__.py",
    "content": ""
  },
  {
    "path": "MegaDepth/models/base_model.py",
    "content": "import os\nimport torch\n\nclass BaseModel():\n    def name(self):\n        return 'BaseModel'\n\n    def initialize(self, opt):\n        self.opt = opt\n        self.gpu_ids = opt.gpu_ids\n        self.isTrain = opt.isTrain\n        self.Tensor = torch.cuda.FloatTensor if self.gpu_ids else torch.Tensor\n        self.save_dir = os.path.join(opt.checkpoints_dir, opt.name)\n\n    def set_input(self, input):\n        self.input = input\n\n    def forward(self):\n        pass\n\n    # used in test time, no backprop\n    def test(self):\n        pass\n\n    def get_image_paths(self):\n        pass\n\n    def optimize_parameters(self):\n        pass\n\n    def get_current_visuals(self):\n        return self.input\n\n    def get_current_errors(self):\n        return {}\n\n    def save(self, label):\n        pass\n\n    # helper saving function that can be used by subclasses\n    def save_network(self, network, network_label, epoch_label, gpu_ids):\n        save_filename = '_%s_net_%s.pth' % (epoch_label, network_label)\n        save_path = os.path.join(self.save_dir, save_filename)\n        torch.save(network.cpu().state_dict(), save_path)\n        if len(gpu_ids) and torch.cuda.is_available():\n            network.cuda(device_id=gpu_ids[0])\n\n    # helper loading function that can be used by subclasses\n    def load_network(self, network, network_label, epoch_label):\n        save_filename = '%s_net_%s.pth' % (epoch_label, network_label)\n        save_path = os.path.join(self.save_dir, save_filename)\n        print(save_path)\n        model = torch.load(save_path)\n        return model\n        # network.load_state_dict(torch.load(save_path))\n\n    def update_learning_rate():\n        pass\n"
  },
  {
    "path": "MegaDepth/models/models.py",
    "content": "\ndef create_model(opt,pretrained=None):\n    model = None\n    from .HG_model import HGModel\n    model = HGModel(opt,pretrained)\n    # print(\"model [%s] was created\" % (model.name()))\n    return model\n"
  },
  {
    "path": "MegaDepth/options/__init__.py",
    "content": ""
  },
  {
    "path": "MegaDepth/options/base_options.py",
    "content": "import argparse\nimport os\nfrom ..util import util\n\nclass BaseOptions():\n    def __init__(self):\n        self.parser = argparse.ArgumentParser()\n        self.initialized = False\n\n    def initialize(self):\n        # self.parser.add_argument('--dataroot', required=True, help='path to images (should have subfolders trainA, trainB, valA, valB, etc)')\n        self.parser.add_argument('--batchSize', type=int, default=1, help='input batch size')\n        self.parser.add_argument('--loadSize', type=int, default=286, help='scale images to this size')\n        self.parser.add_argument('--fineSize', type=int, default=256, help='then crop to this size')\n        self.parser.add_argument('--input_nc', type=int, default=3, help='# of input image channels')\n        self.parser.add_argument('--output_nc', type=int, default=3, help='# of output image channels')\n        self.parser.add_argument('--ngf', type=int, default=64, help='# of gen filters in first conv layer')\n        self.parser.add_argument('--ndf', type=int, default=64, help='# of discrim filters in first conv layer')\n        # self.parser.add_argument('--which_model_netD', type=str, default='basic', help='selects model to use for netD')\n        self.parser.add_argument('--which_model_netG', type=str, default='unet_256', help='selects model to use for netG')\n        # self.parser.add_argument('--n_layers_D', type=int, default=3, help='only used if which_model_netD==n_layers')\n        self.parser.add_argument('--gpu_ids', type=str, default='0,1', help='gpu ids: e.g. 0  0,1,2, 0,2')\n        self.parser.add_argument('--name', type=str, default='test_local', help='name of the experiment. 
It decides where to store samples and models')\n        # self.parser.add_argument('--align_data', action='store_true',\n                                # help='if True, the datasets are loaded from \"test\" and \"train\" directories and the data pairs are aligned')\n        self.parser.add_argument('--model', type=str, default='pix2pix',\n                                 help='chooses which model to use. cycle_gan, one_direction_test, pix2pix, ...')\n        # self.parser.add_argument('--which_direction', type=str, default='AtoB', help='AtoB or BtoA')\n        self.parser.add_argument('--nThreads', default=2, type=int, help='# threads for loading data')\n        self.parser.add_argument('--checkpoints_dir', type=str, default='./checkpoints/', help='models are saved here')\n        self.parser.add_argument('--norm', type=str, default='instance', help='instance normalization or batch normalization')\n        self.parser.add_argument('--serial_batches', action='store_true', help='if true, takes images in order to make batches, otherwise takes them randomly')\n        self.parser.add_argument('--display_winsize', type=int, default=256,  help='display window size')\n        self.parser.add_argument('--display_id', type=int, default=1, help='window id of the web display')\n        self.parser.add_argument('--identity', type=float, default=0.0, help='use identity mapping. Setting identity other than 1 has an effect of scaling the weight of the identity mapping loss. For example, if the weight of the identity loss should be 10 times smaller than the weight of the reconstruction loss, please set optidentity = 0.1')\n        self.parser.add_argument('--use_dropout', action='store_true', help='use dropout for the generator')\n        self.parser.add_argument('--max_dataset_size', type=int, default=float(\"inf\"), help='Maximum number of samples allowed per dataset. 
If the dataset directory contains more than max_dataset_size, only a subset is loaded.')\n\n        self.initialized = True\n\n    def parse(self):\n        if not self.initialized:\n            self.initialize()\n        self.opt = self.parser.parse_known_args()[0] #parse_args()\n        self.opt.isTrain = self.isTrain   # train or test\n\n        str_ids = self.opt.gpu_ids.split(',')\n        self.opt.gpu_ids = []\n        for str_id in str_ids:\n            id = int(str_id)\n            if id >= 0:\n                self.opt.gpu_ids.append(id)\n\n        args = vars(self.opt)\n\n        # print('------------ Options -------------')\n        # for k, v in sorted(args.items()):\n        #     print('%s: %s' % (str(k), str(v)))\n        # print('-------------- End ----------------')\n\n        # save to the disk\n        expr_dir =  os.path.join(self.opt.checkpoints_dir, self.opt.name)\n        util.mkdirs(expr_dir)\n        file_name = os.path.join(expr_dir, 'opt.txt')\n        with open(file_name, 'wt') as opt_file:\n            opt_file.write('------------ Options -------------\\n')\n            for k, v in sorted(args.items()):\n                opt_file.write('%s: %s\\n' % (str(k), str(v)))\n            opt_file.write('-------------- End ----------------\\n')\n        return self.opt\n"
  },
  {
    "path": "MegaDepth/options/test_options.py",
    "content": "from .base_options import BaseOptions\n\nclass TestOptions(BaseOptions):\n    def initialize(self):\n        BaseOptions.initialize(self)\n        self.parser.add_argument('--ntest', type=int, default=float(\"inf\"), help='# of test examples.')\n        self.parser.add_argument('--results_dir', type=str, default='./results/', help='saves results here.')\n        self.parser.add_argument('--aspect_ratio', type=float, default=1.0, help='aspect ratio of result images')\n        self.parser.add_argument('--phase', type=str, default='test', help='train, val, test, etc')\n        self.parser.add_argument('--which_epoch', type=str, default='latest', help='which epoch to load? set to latest to use latest cached model')\n        self.parser.add_argument('--how_many', type=int, default=50, help='how many test images to run')\n        self.isTrain = False\n"
  },
  {
    "path": "MegaDepth/options/train_options.py",
    "content": "from .base_options import BaseOptions\n\nclass TrainOptions(BaseOptions):\n    def initialize(self):\n        BaseOptions.initialize(self)\n        self.parser.add_argument('--display_freq', type=int, default=100, help='frequency of showing training results on screen')\n        self.parser.add_argument('--print_freq', type=int, default=100, help='frequency of showing training results on console')\n        self.parser.add_argument('--save_latest_freq', type=int, default=5000, help='frequency of saving the latest results')\n        self.parser.add_argument('--save_epoch_freq', type=int, default=5, help='frequency of saving checkpoints at the end of epochs')\n        self.parser.add_argument('--continue_train', action='store_true', help='continue training: load the latest model')\n        self.parser.add_argument('--phase', type=str, default='train', help='train, val, test, etc')\n        self.parser.add_argument('--which_epoch', type=str, default='latest', help='which epoch to load? 
set to latest to use latest cached model')\n        self.parser.add_argument('--niter', type=int, default=100, help='# of iter at starting learning rate')\n        self.parser.add_argument('--niter_decay', type=int, default=100, help='# of iter to linearly decay learning rate to zero')\n        self.parser.add_argument('--beta1', type=float, default=0.5, help='momentum term of adam')\n        self.parser.add_argument('--lr', type=float, default=0.0002, help='initial learning rate for adam')\n        self.parser.add_argument('--no_lsgan', action='store_true', help='do *not* use least square GAN, if false, use vanilla GAN')\n        self.parser.add_argument('--lambda_A', type=float, default=10.0, help='weight for cycle loss (A -> B -> A)')\n        self.parser.add_argument('--lambda_B', type=float, default=10.0, help='weight for cycle loss (B -> A -> B)')\n        self.parser.add_argument('--pool_size', type=int, default=50, help='the size of image buffer that stores previously generated images')\n        self.parser.add_argument('--no_html', action='store_true', help='do not save intermediate training results to [opt.checkpoints_dir]/[opt.name]/web/')\n        self.parser.add_argument('--no_flip'  , action='store_true', help='if specified, do not flip the images for data augmentation')\n\n        # NOT-IMPLEMENTED self.parser.add_argument('--preprocessing', type=str, default='resize_and_crop', help='resizing/cropping strategy')\n        self.isTrain = True\n"
  },
  {
    "path": "MegaDepth/pytorch_DIW_scratch.py",
    "content": "\nimport torch\nimport torch.nn as nn\nfrom torch.autograd import Variable\nfrom functools import reduce\n\nclass LambdaBase(nn.Sequential):\n    def __init__(self, fn, *args):\n        super(LambdaBase, self).__init__(*args)\n        self.lambda_func = fn\n\n    def forward_prepare(self, input):\n        output = []\n        for module in self._modules.values():\n            output.append(module(input))\n        return output if output else input\n\nclass Lambda(LambdaBase):\n    def forward(self, input):\n        return self.lambda_func(self.forward_prepare(input))\n\nclass LambdaMap(LambdaBase):\n    def forward(self, input):\n        return list(map(self.lambda_func,self.forward_prepare(input)))\n\nclass LambdaReduce(LambdaBase):\n    def forward(self, input):\n        return reduce(self.lambda_func,self.forward_prepare(input))\n\n\npytorch_DIW_scratch = nn.Sequential( # Sequential,\n\tnn.Conv2d(3,128,(7, 7),(1, 1),(3, 3)),\n\tnn.BatchNorm2d(128),\n\tnn.ReLU(),\n\tnn.Sequential( # Sequential,\n\t\tLambdaMap(lambda x: x, # ConcatTable,\n\t\t\tnn.Sequential( # Sequential,\n\t\t\t\tnn.MaxPool2d((2, 2),(2, 2)),\n\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # 
Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t),\n\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t),\n\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\tLambdaMap(lambda x: x, # ConcatTable,\n\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\tnn.MaxPool2d((2, 2),(2, 2)),\n\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,32,(1, 
1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,64,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,32,(1, 
1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\tLambdaMap(lambda x: x, # ConcatTable,\n\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # 
Concat,\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(64,64,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(64,64,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(64,64,(11, 11),(1, 1),(5, 5)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\tnn.AvgPool2d((2, 2),(2, 2)),\n\t\t\t\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 
1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 
1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\tLambdaMap(lambda x: x, # ConcatTable,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(5, 5),(1, 1),(2, 
2)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 
1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\tnn.AvgPool2d((2, 2),(2, 2)),\n\t\t\t\t\t\t\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(7, 7),(1, 1),(3, 
3)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # 
Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t\t\tnn.UpsamplingNearest2d(scale_factor=2),\n\t\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tLambdaReduce(lambda x,y: x+y), # CAddTable,\n\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 
1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(64,64,(3, 3),(1, 1),(1, 
1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(64,64,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.Conv2d(64,64,(11, 11),(1, 1),(5, 5)),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t\t\tnn.UpsamplingNearest2d(scale_factor=2),\n\t\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tLambdaReduce(lambda x,y: x+y), # CAddTable,\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(256,64,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(5, 5),(1, 1),(2, 
2)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,64,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(256,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\tnn.UpsamplingNearest2d(scale_factor=2),\n\t\t\t\t\t\t),\n\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\tnn.Sequential( 
# Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,32,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,32,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(32,32,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,64,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(64,32,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,64,(1, 
1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(64,32,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\t\t\t\tnn.Conv2d(128,64,(1, 1)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t\tnn.Conv2d(64,32,(11, 11),(1, 1),(5, 5)),\n\t\t\t\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\t\t\t),\n\t\t\t\t\t\t\t),\n\t\t\t\t\t\t),\n\t\t\t\t\t),\n\t\t\t\t\tLambdaReduce(lambda x,y: x+y), # CAddTable,\n\t\t\t\t),\n\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,64,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(64,32,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,64,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(64,32,(5, 5),(1, 1),(2, 2)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,64,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(64,32,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t),\n\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,16,(1, 
1)),\n\t\t\t\t\t\tnn.BatchNorm2d(16,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(32,16,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(16,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(32,16,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\tnn.BatchNorm2d(16,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,32,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(32,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(32,16,(11, 11),(1, 1),(5, 5)),\n\t\t\t\t\t\tnn.BatchNorm2d(16,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t),\n\t\t\t\tnn.UpsamplingNearest2d(scale_factor=2),\n\t\t\t),\n\t\t\tnn.Sequential( # Sequential,\n\t\t\t\tLambdaReduce(lambda x,y,dim=1: torch.cat((x,y),dim), # Concat,\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,16,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(16,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,64,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(64,16,(3, 3),(1, 1),(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(16,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,64,(1, 1)),\n\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(64,16,(7, 7),(1, 1),(3, 3)),\n\t\t\t\t\t\tnn.BatchNorm2d(16,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t\tnn.Sequential( # Sequential,\n\t\t\t\t\t\tnn.Conv2d(128,64,(1, 
1)),\n\t\t\t\t\t\tnn.BatchNorm2d(64,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t\tnn.Conv2d(64,16,(11, 11),(1, 1),(5, 5)),\n\t\t\t\t\t\tnn.BatchNorm2d(16,1e-05,0.1,False),\n\t\t\t\t\t\tnn.ReLU(),\n\t\t\t\t\t),\n\t\t\t\t),\n\t\t\t),\n\t\t),\n\t\tLambdaReduce(lambda x,y: x+y), # CAddTable,\n\t),\n\tnn.Conv2d(64,1,(3, 3),(1, 1),(1, 1)),\n)"
  },
  {
    "path": "MegaDepth/rmse_error_main.py",
    "content": "import time\nimport torch\nimport sys\n\nfrom options.train_options import TrainOptions\nopt = TrainOptions().parse()  # set CUDA_VISIBLE_DEVICES before import torch\nfrom data.data_loader import CreateDataLoader\nfrom models.models import create_model\n\ndataset_root = \"/phoenix/S6/zl548/\"\ntest_list_dir_l = '/phoenix/S6/zl548/MegaDpeth_code/test_list/landscape/'\ninput_height = 240\ninput_width = 320\nis_flipped = False\nshuffle = False\n\ntest_data_loader_l = CreateDataLoader(dataset_root, test_list_dir_l, input_height, input_width, is_flipped, shuffle)\ntest_dataset_l = test_data_loader_l.load_data()\ntest_dataset_size_l = len(test_data_loader_l)\nprint('========================= test images = %d' % test_dataset_size_l)\ntest_list_dir_p = '/phoenix/S6/zl548/MegaDpeth_code/test_list/portrait/'\ninput_height = 320\ninput_width = 240\ntest_data_loader_p = CreateDataLoader(dataset_root, test_list_dir_p, input_height, input_width, is_flipped, shuffle)\ntest_dataset_p = test_data_loader_p.load_data()\ntest_dataset_size_p = len(test_data_loader_p)\nprint('========================= test images = %d' % test_dataset_size_p)\n\n\nmodel = create_model(opt)\n\n\ndef test(model):\n    total_loss =0 \n    toal_count = 0\n    print(\"============================= TEST ============================\")\n    model.switch_to_eval()\n    for i, data in enumerate(test_dataset_l):\n        stacked_img = data['img_1']\n        targets = data['target_1']    \n\n        rmse_loss , count = model.evaluate_sc_inv(stacked_img, targets)\n\n        total_loss += rmse_loss\n        toal_count += count\n\n        print('RMSE loss is', total_loss/float(toal_count))\n\n    for i, data in enumerate(test_dataset_p):\n        stacked_img = data['img_1']\n        targets = data['target_1']    \n        rmse_loss , count = model.evaluate_sc_inv(stacked_img, targets)\n\n        total_loss += rmse_loss\n        toal_count += count\n\n        print('RMSE loss is', 
total_loss/float(toal_count))\n\n\n    print('average RMSE loss is', total_loss/float(toal_count))\n\nprint(\"WE ARE IN TESTING RMSE!!!!\")\ntest(model)\nprint(\"WE ARE DONE TESTING!!!\")\n\n\nprint(\"We are done\")\n"
  },
  {
    "path": "MegaDepth/util/__init__.py",
    "content": ""
  },
  {
    "path": "MegaDepth/util/html.py",
    "content": "import dominate\nfrom dominate.tags import *\nimport os\n\n\nclass HTML:\n    def __init__(self, web_dir, title, reflesh=0):\n        self.title = title\n        self.web_dir = web_dir\n        self.img_dir = os.path.join(self.web_dir, 'images')\n        if not os.path.exists(self.web_dir):\n            os.makedirs(self.web_dir)\n        if not os.path.exists(self.img_dir):\n            os.makedirs(self.img_dir)\n        # print(self.img_dir)\n\n        self.doc = dominate.document(title=title)\n        if reflesh > 0:\n            with self.doc.head:\n                meta(http_equiv=\"reflesh\", content=str(reflesh))\n\n    def get_image_dir(self):\n        return self.img_dir\n\n    def add_header(self, str):\n        with self.doc:\n            h3(str)\n\n    def add_table(self, border=1):\n        self.t = table(border=border, style=\"table-layout: fixed;\")\n        self.doc.add(self.t)\n\n    def add_images(self, ims, txts, links, width=400):\n        self.add_table()\n        with self.t:\n            with tr():\n                for im, txt, link in zip(ims, txts, links):\n                    with td(style=\"word-wrap: break-word;\", halign=\"center\", valign=\"top\"):\n                        with p():\n                            with a(href=os.path.join('images', link)):\n                                img(style=\"width:%dpx\" % width, src=os.path.join('images', im))\n                            br()\n                            p(txt)\n\n    def save(self):\n        html_file = '%s/index.html' % self.web_dir\n        f = open(html_file, 'wt')\n        f.write(self.doc.render())\n        f.close()\n\n\nif __name__ == '__main__':\n    html = HTML('web/', 'test_html')\n    html.add_header('hello world')\n\n    ims = []\n    txts = []\n    links = []\n    for n in range(4):\n        ims.append('image_%d.png' % n)\n        txts.append('text_%d' % n)\n        links.append('image_%d.png' % n)\n    html.add_images(ims, txts, links)\n    
html.save()\n"
  },
  {
    "path": "MegaDepth/util/image_pool.py",
    "content": "import random\nimport numpy as np\nimport torch\nfrom pdb import set_trace as st\nfrom torch.autograd import Variable\nclass ImagePool():\n    def __init__(self, pool_size):\n        self.pool_size = pool_size\n        if self.pool_size > 0:\n            self.num_imgs = 0\n            self.images = []\n\n    def query(self, images):\n        if self.pool_size == 0:\n            return images\n        return_images = []\n        for image in images.data:\n            image = torch.unsqueeze(image, 0)\n            if self.num_imgs < self.pool_size:\n                self.num_imgs = self.num_imgs + 1\n                self.images.append(image)\n                return_images.append(image)\n            else:\n                p = random.uniform(0, 1)\n                if p > 0.5:\n                    random_id = random.randint(0, self.pool_size-1)\n                    tmp = self.images[random_id].clone()\n                    self.images[random_id] = image\n                    return_images.append(tmp)\n                else:\n                    return_images.append(image)\n        return_images = Variable(torch.cat(return_images, 0))\n        return return_images\n"
  },
  {
    "path": "MegaDepth/util/png.py",
    "content": "import struct\nimport zlib\n\ndef encode(buf, width, height):\n  \"\"\" buf: must be bytes or a bytearray in py3, a regular string in py2. formatted RGBRGB... \"\"\"\n  assert (width * height * 3 == len(buf))\n  bpp = 3\n\n  def raw_data():\n    # reverse the vertical line order and add null bytes at the start\n    row_bytes = width * bpp\n    for row_start in range((height - 1) * width * bpp, -1, -row_bytes):\n      yield b'\\x00'\n      yield buf[row_start:row_start + row_bytes]\n\n  def chunk(tag, data):\n    return [\n        struct.pack(\"!I\", len(data)),\n        tag,\n        data,\n        struct.pack(\"!I\", 0xFFFFFFFF & zlib.crc32(data, zlib.crc32(tag)))\n      ]\n\n  SIGNATURE = b'\\x89PNG\\r\\n\\x1a\\n'\n  COLOR_TYPE_RGB = 2\n  COLOR_TYPE_RGBA = 6\n  bit_depth = 8\n  return b''.join(\n      [ SIGNATURE ] +\n      chunk(b'IHDR', struct.pack(\"!2I5B\", width, height, bit_depth, COLOR_TYPE_RGB, 0, 0, 0)) +\n      chunk(b'IDAT', zlib.compress(b''.join(raw_data()), 9)) +\n      chunk(b'IEND', b'')\n    )\n"
  },
  {
    "path": "MegaDepth/util/util.py",
    "content": "from __future__ import print_function\nimport torch\nimport numpy as np\nfrom PIL import Image\nimport inspect, re\nimport numpy as np\nimport os\nimport collections\n\n# Converts a Tensor into a Numpy array\n# |imtype|: the desired type of the converted numpy array\ndef tensor2im(image_tensor, imtype=np.uint8):\n    image_numpy = image_tensor[0].cpu().float().numpy()\n    image_numpy = (np.transpose(image_numpy, (1, 2, 0)) + 1) / 2.0 * 255.0\n    return image_numpy.astype(imtype)\n\n\ndef diagnose_network(net, name='network'):\n    mean = 0.0\n    count = 0\n    for param in net.parameters():\n        if param.grad is not None:\n            mean += torch.mean(torch.abs(param.grad.data))\n            count += 1\n    if count > 0:\n        mean = mean / count\n    print(name)\n    print(mean)\n\n\ndef save_image(image_numpy, image_path):\n    image_pil = Image.fromarray(image_numpy)\n    image_pil.save(image_path)\n\ndef info(object, spacing=10, collapse=1):\n    \"\"\"Print methods and doc strings.\n    Takes module, class, list, dictionary, or string.\"\"\"\n    methodList = [e for e in dir(object) if isinstance(getattr(object, e), collections.Callable)]\n    processFunc = collapse and (lambda s: \" \".join(s.split())) or (lambda s: s)\n    print( \"\\n\".join([\"%s %s\" %\n                     (method.ljust(spacing),\n                      processFunc(str(getattr(object, method).__doc__)))\n                     for method in methodList]) )\n\ndef varname(p):\n    for line in inspect.getframeinfo(inspect.currentframe().f_back)[3]:\n        m = re.search(r'\\bvarname\\s*\\(\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*\\)', line)\n        if m:\n            return m.group(1)\n\ndef print_numpy(x, val=True, shp=False):\n    x = x.astype(np.float64)\n    if shp:\n        print('shape,', x.shape)\n    if val:\n        x = x.flatten()\n        print('mean = %3.3f, min = %3.3f, max = %3.3f, median = %3.3f, std=%3.3f' % (\n            np.mean(x), np.min(x), np.max(x), 
np.median(x), np.std(x)))\n\n\ndef mkdirs(paths):\n    if isinstance(paths, list) and not isinstance(paths, str):\n        for path in paths:\n            mkdir(path)\n    else:\n        mkdir(paths)\n\n\ndef mkdir(path):\n    if not os.path.exists(path):\n        os.makedirs(path)\n"
  },
  {
    "path": "MegaDepth/util/visualizer.py",
    "content": "import numpy as np\nimport os\nimport ntpath\nimport time\nfrom . import util\nfrom . import html\n\nclass Visualizer():\n    def __init__(self, opt):\n        # self.opt = opt\n        self.display_id = opt.display_id\n        self.use_html = opt.isTrain and not opt.no_html\n        self.win_size = opt.display_winsize\n        self.name = opt.name\n        if self.display_id > 0:\n            import visdom\n            self.vis = visdom.Visdom()\n\n        if self.use_html:\n            self.web_dir = os.path.join(opt.checkpoints_dir, opt.name, 'web')\n            self.img_dir = os.path.join(self.web_dir, 'images')\n            print('create web directory %s...' % self.web_dir)\n            util.mkdirs([self.web_dir, self.img_dir])\n\n\n    # |visuals|: dictionary of images to display or save\n    def display_current_results(self, visuals, epoch):\n        if self.display_id > 0: # show images in the browser\n            idx = 1\n            for label, image_numpy in visuals.items():\n                #image_numpy = np.flipud(image_numpy)\n                self.vis.image(image_numpy.transpose([2,0,1]), opts=dict(title=label),\n                                   win=self.display_id + idx)\n                idx += 1\n\n        if self.use_html: # save images to a html file\n            for label, image_numpy in visuals.items():\n                img_path = os.path.join(self.img_dir, 'epoch%.3d_%s.png' % (epoch, label))\n                util.save_image(image_numpy, img_path)\n            # update website\n            webpage = html.HTML(self.web_dir, 'Experiment name = %s' % self.name, reflesh=1)\n            for n in range(epoch, 0, -1):\n                webpage.add_header('epoch [%d]' % n)\n                ims = []\n                txts = []\n                links = []\n\n                for label, image_numpy in visuals.items():\n                    img_path = 'epoch%.3d_%s.png' % (n, label)\n                    ims.append(img_path)\n                   
 txts.append(label)\n                    links.append(img_path)\n                webpage.add_images(ims, txts, links, width=self.win_size)\n            webpage.save()\n\n    # errors: dictionary of error labels and values\n    def plot_current_errors(self, epoch, counter_ratio, opt, errors):\n        if not hasattr(self, 'plot_data'):\n            self.plot_data = {'X':[],'Y':[], 'legend':list(errors.keys())}\n        self.plot_data['X'].append(epoch + counter_ratio)\n        self.plot_data['Y'].append([errors[k] for k in self.plot_data['legend']])\n        self.vis.line(\n            X=np.stack([np.array(self.plot_data['X'])]*len(self.plot_data['legend']),1),\n            Y=np.array(self.plot_data['Y']),\n            opts={\n                'title': self.name + ' loss over time',\n                'legend': self.plot_data['legend'],\n                'xlabel': 'epoch',\n                'ylabel': 'loss'},\n            win=self.display_id)\n\n    # errors: same format as |errors| of plot_current_errors\n    def print_current_errors(self, epoch, i, errors, t):\n        message = '(epoch: %d, iters: %d, time: %.3f) ' % (epoch, i, t)\n        for k, v in errors.items():\n            message += '%s: %.3f ' % (k, v)\n\n        print(message)\n\n    # save image to the disk\n    def save_images(self, webpage, visuals, image_path):\n        image_dir = webpage.get_image_dir()\n        short_path = ntpath.basename(image_path[0])\n        name = os.path.splitext(short_path)[0]\n\n        webpage.add_header(name)\n        ims = []\n        txts = []\n        links = []\n\n        for label, image_numpy in visuals.items():\n            image_name = '%s_%s.png' % (name, label)\n            save_path = os.path.join(image_dir, image_name)\n            util.save_image(image_numpy, save_path)\n\n            ims.append(image_name)\n            txts.append(label)\n            links.append(image_name)\n        webpage.add_images(ims, txts, links, width=self.win_size)\n"
  },
  {
    "path": "PWCNet/PWCNet.py",
    "content": "\"\"\"\nimplementation of the PWC-DC network for optical flow estimation by Sun et al., 2018\n\nJinwei Gu and Zhile Ren\n\n\"\"\"\n\nimport torch\nimport torch.nn as nn\nfrom torch.autograd import Variable\nimport os\nos.environ['PYTHON_EGG_CACHE'] = 'tmp/' # a writable directory \n#from .correlation_package.modules.corr import Correlation\n# from PWCNet.correlation_package_pytorch0_4.correlation import Correlation #pytorch0.4 version\nfrom PWCNet.correlation_package_pytorch1_0.correlation import Correlation #pytorch0.4 version\n\nimport numpy as np\n\n\n\n\n\n__all__ = [\n    'pwc_dc_net', 'pwc_dc_net_old'\n    ]\n\ndef conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):   \n    return nn.Sequential(\n            nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, \n                        padding=padding, dilation=dilation, bias=True),\n            nn.LeakyReLU(0.1))\n\ndef predict_flow(in_planes):\n    return nn.Conv2d(in_planes,2,kernel_size=3,stride=1,padding=1,bias=True)\n\ndef deconv(in_planes, out_planes, kernel_size=4, stride=2, padding=1):\n    return nn.ConvTranspose2d(in_planes, out_planes, kernel_size, stride, padding, bias=True)\n\nimport time\n\nclass PWCDCNet(nn.Module):\n    \"\"\"\n    PWC-DC net. add dilation convolution and densenet connections\n\n    \"\"\"\n    def __init__(self, md=4):\n        \"\"\"\n        input: md --- maximum displacement (for correlation. 
default: 4), after warpping\n\n        \"\"\"\n        super(PWCDCNet,self).__init__()\n\n        self.conv1a  = conv(3,   16, kernel_size=3, stride=2)\n        self.conv1aa = conv(16,  16, kernel_size=3, stride=1)\n        self.conv1b  = conv(16,  16, kernel_size=3, stride=1)\n        self.conv2a  = conv(16,  32, kernel_size=3, stride=2)\n        self.conv2aa = conv(32,  32, kernel_size=3, stride=1)\n        self.conv2b  = conv(32,  32, kernel_size=3, stride=1)\n        self.conv3a  = conv(32,  64, kernel_size=3, stride=2)\n        self.conv3aa = conv(64,  64, kernel_size=3, stride=1)\n        self.conv3b  = conv(64,  64, kernel_size=3, stride=1)\n        self.conv4a  = conv(64,  96, kernel_size=3, stride=2)\n        self.conv4aa = conv(96,  96, kernel_size=3, stride=1)\n        self.conv4b  = conv(96,  96, kernel_size=3, stride=1)\n        self.conv5a  = conv(96, 128, kernel_size=3, stride=2)\n        self.conv5aa = conv(128,128, kernel_size=3, stride=1)\n        self.conv5b  = conv(128,128, kernel_size=3, stride=1)\n        self.conv6aa = conv(128,196, kernel_size=3, stride=2)\n        self.conv6a  = conv(196,196, kernel_size=3, stride=1)\n        self.conv6b  = conv(196,196, kernel_size=3, stride=1)\n\n        self.corr    = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)\n        self.leakyRELU = nn.LeakyReLU(0.1)\n        \n        nd = (2*md+1)**2\n        dd = np.cumsum([128,128,96,64,32],dtype=np.int32).astype(np.int)\n        dd = [int(d) for d in dd]\n\n        od = nd\n        self.conv6_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv6_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv6_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv6_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv6_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)        \n        self.predict_flow6 = predict_flow(od+dd[4])\n        self.deconv6 = deconv(2, 2, 
kernel_size=4, stride=2, padding=1) \n        self.upfeat6 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+128+4\n        self.conv5_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv5_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv5_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv5_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv5_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow5 = predict_flow(od+dd[4]) \n        self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+96+4\n        self.conv4_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv4_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv4_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv4_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow4 = predict_flow(od+dd[4]) \n        self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+64+4\n        self.conv3_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv3_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv3_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv3_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow3 = predict_flow(od+dd[4]) \n        self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+32+4\n        self.conv2_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv2_1 = 
conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv2_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv2_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv2_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow2 = predict_flow(od+dd[4]) \n        self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        \n        self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1,  dilation=1)\n        self.dc_conv2 = conv(128,      128, kernel_size=3, stride=1, padding=2,  dilation=2)\n        self.dc_conv3 = conv(128,      128, kernel_size=3, stride=1, padding=4,  dilation=4)\n        self.dc_conv4 = conv(128,      96,  kernel_size=3, stride=1, padding=8,  dilation=8)\n        self.dc_conv5 = conv(96,       64,  kernel_size=3, stride=1, padding=16, dilation=16)\n        self.dc_conv6 = conv(64,       32,  kernel_size=3, stride=1, padding=1,  dilation=1)\n        self.dc_conv7 = predict_flow(32)\n\n        for m in self.modules():\n            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):\n                nn.init.kaiming_normal_(m.weight.data, mode='fan_in')\n                if m.bias is not None:\n                    m.bias.data.zero_()\n\n        W_MAX = 2048\n        H_MAX = 1024\n        B_MAX = 3\n        xx = torch.arange(0, W_MAX).view(1,-1).cuda().repeat(H_MAX,1)\n        yy = torch.arange(0, H_MAX).view(-1,1).cuda().repeat(1,W_MAX)\n        xx = xx.view(1,1,H_MAX,W_MAX).repeat(B_MAX,1,1,1)\n        yy = yy.view(1,1,H_MAX,W_MAX).repeat(B_MAX,1,1,1)\n        grid = torch.cat((xx,yy),1).float()\n\n        ## for saving time on allocating a grid in forward\n        self.W_MAX = W_MAX\n        self.H_MAX = H_MAX\n        self.B_MAX = B_MAX\n        self.grid = Variable(grid, requires_grad=False)\n        # self.mask_base = Variable(torch.cuda.FloatTensor().resize_(B_MAX,).zero_() + 1)\n\n\n    def warp(self, x, flo):\n        \"\"\"\n        warp an 
image/tensor (im2) back to im1, according to the optical flow\n\n        x: [B, C, H, W] (im2)\n        flo: [B, 2, H, W] flow\n\n        \"\"\"\n        B, C, H, W = x.size()\n        # mesh grid \n        # xx = torch.arange(0, W).view(1,-1).cuda().repeat(H,1)\n        # yy = torch.arange(0, H).view(-1,1).cuda().repeat(1,W)\n        # xx = xx.view(1,1,H,W).repeat(B,1,1,1)\n        # yy = yy.view(1,1,H,W).repeat(B,1,1,1)\n        # grid = torch.cat((xx,yy),1).float()\n\n        # # if x.is_cuda:\n        # #     grid = grid.cuda()\n        # vgrid = Variable(grid) + flo\n        assert(B <= self.B_MAX and H <= self.H_MAX and W <= self.W_MAX)\n        vgrid = self.grid[:B,:,:H,:W] +flo\n\n        # scale grid to [-1,1] \n        vgrid[:,0,:,:] = 2.0*vgrid[:,0,:,:].clone()/max(W-1,1)-1.0\n        vgrid[:,1,:,:] = 2.0*vgrid[:,1,:,:].clone()/max(H-1,1)-1.0\n\n\n        vgrid = vgrid.permute(0,2,3,1)        \n        output = nn.functional.grid_sample(x, vgrid)\n        # mask = torch.autograd.Variable(torch.ones(x.size())).cuda()\n        mask = torch.autograd.Variable(torch.cuda.FloatTensor().resize_(x.size()).zero_() + 1, requires_grad = False)\n        mask = nn.functional.grid_sample(mask, vgrid)\n\n        # if W==128:\n            # np.save('mask.npy', mask.cpu().data.numpy())\n            # np.save('warp.npy', output.cpu().data.numpy())\n        \n        mask[mask<0.9999] = 0\n        mask[mask>0] = 1\n        \n        return output*mask\n\n\n    def forward(self,x, output_more = False):\n        im1 = x[:,:3,:,:]\n        im2 = x[:,3:,:,:]\n        # print(\"\\n\\n***************************PWC Net details *************** \\n\\n\")\n        # start=  time.time()\n        c11 = self.conv1b(self.conv1aa(self.conv1a(im1)))\n        c21 = self.conv1b(self.conv1aa(self.conv1a(im2)))\n        c12 = self.conv2b(self.conv2aa(self.conv2a(c11)))\n        c22 = self.conv2b(self.conv2aa(self.conv2a(c21)))\n        c13 = self.conv3b(self.conv3aa(self.conv3a(c12)))\n      
  c23 = self.conv3b(self.conv3aa(self.conv3a(c22)))\n        c14 = self.conv4b(self.conv4aa(self.conv4a(c13)))\n        c24 = self.conv4b(self.conv4aa(self.conv4a(c23)))\n        c15 = self.conv5b(self.conv5aa(self.conv5a(c14)))\n        c25 = self.conv5b(self.conv5aa(self.conv5a(c24)))\n        c16 = self.conv6b(self.conv6a(self.conv6aa(c15)))\n        c26 = self.conv6b(self.conv6a(self.conv6aa(c25)))\n        # print(\"features \" +str(time.time()- start))\n        # start=  time.time()\n        corr6 = self.corr(c16, c26) \n        corr6 = self.leakyRELU(corr6)   \n\n\n        x = torch.cat((self.conv6_0(corr6), corr6),1)\n        x = torch.cat((self.conv6_1(x), x),1)\n        x = torch.cat((self.conv6_2(x), x),1)\n        x = torch.cat((self.conv6_3(x), x),1)\n        x = torch.cat((self.conv6_4(x), x),1)\n        flow6 = self.predict_flow6(x)\n        up_flow6 = self.deconv6(flow6)\n        up_feat6 = self.upfeat6(x)\n        # print(\"level6 \" +str(time.time()- start))\n        # start=  time.time()\n        \n        warp5 = self.warp(c25, up_flow6*0.625)\n        # print(\"level5_1 \" + str(time.time() - start))\n        # start5 = time.time()\n        corr5 = self.corr(c15, warp5)\n        # print(\"level5_2 \" + str(time.time() - start5))\n        # start5 = time.time()\n        corr5 = self.leakyRELU(corr5)\n\n        x = torch.cat((corr5, c15, up_flow6, up_feat6), 1)\n        x = torch.cat((self.conv5_0(x), x),1)\n        x = torch.cat((self.conv5_1(x), x),1)\n        x = torch.cat((self.conv5_2(x), x),1)\n        x = torch.cat((self.conv5_3(x), x),1)\n        x = torch.cat((self.conv5_4(x), x),1)\n\n        flow5 = self.predict_flow5(x)\n        up_flow5 = self.deconv5(flow5)\n        up_feat5 = self.upfeat5(x)\n        # print(\"level5_3 \" + str(time.time() - start5))\n        # print(\"level5 \" + str(time.time() - start))\n        # start = time.time()\n\n        warp4 = self.warp(c24, up_flow5*1.25)\n        corr4 = self.corr(c14, warp4)  \n      
  corr4 = self.leakyRELU(corr4)\n        x = torch.cat((corr4, c14, up_flow5, up_feat5), 1)\n        x = torch.cat((self.conv4_0(x), x),1)\n        x = torch.cat((self.conv4_1(x), x),1)\n        x = torch.cat((self.conv4_2(x), x),1)\n        x = torch.cat((self.conv4_3(x), x),1)\n        x = torch.cat((self.conv4_4(x), x),1)\n        flow4 = self.predict_flow4(x)\n        up_flow4 = self.deconv4(flow4)\n        up_feat4 = self.upfeat4(x)\n\n        # print(\"level4 \" + str(time.time() - start))\n        # start = time.time()\n\n        warp3 = self.warp(c23, up_flow4*2.5)\n        corr3 = self.corr(c13, warp3) \n        corr3 = self.leakyRELU(corr3)\n        \n\n        x = torch.cat((corr3, c13, up_flow4, up_feat4), 1)\n        x = torch.cat((self.conv3_0(x), x),1)\n        x = torch.cat((self.conv3_1(x), x),1)\n        x = torch.cat((self.conv3_2(x), x),1)\n        x = torch.cat((self.conv3_3(x), x),1)\n        x = torch.cat((self.conv3_4(x), x),1)\n        flow3 = self.predict_flow3(x)\n        up_flow3 = self.deconv3(flow3)\n        up_feat3 = self.upfeat3(x)\n\n        # print(\"level3 \" + str(time.time() - start))\n        # start = time.time()\n\n        warp2 = self.warp(c22, up_flow3*5.0) \n        corr2 = self.corr(c12, warp2)\n        corr2 = self.leakyRELU(corr2)\n        x = torch.cat((corr2, c12, up_flow3, up_feat3), 1)\n        x = torch.cat((self.conv2_0(x), x),1)\n        x = torch.cat((self.conv2_1(x), x),1)\n        x = torch.cat((self.conv2_2(x), x),1)\n        x = torch.cat((self.conv2_3(x), x),1)\n        x = torch.cat((self.conv2_4(x), x),1)\n        flow2 = self.predict_flow2(x)\n        # print(\"level2 \" + str(time.time() - start))\n        # start = time.time()\n\n        x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))\n        flow2 += self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))\n        # print(\"refine \" + str(time.time() - start))\n        # start = time.time()\n\n        # we don't have the gt for flow, we 
just fine tune it on flownets\n        if not output_more:\n            return flow2\n        else:\n            return [flow2,flow3,flow4,flow5,flow6]\n        # if self.training:\n        #     return flow2,flow3,flow4,flow5,flow6\n        # else:\n        #     return flow2\n\n\n\nclass PWCDCNet_old(nn.Module):\n    \"\"\"\n    PWC-DC net. add dilation convolution and densenet connections\n\n    \"\"\"\n    def __init__(self, md=4):\n        \"\"\"\n        input: md --- maximum displacement (for correlation. default: 4), after warpping\n\n        \"\"\"\n        super(PWCDCNet_old,self).__init__()\n\n        self.conv1a  = conv(3,   16, kernel_size=3, stride=2)\n        self.conv1b  = conv(16,  16, kernel_size=3, stride=1)\n        self.conv2a  = conv(16,  32, kernel_size=3, stride=2)\n        self.conv2b  = conv(32,  32, kernel_size=3, stride=1)\n        self.conv3a  = conv(32,  64, kernel_size=3, stride=2)\n        self.conv3b  = conv(64,  64, kernel_size=3, stride=1)\n        self.conv4a  = conv(64,  96, kernel_size=3, stride=2)\n        self.conv4b  = conv(96,  96, kernel_size=3, stride=1)\n        self.conv5a  = conv(96, 128, kernel_size=3, stride=2)\n        self.conv5b  = conv(128,128, kernel_size=3, stride=1)\n        self.conv6a  = conv(128,196, kernel_size=3, stride=2)\n        self.conv6b  = conv(196,196, kernel_size=3, stride=1)\n\n        self.corr    = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)\n        self.leakyRELU = nn.LeakyReLU(0.1)\n        \n        nd = (2*md+1)**2\n        dd = np.cumsum([128,128,96,64,32])\n\n        od = nd\n        self.conv6_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv6_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv6_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv6_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv6_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)        \n        
self.predict_flow6 = predict_flow(od+dd[4])\n        self.deconv6 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        self.upfeat6 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+128+4\n        self.conv5_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv5_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv5_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv5_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv5_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow5 = predict_flow(od+dd[4]) \n        self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+96+4\n        self.conv4_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv4_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv4_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv4_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow4 = predict_flow(od+dd[4]) \n        self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+64+4\n        self.conv3_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv3_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv3_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv3_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow3 = predict_flow(od+dd[4]) \n        self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+32+4\n        self.conv2_0 = 
conv(od,      128, kernel_size=3, stride=1)\n        self.conv2_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv2_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv2_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv2_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow2 = predict_flow(od+dd[4]) \n        self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        \n        self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1,  dilation=1)\n        self.dc_conv2 = conv(128,      128, kernel_size=3, stride=1, padding=2,  dilation=2)\n        self.dc_conv3 = conv(128,      128, kernel_size=3, stride=1, padding=4,  dilation=4)\n        self.dc_conv4 = conv(128,      96,  kernel_size=3, stride=1, padding=8,  dilation=8)\n        self.dc_conv5 = conv(96,       64,  kernel_size=3, stride=1, padding=16, dilation=16)\n        self.dc_conv6 = conv(64,       32,  kernel_size=3, stride=1, padding=1,  dilation=1)\n        self.dc_conv7 = predict_flow(32)\n\n        for m in self.modules():\n            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):\n                nn.init.kaiming_normal(m.weight.data, mode='fan_in')\n                if m.bias is not None:\n                    m.bias.data.zero_()\n\n\n    def warp(self, x, flo):\n        \"\"\"\n        warp an image/tensor (im2) back to im1, according to the optical flow\n\n        x: [B, C, H, W] (im2)\n        flo: [B, 2, H, W] flow\n\n        \"\"\"\n        B, C, H, W = x.size()\n        # mesh grid \n        xx = torch.arange(0, W).view(1,-1).repeat(H,1)\n        yy = torch.arange(0, H).view(-1,1).repeat(1,W)\n        xx = xx.view(1,1,H,W).repeat(B,1,1,1)\n        yy = yy.view(1,1,H,W).repeat(B,1,1,1)\n        grid = torch.cat((xx,yy),1).float()\n\n        if x.is_cuda:\n            grid = grid.cuda()\n        vgrid = Variable(grid) + flo\n\n        # scale grid to [-1,1] \n        vgrid[:,0,:,:] = 
2.0*vgrid[:,0,:,:]/max(W-1,1)-1.0\n        vgrid[:,1,:,:] = 2.0*vgrid[:,1,:,:]/max(H-1,1)-1.0\n\n        vgrid = vgrid.permute(0,2,3,1)        \n        output = nn.functional.grid_sample(x, vgrid)\n        mask = torch.autograd.Variable(torch.ones(x.size())).cuda()\n        mask = nn.functional.grid_sample(mask, vgrid)\n        \n        mask[mask<0.999] = 0\n        mask[mask>0] = 1\n        \n        return output*mask\n\n\n    def forward(self,x):\n        im1 = x[:,:3,:,:]\n        im2 = x[:,3:,:,:]\n        \n        c11 = self.conv1b(self.conv1a(im1))\n        c21 = self.conv1b(self.conv1a(im2))\n        c12 = self.conv2b(self.conv2a(c11))\n        c22 = self.conv2b(self.conv2a(c21))\n        c13 = self.conv3b(self.conv3a(c12))\n        c23 = self.conv3b(self.conv3a(c22))\n        c14 = self.conv4b(self.conv4a(c13))\n        c24 = self.conv4b(self.conv4a(c23))        \n        c15 = self.conv5b(self.conv5a(c14))\n        c25 = self.conv5b(self.conv5a(c24))\n        c16 = self.conv6b(self.conv6a(c15))\n        c26 = self.conv6b(self.conv6a(c25))\n        \n        corr6 = self.corr(c16, c26) \n        corr6 = self.leakyRELU(corr6)        \n        x = torch.cat((corr6, self.conv6_0(corr6)),1)\n        x = torch.cat((self.conv6_1(x), x),1)\n        x = torch.cat((x, self.conv6_2(x)),1)\n        x = torch.cat((x, self.conv6_3(x)),1)\n        x = torch.cat((x, self.conv6_4(x)),1)\n        flow6 = self.predict_flow6(x)\n        up_flow6 = self.deconv6(flow6)\n        up_feat6 = self.upfeat6(x)\n        \n        warp5 = self.warp(c25, up_flow6*0.625)\n        corr5 = self.corr(c15, warp5) \n        corr5 = self.leakyRELU(corr5)\n        x = torch.cat((corr5, c15, up_flow6, up_feat6), 1)\n        x = torch.cat((x, self.conv5_0(x)),1)\n        x = torch.cat((self.conv5_1(x), x),1)\n        x = torch.cat((x, self.conv5_2(x)),1)\n        x = torch.cat((x, self.conv5_3(x)),1)\n        x = torch.cat((x, self.conv5_4(x)),1)\n        flow5 = self.predict_flow5(x)\n       
 up_flow5 = self.deconv5(flow5)\n        up_feat5 = self.upfeat5(x)\n        \n        warp4 = self.warp(c24, up_flow5*1.25)\n        corr4 = self.corr(c14, warp4)  \n        corr4 = self.leakyRELU(corr4)\n        x = torch.cat((corr4, c14, up_flow5, up_feat5), 1)\n        x = torch.cat((x, self.conv4_0(x)),1)\n        x = torch.cat((self.conv4_1(x), x),1)\n        x = torch.cat((x, self.conv4_2(x)),1)\n        x = torch.cat((x, self.conv4_3(x)),1)\n        x = torch.cat((x, self.conv4_4(x)),1)\n        flow4 = self.predict_flow4(x)\n        up_flow4 = self.deconv4(flow4)\n        up_feat4 = self.upfeat4(x)\n\n        warp3 = self.warp(c23, up_flow4*2.5)\n        corr3 = self.corr(c13, warp3) \n        corr3 = self.leakyRELU(corr3)\n        x = torch.cat((corr3, c13, up_flow4, up_feat4), 1)\n        x = torch.cat((x, self.conv3_0(x)),1)\n        x = torch.cat((self.conv3_1(x), x),1)\n        x = torch.cat((x, self.conv3_2(x)),1)\n        x = torch.cat((x, self.conv3_3(x)),1)\n        x = torch.cat((x, self.conv3_4(x)),1)\n        flow3 = self.predict_flow3(x)\n        up_flow3 = self.deconv3(flow3)\n        up_feat3 = self.upfeat3(x)\n        \n        warp2 = self.warp(c22, up_flow3*5.0) \n        corr2 = self.corr(c12, warp2)\n        corr2 = self.leakyRELU(corr2)\n        x = torch.cat((corr2, c12, up_flow3, up_feat3), 1)\n        x = torch.cat((x, self.conv2_0(x)),1)\n        x = torch.cat((self.conv2_1(x), x),1)\n        x = torch.cat((x, self.conv2_2(x)),1)\n        x = torch.cat((x, self.conv2_3(x)),1)\n        x = torch.cat((x, self.conv2_4(x)),1)\n        flow2 = self.predict_flow2(x)\n \n        x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))\n        flow2 += self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))\n        \n        if self.training:\n            return flow2,flow3,flow4,flow5,flow6\n        else:\n            return flow2\n\n\n\n\n\ndef pwc_dc_net(path=None):\n\n    model = PWCDCNet()\n    if path is not None:\n        data = 
torch.load(path)\n        if 'state_dict' in data.keys():\n            model.load_state_dict(data['state_dict'])\n        else:\n            model.load_state_dict(data)\n    return model\n\n\n\n\ndef pwc_dc_net_old(path=None):\n\n    model = PWCDCNet_old()\n    if path is not None:\n        data = torch.load(path)\n        if 'state_dict' in data.keys():\n            model.load_state_dict(data['state_dict'])\n        else:\n            model.load_state_dict(data)\n    return model\n"
  },
  {
    "path": "PWCNet/__init__.py",
    "content": "from .PWCNet import *"
  },
  {
    "path": "PWCNet/correlation_package_pytorch1_0/__init__.py",
    "content": ""
  },
  {
    "path": "PWCNet/correlation_package_pytorch1_0/build.sh",
    "content": "#!/usr/bin/env bash\n\necho \"Need pytorch>=1.0.0\"\nsource activate pytorch1.0.0\n\nexport PYTHONPATH=$PYTHONPATH:$(pwd)/../../my_package\n\nrm -rf build *.egg-info dist\npython setup.py install\n"
  },
  {
    "path": "PWCNet/correlation_package_pytorch1_0/clean.sh",
    "content": "#!/usr/bin/env bash\n\necho \"Need pytorch>=1.0.0\"\nsource activate pytorch1.0.0\n\n\nrm -rf build *.egg-info dist\n#python setup.py install\n"
  },
  {
    "path": "PWCNet/correlation_package_pytorch1_0/correlation.py",
    "content": "import torch\nfrom torch.nn.modules.module import Module\nfrom torch.autograd import Function\nimport correlation_cuda\n\nclass CorrelationFunction(Function):\n\n    def __init__(self, pad_size=3, kernel_size=3, max_displacement=20, stride1=1, stride2=2, corr_multiply=1):\n        super(CorrelationFunction, self).__init__()\n        self.pad_size = pad_size\n        self.kernel_size = kernel_size\n        self.max_displacement = max_displacement\n        self.stride1 = stride1\n        self.stride2 = stride2\n        self.corr_multiply = corr_multiply\n        # self.out_channel = ((max_displacement/stride2)*2 + 1) * ((max_displacement/stride2)*2 + 1)\n\n    def forward(self, input1, input2):\n        self.save_for_backward(input1, input2)\n\n        with torch.cuda.device_of(input1):\n            rbot1 = input1.new()\n            rbot2 = input2.new()\n            output = input1.new()\n\n            correlation_cuda.forward(input1, input2, rbot1, rbot2, output, \n                self.pad_size, self.kernel_size, self.max_displacement,self.stride1, self.stride2, self.corr_multiply)\n\n        return output\n\n    def backward(self, grad_output):\n        input1, input2 = self.saved_tensors\n\n        with torch.cuda.device_of(input1):\n            rbot1 = input1.new()\n            rbot2 = input2.new()\n\n            grad_input1 = input1.new()\n            grad_input2 = input2.new()\n\n            correlation_cuda.backward(input1, input2, rbot1, rbot2, grad_output, grad_input1, grad_input2,\n                self.pad_size, self.kernel_size, self.max_displacement,self.stride1, self.stride2, self.corr_multiply)\n\n        return grad_input1, grad_input2\n\n\nclass Correlation(Module):\n    def __init__(self, pad_size=0, kernel_size=0, max_displacement=0, stride1=1, stride2=2, corr_multiply=1):\n        super(Correlation, self).__init__()\n        self.pad_size = pad_size\n        self.kernel_size = kernel_size\n        self.max_displacement = 
max_displacement\n        self.stride1 = stride1\n        self.stride2 = stride2\n        self.corr_multiply = corr_multiply\n\n    def forward(self, input1, input2):\n\n        result = CorrelationFunction(self.pad_size, self.kernel_size, self.max_displacement,self.stride1, self.stride2, self.corr_multiply)(input1, input2)\n\n        return result\n\n"
  },
  {
    "path": "PWCNet/correlation_package_pytorch1_0/correlation_cuda.cc",
    "content": "#include <torch/torch.h>\n#include <ATen/ATen.h>\n#include <stdio.h>\n#include <iostream>\n#include <ATen/cuda/CUDAContext.h> //works for 1.0.0\n#include \"correlation_cuda_kernel.cuh\"\n\nint correlation_forward_cuda(at::Tensor& input1, at::Tensor& input2, at::Tensor& rInput1, at::Tensor& rInput2, at::Tensor& output,\n                       int pad_size,\n                       int kernel_size,\n                       int max_displacement,\n                       int stride1,\n                       int stride2,\n                       int corr_type_multiply)\n{\n\n  int batchSize = input1.size(0);\n\n  int nInputChannels = input1.size(1);\n  int inputHeight = input1.size(2);\n  int inputWidth = input1.size(3);\n\n  int kernel_radius = (kernel_size - 1) / 2;\n  int border_radius = kernel_radius + max_displacement;\n\n  int paddedInputHeight = inputHeight + 2 * pad_size;\n  int paddedInputWidth = inputWidth + 2 * pad_size;\n\n  int nOutputChannels = ((max_displacement/stride2)*2 + 1) * ((max_displacement/stride2)*2 + 1);\n\n  int outputHeight = ceil(static_cast<float>(paddedInputHeight - 2 * border_radius) / static_cast<float>(stride1));\n  int outputwidth = ceil(static_cast<float>(paddedInputWidth - 2 * border_radius) / static_cast<float>(stride1));\n\n  rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});\n  rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});\n  output.resize_({batchSize, nOutputChannels, outputHeight, outputwidth});\n\n  rInput1.fill_(0);\n  rInput2.fill_(0);\n  output.fill_(0);\n\n  int success = correlation_forward_cuda_kernel(\n    output,\n    output.size(0), \n    output.size(1),\n    output.size(2),\n    output.size(3),\n    output.stride(0),\n    output.stride(1),\n    output.stride(2),\n    output.stride(3),\n    input1,\n    input1.size(1),\n    input1.size(2),\n    input1.size(3),\n    input1.stride(0),\n    input1.stride(1),\n    input1.stride(2),\n    
input1.stride(3),\n    input2,\n    input2.size(1),\n    input2.stride(0),\n    input2.stride(1),\n    input2.stride(2),\n    input2.stride(3),\n    rInput1,\n    rInput2,\n    pad_size,     \n    kernel_size,\n    max_displacement,\n    stride1,\n    stride2,\n    corr_type_multiply,\n//\t\t\tat::globalContext().getCurrentCUDAStream() //works for 0.4.1\n           at::cuda::getCurrentCUDAStream() //works for 1.0.0\n  );\n\n  //check for errors\n  if (!success) {\n    AT_ERROR(\"CUDA call failed\");\n  }\n\n  return 1;\n\n}\n\nint correlation_backward_cuda(at::Tensor& input1, at::Tensor& input2, at::Tensor& rInput1, at::Tensor& rInput2, at::Tensor& gradOutput, \n                       at::Tensor& gradInput1, at::Tensor& gradInput2,\n                       int pad_size,\n                       int kernel_size,\n                       int max_displacement,\n                       int stride1,\n                       int stride2,\n                       int corr_type_multiply)\n{\n\n  int batchSize = input1.size(0);\n  int nInputChannels = input1.size(1);\n  int paddedInputHeight = input1.size(2)+ 2 * pad_size;\n  int paddedInputWidth = input1.size(3)+ 2 * pad_size;\n\n  int height = input1.size(2);\n  int width = input1.size(3);\n\n  rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});\n  rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});\n  gradInput1.resize_({batchSize, nInputChannels, height, width});\n  gradInput2.resize_({batchSize, nInputChannels, height, width});\n\n  rInput1.fill_(0);\n  rInput2.fill_(0);\n  gradInput1.fill_(0);\n  gradInput2.fill_(0);\n\n  int success = correlation_backward_cuda_kernel(gradOutput,\n                                                gradOutput.size(0),\n                                                gradOutput.size(1),\n                                                gradOutput.size(2),\n                                                gradOutput.size(3),\n            
                                    gradOutput.stride(0),\n                                                gradOutput.stride(1),\n                                                gradOutput.stride(2),\n                                                gradOutput.stride(3),\n                                                input1,\n                                                input1.size(1),\n                                                input1.size(2),\n                                                input1.size(3),\n                                                input1.stride(0),\n                                                input1.stride(1),\n                                                input1.stride(2),\n                                                input1.stride(3),\n                                                input2,  \n                                                input2.stride(0),\n                                                input2.stride(1),\n                                                input2.stride(2),\n                                                input2.stride(3),\n                                                gradInput1,\n                                                gradInput1.stride(0),\n                                                gradInput1.stride(1),\n                                                gradInput1.stride(2),\n                                                gradInput1.stride(3),\n                                                gradInput2,\n                                                gradInput2.size(1),\n                                                gradInput2.stride(0),\n                                                gradInput2.stride(1),\n                                                gradInput2.stride(2),\n                                                gradInput2.stride(3),\n                                                rInput1,\n                                                rInput2,\n      
                                          pad_size,\n                                                kernel_size,\n                                                max_displacement,\n                                                stride1, \n                                                stride2,\n                                                corr_type_multiply,\n//\t\t\tat::globalContext().getCurrentCUDAStream() //works for 0.4.1\n           at::cuda::getCurrentCUDAStream() //works for 1.0.0\n                                               );\n\n  if (!success) {\n    AT_ERROR(\"CUDA call failed\");\n  }\n\n  return 1;\n}\n\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n  m.def(\"forward\", &correlation_forward_cuda, \"Correlation forward (CUDA)\");\n  m.def(\"backward\", &correlation_backward_cuda, \"Correlation backward (CUDA)\");\n}\n\n"
  },
  {
    "path": "PWCNet/correlation_package_pytorch1_0/correlation_cuda_kernel.cu",
    "content": "#include <stdio.h>\n\n#include \"correlation_cuda_kernel.cuh\"\n\n#define CUDA_NUM_THREADS 1024\n#define THREADS_PER_BLOCK 32\n#define FULL_MASK 0xffffffff\n\n#include <ATen/ATen.h>\n#include <ATen/NativeFunctions.h>\n#include <ATen/Dispatch.h>\n#include <ATen/cuda/CUDAApplyUtils.cuh>\n\nusing at::Half;\n\ntemplate<typename scalar_t>\n__forceinline__ __device__ scalar_t warpReduceSum(scalar_t val) {\n        for (int offset = 16; offset > 0; offset /= 2)\n                val += __shfl_down_sync(FULL_MASK, val, offset);\n        return val;\n}\n\ntemplate<typename scalar_t>\n__forceinline__ __device__ scalar_t blockReduceSum(scalar_t val) {\n\n        static __shared__ scalar_t shared[32];\n        int lane = threadIdx.x % warpSize;\n        int wid = threadIdx.x / warpSize;\n\n        val = warpReduceSum(val);\n\n        if (lane == 0)\n                shared[wid] = val;\n\n        __syncthreads();\n\n        val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;\n\n        if (wid == 0)\n                val = warpReduceSum(val);\n\n        return val;\n}\n\n\ntemplate <typename scalar_t>\n__global__ void channels_first(const scalar_t* __restrict__ input, scalar_t* rinput, int channels, int height, int width, int pad_size)\n{\n\n    // n (batch size), c (num of channels), y (height), x (width)\n    int n = blockIdx.x;\n    int y = blockIdx.y;\n    int x = blockIdx.z;\n\n    int ch_off = threadIdx.x;\n    scalar_t value;\n\n    int dimcyx = channels * height * width;\n    int dimyx = height * width;\n\n    int p_dimx = (width + 2 * pad_size);\n    int p_dimy = (height + 2 * pad_size);\n    int p_dimyxc = channels * p_dimy * p_dimx;\n    int p_dimxc = p_dimx * channels;\n\n    for (int c = ch_off; c < channels; c += THREADS_PER_BLOCK) {\n      value = input[n * dimcyx + c * dimyx + y * width + x];\n      rinput[n * p_dimyxc + (y + pad_size) * p_dimxc + (x + pad_size) * channels + c] = value;\n    }\n}\n\n\ntemplate<typename 
scalar_t>\n__global__ void correlation_forward(scalar_t* __restrict__ output, const int nOutputChannels,\n                const int outputHeight, const int outputWidth, const scalar_t* __restrict__ rInput1,\n                const int nInputChannels, const int inputHeight, const int inputWidth,\n                const scalar_t* __restrict__ rInput2, const int pad_size, const int kernel_size,\n                const int max_displacement, const int stride1, const int stride2) {\n\n        int32_t pInputWidth = inputWidth + 2 * pad_size;\n        int32_t pInputHeight = inputHeight + 2 * pad_size;\n\n        int32_t kernel_rad = (kernel_size - 1) / 2;\n\n        int32_t displacement_rad = max_displacement / stride2;\n\n        int32_t displacement_size = 2 * displacement_rad + 1;\n\n        int32_t n = blockIdx.x;\n        int32_t y1 = blockIdx.y * stride1 + max_displacement;\n        int32_t x1 = blockIdx.z * stride1 + max_displacement;\n        int32_t c = threadIdx.x;\n\n        int32_t pdimyxc = pInputHeight * pInputWidth * nInputChannels;\n\n        int32_t pdimxc = pInputWidth * nInputChannels;\n\n        int32_t pdimc = nInputChannels;\n\n        int32_t tdimcyx = nOutputChannels * outputHeight * outputWidth;\n        int32_t tdimyx = outputHeight * outputWidth;\n        int32_t tdimx = outputWidth;\n\n        int32_t nelems = kernel_size * kernel_size * pdimc;\n\n        // element-wise product along channel axis\n        for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) {\n                for (int ti = -displacement_rad; ti <= displacement_rad; ++ti) {\n                        int x2 = x1 + ti * stride2;\n                        int y2 = y1 + tj * stride2;\n\n                        float acc0 = 0.0f;\n\n                        for (int j = -kernel_rad; j <= kernel_rad; ++j) {\n                                for (int i = -kernel_rad; i <= kernel_rad; ++i) {\n                                        // THREADS_PER_BLOCK\n                               
         #pragma unroll\n                                        for (int ch = c; ch < pdimc; ch += blockDim.x) {\n\n                                                int indx1 = n * pdimyxc + (y1 + j) * pdimxc\n                                                                + (x1 + i) * pdimc + ch;\n                                                int indx2 = n * pdimyxc + (y2 + j) * pdimxc\n                                                                + (x2 + i) * pdimc + ch;\n                                                acc0 += static_cast<float>(rInput1[indx1] * rInput2[indx2]);\n                                        }\n                                }\n                        }\n\n                        if (blockDim.x == warpSize) {\n                            __syncwarp();\n                            acc0 = warpReduceSum(acc0);\n                        } else {\n                            __syncthreads();\n                            acc0 = blockReduceSum(acc0);\n                        }\n\n                        if (threadIdx.x == 0) {\n\n                                int tc = (tj + displacement_rad) * displacement_size\n                                                + (ti + displacement_rad);\n                                const int tindx = n * tdimcyx + tc * tdimyx + blockIdx.y * tdimx\n                                                + blockIdx.z;\n                                output[tindx] = static_cast<scalar_t>(acc0 / nelems);\n                        }\n            }\n        }\n}\n\n\ntemplate <typename scalar_t>\n__global__ void correlation_backward_input1(int item, scalar_t* gradInput1, int nInputChannels, int inputHeight, int inputWidth, \n                                            const scalar_t* __restrict__ gradOutput, int nOutputChannels, int outputHeight, int outputWidth, \n                                            const scalar_t* __restrict__ rInput2, \n                                            int pad_size,\n          
                                  int kernel_size,\n                                            int max_displacement,\n                                            int stride1,\n                                            int stride2)\n  {\n    // n (batch size), c (num of channels), y (height), x (width)\n\n    int n = item; \n    int y = blockIdx.x * stride1 + pad_size;\n    int x = blockIdx.y * stride1 + pad_size;\n    int c = blockIdx.z;\n    int tch_off = threadIdx.x;\n\n    int kernel_rad = (kernel_size - 1) / 2;\n    int displacement_rad = max_displacement / stride2;\n    int displacement_size = 2 * displacement_rad + 1;\n\n    int xmin = (x - kernel_rad - max_displacement) / stride1;\n    int ymin = (y - kernel_rad - max_displacement) / stride1;\n\n    int xmax = (x + kernel_rad - max_displacement) / stride1;\n    int ymax = (y + kernel_rad - max_displacement) / stride1;\n\n    if (xmax < 0 || ymax < 0 || xmin >= outputWidth || ymin >= outputHeight) {\n        // assumes gradInput1 is pre-allocated and zero filled\n      return;\n    }\n\n    if (xmin > xmax || ymin > ymax) {\n        // assumes gradInput1 is pre-allocated and zero filled\n        return;\n    }\n\n    xmin = max(0,xmin);\n    xmax = min(outputWidth-1,xmax);\n\n    ymin = max(0,ymin);\n    ymax = min(outputHeight-1,ymax);\n\n    int pInputWidth = inputWidth + 2 * pad_size;\n    int pInputHeight = inputHeight + 2 * pad_size;\n\n    int pdimyxc = pInputHeight * pInputWidth * nInputChannels;\n    int pdimxc = pInputWidth * nInputChannels;\n    int pdimc = nInputChannels;\n\n    int tdimcyx = nOutputChannels * outputHeight * outputWidth;\n    int tdimyx = outputHeight * outputWidth;\n    int tdimx = outputWidth;\n\n    int odimcyx = nInputChannels * inputHeight* inputWidth;\n    int odimyx = inputHeight * inputWidth;\n    int odimx = inputWidth;\n\n    scalar_t nelems = kernel_size * kernel_size * nInputChannels;\n\n    __shared__ scalar_t prod_sum[THREADS_PER_BLOCK];\n    prod_sum[tch_off] = 
0;\n\n    for (int tc = tch_off; tc < nOutputChannels; tc += THREADS_PER_BLOCK) {\n\n      int i2 = (tc % displacement_size - displacement_rad) * stride2;\n      int j2 = (tc / displacement_size - displacement_rad) * stride2;\n\n      int indx2 = n * pdimyxc + (y + j2)* pdimxc + (x + i2) * pdimc + c;\n      \n      scalar_t val2 = rInput2[indx2];\n\n      for (int j = ymin; j <= ymax; ++j) {\n        for (int i = xmin; i <= xmax; ++i) {\n          int tindx = n * tdimcyx + tc * tdimyx + j * tdimx + i;\n          prod_sum[tch_off] += gradOutput[tindx] * val2;\n        }\n      }\n    }\n    __syncthreads();\n\n    if(tch_off == 0) {\n      scalar_t reduce_sum = 0;\n      for(int idx = 0; idx < THREADS_PER_BLOCK; idx++) {\n          reduce_sum += prod_sum[idx];\n      }\n      const int indx1 = n * odimcyx + c * odimyx + (y - pad_size) * odimx + (x - pad_size);\n      gradInput1[indx1] = reduce_sum / nelems;\n    }\n\n}\n\ntemplate <typename scalar_t>\n__global__ void correlation_backward_input2(int item, scalar_t*  gradInput2, int nInputChannels, int inputHeight, int inputWidth,\n                                            const scalar_t* __restrict__ gradOutput, int nOutputChannels, int outputHeight, int outputWidth,\n                                            const scalar_t* __restrict__ rInput1,\n                                            int pad_size,\n                                            int kernel_size,\n                                            int max_displacement,\n                                            int stride1,\n                                            int stride2)\n{\n    // n (batch size), c (num of channels), y (height), x (width)\n\n    int n = item;\n    int y = blockIdx.x * stride1 + pad_size;\n    int x = blockIdx.y * stride1 + pad_size;\n    int c = blockIdx.z;\n\n    int tch_off = threadIdx.x;\n\n    int kernel_rad = (kernel_size - 1) / 2;\n    int displacement_rad = max_displacement / stride2;\n    int displacement_size = 2 
* displacement_rad + 1;\n\n    int pInputWidth = inputWidth + 2 * pad_size;\n    int pInputHeight = inputHeight + 2 * pad_size;\n\n    int pdimyxc = pInputHeight * pInputWidth * nInputChannels;\n    int pdimxc = pInputWidth * nInputChannels;\n    int pdimc = nInputChannels;\n\n    int tdimcyx = nOutputChannels * outputHeight * outputWidth;\n    int tdimyx = outputHeight * outputWidth;\n    int tdimx = outputWidth;\n\n    int odimcyx = nInputChannels * inputHeight* inputWidth;\n    int odimyx = inputHeight * inputWidth;\n    int odimx = inputWidth;\n\n    scalar_t nelems = kernel_size * kernel_size * nInputChannels;\n\n    __shared__ scalar_t prod_sum[THREADS_PER_BLOCK];\n    prod_sum[tch_off] = 0;\n\n    for (int tc = tch_off; tc < nOutputChannels; tc += THREADS_PER_BLOCK) {\n      int i2 = (tc % displacement_size - displacement_rad) * stride2;\n      int j2 = (tc / displacement_size - displacement_rad) * stride2;\n\n      int xmin = (x - kernel_rad - max_displacement - i2) / stride1;\n      int ymin = (y - kernel_rad - max_displacement - j2) / stride1;\n\n      int xmax = (x + kernel_rad - max_displacement - i2) / stride1;\n      int ymax = (y + kernel_rad - max_displacement - j2) / stride1;\n\n      if (xmax < 0 || ymax < 0 || xmin >= outputWidth || ymin >= outputHeight) {\n          // assumes gradInput2 is pre-allocated and zero filled\n        continue;\n      }\n\n      if (xmin > xmax || ymin > ymax) {\n          // assumes gradInput2 is pre-allocated and zero filled\n          continue;\n      }\n\n      xmin = max(0,xmin);\n      xmax = min(outputWidth-1,xmax);\n\n      ymin = max(0,ymin);\n      ymax = min(outputHeight-1,ymax);\n      \n      int indx1 = n * pdimyxc + (y - j2)* pdimxc + (x - i2) * pdimc + c;\n      scalar_t val1 = rInput1[indx1];\n\n      for (int j = ymin; j <= ymax; ++j) {\n        for (int i = xmin; i <= xmax; ++i) {\n          int tindx = n * tdimcyx + tc * tdimyx + j * tdimx + i;\n          prod_sum[tch_off] += gradOutput[tindx] * 
val1;\n        }\n      }\n    }\n\n    __syncthreads();\n\n    if(tch_off == 0) {\n      scalar_t reduce_sum = 0;\n      for(int idx = 0; idx < THREADS_PER_BLOCK; idx++) {\n          reduce_sum += prod_sum[idx];\n      }\n      const int indx2 = n * odimcyx + c * odimyx + (y - pad_size) * odimx + (x - pad_size);\n      gradInput2[indx2] = reduce_sum / nelems;\n    }\n\n}\n\nint correlation_forward_cuda_kernel(at::Tensor& output,\n                                    int ob,\n                                    int oc,\n                                    int oh,\n                                    int ow,\n                                    int osb,\n                                    int osc,\n                                    int osh,\n                                    int osw,\n\n                                    at::Tensor& input1,\n                                    int ic,\n                                    int ih,\n                                    int iw,\n                                    int isb,\n                                    int isc,\n                                    int ish,\n                                    int isw,\n\n                                    at::Tensor& input2,\n                                    int gc,\n                                    int gsb,\n                                    int gsc,\n                                    int gsh,\n                                    int gsw,\n\n                                    at::Tensor& rInput1,\n                                    at::Tensor& rInput2,\n                                    int pad_size,\n                                    int kernel_size,\n                                    int max_displacement,\n                                    int stride1,\n                                    int stride2,\n                                    int corr_type_multiply,\n                                    cudaStream_t stream) \n{\n\n   int batchSize = ob;\n\n  
 int nInputChannels = ic;\n   int inputWidth = iw;\n   int inputHeight = ih;\n\n   int nOutputChannels = oc;\n   int outputWidth = ow;\n   int outputHeight = oh;\n\n   dim3 blocks_grid(batchSize, inputHeight, inputWidth);\n   dim3 threads_block(THREADS_PER_BLOCK);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), \"channels_first_fwd_1\", ([&] {\n\n  channels_first<scalar_t><<<blocks_grid,threads_block, 0, stream>>>(\n      input1.data<scalar_t>(), rInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth, pad_size);\n\n  }));\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), \"channels_first_fwd_2\", ([&] {\n\n  channels_first<scalar_t><<<blocks_grid,threads_block, 0, stream>>> (\n      input2.data<scalar_t>(), rInput2.data<scalar_t>(), nInputChannels, inputHeight, inputWidth, pad_size);\n\n  }));\n\n   dim3 threadsPerBlock(THREADS_PER_BLOCK);\n   dim3 totalBlocksCorr(batchSize, outputHeight, outputWidth);\n\n  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), \"correlation_forward\", ([&] {\n\n   correlation_forward<scalar_t><<<totalBlocksCorr, threadsPerBlock, 0, stream>>> \n                        (output.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,\n                         rInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,\n                         rInput2.data<scalar_t>(),\n                         pad_size,\n                         kernel_size,\n                         max_displacement,\n                         stride1,\n                         stride2);\n\n  }));\n\n  cudaError_t err = cudaGetLastError();\n\n\n  // check for errors\n  if (err != cudaSuccess) {\n    printf(\"error in correlation_forward_cuda_kernel: %s\\n\", cudaGetErrorString(err));\n    return 0;\n  }\n\n  return 1;\n}\n\n\nint correlation_backward_cuda_kernel(\n                                    at::Tensor& gradOutput,\n                                    int gob,\n                                    int goc,\n                    
                int goh,\n                                    int gow,\n                                    int gosb,\n                                    int gosc,\n                                    int gosh,\n                                    int gosw,\n\n                                    at::Tensor& input1,\n                                    int ic,\n                                    int ih,\n                                    int iw,\n                                    int isb,\n                                    int isc,\n                                    int ish,\n                                    int isw,\n\n                                    at::Tensor& input2,\n                                    int gsb,\n                                    int gsc,\n                                    int gsh,\n                                    int gsw,\n\n                                    at::Tensor& gradInput1,\n                                    int gisb,\n                                    int gisc,\n                                    int gish,\n                                    int gisw,\n\n                                    at::Tensor& gradInput2,\n                                    int ggc,\n                                    int ggsb,\n                                    int ggsc,\n                                    int ggsh,\n                                    int ggsw,\n\n                                    at::Tensor& rInput1,\n                                    at::Tensor& rInput2,\n                                    int pad_size,\n                                    int kernel_size,\n                                    int max_displacement,\n                                    int stride1,\n                                    int stride2,\n                                    int corr_type_multiply,\n                                    cudaStream_t stream)\n{\n\n    int batchSize = gob;\n    int num = batchSize;\n\n    int 
nInputChannels = ic;\n    int inputWidth = iw;\n    int inputHeight = ih;\n\n    int nOutputChannels = goc;\n    int outputWidth = gow;\n    int outputHeight = goh;\n\n    dim3 blocks_grid(batchSize, inputHeight, inputWidth);\n    dim3 threads_block(THREADS_PER_BLOCK);\n\n\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), \"lltm_forward_cuda\", ([&] {\n\n        channels_first<scalar_t><<<blocks_grid, threads_block, 0, stream>>>(\n            input1.data<scalar_t>(),\n            rInput1.data<scalar_t>(),\n            nInputChannels,\n            inputHeight,\n            inputWidth,\n            pad_size\n        );\n    }));\n\n    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), \"lltm_forward_cuda\", ([&] {\n\n        channels_first<scalar_t><<<blocks_grid, threads_block, 0, stream>>>(\n            input2.data<scalar_t>(),\n            rInput2.data<scalar_t>(),\n            nInputChannels,\n            inputHeight,\n            inputWidth,\n            pad_size\n        );\n    }));\n\n    dim3 threadsPerBlock(THREADS_PER_BLOCK);\n    dim3 totalBlocksCorr(inputHeight, inputWidth, nInputChannels);\n\n    for (int n = 0; n < num; ++n) {\n\n      AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), \"lltm_forward_cuda\", ([&] {\n\n\n          correlation_backward_input1<scalar_t><<<totalBlocksCorr, threadsPerBlock, 0, stream>>> (\n              n, gradInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,\n              gradOutput.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,\n              rInput2.data<scalar_t>(),\n              pad_size,\n              kernel_size,\n              max_displacement,\n              stride1,\n              stride2);\n      }));\n    }\n\n    for(int n = 0; n < batchSize; n++) {\n\n      AT_DISPATCH_FLOATING_TYPES_AND_HALF(rInput1.type(), \"lltm_forward_cuda\", ([&] {\n\n        correlation_backward_input2<scalar_t><<<totalBlocksCorr, threadsPerBlock, 0, stream>>>(\n            n, 
gradInput2.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,\n            gradOutput.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,\n            rInput1.data<scalar_t>(),\n            pad_size,\n            kernel_size,\n            max_displacement,\n            stride1,\n            stride2);\n\n        }));\n    }\n\n  // check for errors\n  cudaError_t err = cudaGetLastError();\n  if (err != cudaSuccess) {\n    printf(\"error in correlation_backward_cuda_kernel: %s\\n\", cudaGetErrorString(err));\n    return 0;\n  }\n\n  return 1;\n}\n"
  },
  {
    "path": "PWCNet/correlation_package_pytorch1_0/correlation_cuda_kernel.cuh",
    "content": "#pragma once\n\n#include <ATen/ATen.h>\n#include <ATen/Context.h>\n#include <cuda_runtime.h>\n\nint correlation_forward_cuda_kernel(at::Tensor& output,\n    int ob,\n    int oc,\n    int oh,\n    int ow,\n    int osb,\n    int osc,\n    int osh,\n    int osw,\n\n    at::Tensor& input1,\n    int ic,\n    int ih,\n    int iw,\n    int isb,\n    int isc,\n    int ish,\n    int isw,\n\n    at::Tensor& input2,\n    int gc,\n    int gsb,\n    int gsc,\n    int gsh,\n    int gsw,\n\n    at::Tensor& rInput1,\n    at::Tensor& rInput2,\n    int pad_size,\n    int kernel_size,\n    int max_displacement,\n    int stride1,\n    int stride2,\n    int corr_type_multiply,\n    cudaStream_t stream);\n\n\nint correlation_backward_cuda_kernel(   \n    at::Tensor& gradOutput,\n    int gob,\n    int goc,\n    int goh,\n    int gow,\n    int gosb,\n    int gosc,\n    int gosh,\n    int gosw,\n\n    at::Tensor& input1,\n    int ic,\n    int ih,\n    int iw,\n    int isb,\n    int isc,\n    int ish,\n    int isw,\n\n    at::Tensor& input2,\n    int gsb,\n    int gsc,\n    int gsh,\n    int gsw,\n\n    at::Tensor& gradInput1, \n    int gisb,\n    int gisc,\n    int gish,\n    int gisw,\n\n    at::Tensor& gradInput2,\n    int ggc,\n    int ggsb,\n    int ggsc,\n    int ggsh,\n    int ggsw,\n\n    at::Tensor& rInput1,\n    at::Tensor& rInput2,\n    int pad_size,\n    int kernel_size,\n    int max_displacement,\n    int stride1,\n    int stride2,\n    int corr_type_multiply,\n    cudaStream_t stream);\n"
  },
  {
    "path": "PWCNet/correlation_package_pytorch1_0/setup.py",
    "content": "#!/usr/bin/env python3\nimport os\nimport torch\n\nfrom setuptools import setup, find_packages\nfrom torch.utils.cpp_extension import BuildExtension, CUDAExtension\n\nfrom compiler_args import nvcc_args, cxx_args\n\nsetup(\n    name='correlation_cuda',\n    ext_modules=[\n        CUDAExtension('correlation_cuda', [\n            'correlation_cuda.cc',\n            'correlation_cuda_kernel.cu'\n        ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args})\n    ],\n    cmdclass={\n        'build_ext': BuildExtension\n    })\n"
  },
  {
    "path": "PWCNet/models/PWCNet.py",
    "content": "\"\"\"\nimplementation of the PWC-DC network for optical flow estimation by Sun et al., 2018\n\nJinwei Gu and Zhile Ren\n\n\"\"\"\n\nimport torch\nimport torch.nn as nn\nfrom torch.autograd import Variable\nimport os\nos.environ['PYTHON_EGG_CACHE'] = 'tmp/' # a writable directory \nfrom correlation_package.modules.corr import Correlation \nimport numpy as np\n\n\n\n\n\n__all__ = [\n    'pwc_dc_net', 'pwc_dc_net_old'\n    ]\n\ndef conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):   \n    return nn.Sequential(\n            nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, \n                        padding=padding, dilation=dilation, bias=True),\n            nn.LeakyReLU(0.1))\n\ndef predict_flow(in_planes):\n    return nn.Conv2d(in_planes,2,kernel_size=3,stride=1,padding=1,bias=True)\n\ndef deconv(in_planes, out_planes, kernel_size=4, stride=2, padding=1):\n    return nn.ConvTranspose2d(in_planes, out_planes, kernel_size, stride, padding, bias=True)\n\n\n\nclass PWCDCNet(nn.Module):\n    \"\"\"\n    PWC-DC net. add dilation convolution and densenet connections\n\n    \"\"\"\n    def __init__(self, md=4):\n        \"\"\"\n        input: md --- maximum displacement (for correlation. 
default: 4), after warping\n\n        \"\"\"\n        super(PWCDCNet,self).__init__()\n\n        self.conv1a  = conv(3,   16, kernel_size=3, stride=2)\n        self.conv1aa = conv(16,  16, kernel_size=3, stride=1)\n        self.conv1b  = conv(16,  16, kernel_size=3, stride=1)\n        self.conv2a  = conv(16,  32, kernel_size=3, stride=2)\n        self.conv2aa = conv(32,  32, kernel_size=3, stride=1)\n        self.conv2b  = conv(32,  32, kernel_size=3, stride=1)\n        self.conv3a  = conv(32,  64, kernel_size=3, stride=2)\n        self.conv3aa = conv(64,  64, kernel_size=3, stride=1)\n        self.conv3b  = conv(64,  64, kernel_size=3, stride=1)\n        self.conv4a  = conv(64,  96, kernel_size=3, stride=2)\n        self.conv4aa = conv(96,  96, kernel_size=3, stride=1)\n        self.conv4b  = conv(96,  96, kernel_size=3, stride=1)\n        self.conv5a  = conv(96, 128, kernel_size=3, stride=2)\n        self.conv5aa = conv(128,128, kernel_size=3, stride=1)\n        self.conv5b  = conv(128,128, kernel_size=3, stride=1)\n        self.conv6aa = conv(128,196, kernel_size=3, stride=2)\n        self.conv6a  = conv(196,196, kernel_size=3, stride=1)\n        self.conv6b  = conv(196,196, kernel_size=3, stride=1)\n\n        self.corr    = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)\n        self.leakyRELU = nn.LeakyReLU(0.1)\n        \n        nd = (2*md+1)**2\n        dd = np.cumsum([128,128,96,64,32],dtype=np.int32).astype(np.int)\n        dd = [int(d) for d in dd]\n\n        od = nd\n        self.conv6_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv6_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv6_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv6_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv6_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)        \n        self.predict_flow6 = predict_flow(od+dd[4])\n        self.deconv6 = deconv(2, 2, 
kernel_size=4, stride=2, padding=1) \n        self.upfeat6 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+128+4\n        self.conv5_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv5_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv5_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv5_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv5_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow5 = predict_flow(od+dd[4]) \n        self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+96+4\n        self.conv4_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv4_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv4_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv4_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow4 = predict_flow(od+dd[4]) \n        self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+64+4\n        self.conv3_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv3_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv3_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv3_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow3 = predict_flow(od+dd[4]) \n        self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+32+4\n        self.conv2_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv2_1 = 
conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv2_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv2_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv2_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow2 = predict_flow(od+dd[4]) \n        self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        \n        self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1,  dilation=1)\n        self.dc_conv2 = conv(128,      128, kernel_size=3, stride=1, padding=2,  dilation=2)\n        self.dc_conv3 = conv(128,      128, kernel_size=3, stride=1, padding=4,  dilation=4)\n        self.dc_conv4 = conv(128,      96,  kernel_size=3, stride=1, padding=8,  dilation=8)\n        self.dc_conv5 = conv(96,       64,  kernel_size=3, stride=1, padding=16, dilation=16)\n        self.dc_conv6 = conv(64,       32,  kernel_size=3, stride=1, padding=1,  dilation=1)\n        self.dc_conv7 = predict_flow(32)\n\n        for m in self.modules():\n            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):\n                nn.init.kaiming_normal(m.weight.data, mode='fan_in')\n                if m.bias is not None:\n                    m.bias.data.zero_()\n\n\n    def warp(self, x, flo):\n        \"\"\"\n        warp an image/tensor (im2) back to im1, according to the optical flow\n\n        x: [B, C, H, W] (im2)\n        flo: [B, 2, H, W] flow\n\n        \"\"\"\n        B, C, H, W = x.size()\n        # mesh grid \n        xx = torch.arange(0, W).view(1,-1).repeat(H,1)\n        yy = torch.arange(0, H).view(-1,1).repeat(1,W)\n        xx = xx.view(1,1,H,W).repeat(B,1,1,1)\n        yy = yy.view(1,1,H,W).repeat(B,1,1,1)\n        grid = torch.cat((xx,yy),1).float()\n\n        if x.is_cuda:\n            grid = grid.cuda()\n        vgrid = Variable(grid) + flo\n\n        # scale grid to [-1,1] \n        vgrid[:,0,:,:] = 2.0*vgrid[:,0,:,:]/max(W-1,1)-1.0\n        vgrid[:,1,:,:] = 
2.0*vgrid[:,1,:,:]/max(H-1,1)-1.0\n\n        vgrid = vgrid.permute(0,2,3,1)        \n        output = nn.functional.grid_sample(x, vgrid)\n        mask = torch.autograd.Variable(torch.ones(x.size())).cuda()\n        mask = nn.functional.grid_sample(mask, vgrid)\n\n        # if W==128:\n            # np.save('mask.npy', mask.cpu().data.numpy())\n            # np.save('warp.npy', output.cpu().data.numpy())\n        \n        mask[mask<0.9999] = 0\n        mask[mask>0] = 1\n        \n        return output*mask\n\n\n    def forward(self,x):\n        im1 = x[:,:3,:,:]\n        im2 = x[:,3:,:,:]\n        \n        c11 = self.conv1b(self.conv1aa(self.conv1a(im1)))\n        c21 = self.conv1b(self.conv1aa(self.conv1a(im2)))\n        c12 = self.conv2b(self.conv2aa(self.conv2a(c11)))\n        c22 = self.conv2b(self.conv2aa(self.conv2a(c21)))\n        c13 = self.conv3b(self.conv3aa(self.conv3a(c12)))\n        c23 = self.conv3b(self.conv3aa(self.conv3a(c22)))\n        c14 = self.conv4b(self.conv4aa(self.conv4a(c13)))\n        c24 = self.conv4b(self.conv4aa(self.conv4a(c23)))\n        c15 = self.conv5b(self.conv5aa(self.conv5a(c14)))\n        c25 = self.conv5b(self.conv5aa(self.conv5a(c24)))\n        c16 = self.conv6b(self.conv6a(self.conv6aa(c15)))\n        c26 = self.conv6b(self.conv6a(self.conv6aa(c25)))\n\n\n        corr6 = self.corr(c16, c26) \n        corr6 = self.leakyRELU(corr6)   \n\n\n        x = torch.cat((self.conv6_0(corr6), corr6),1)\n        x = torch.cat((self.conv6_1(x), x),1)\n        x = torch.cat((self.conv6_2(x), x),1)\n        x = torch.cat((self.conv6_3(x), x),1)\n        x = torch.cat((self.conv6_4(x), x),1)\n        flow6 = self.predict_flow6(x)\n        up_flow6 = self.deconv6(flow6)\n        up_feat6 = self.upfeat6(x)\n\n        \n        warp5 = self.warp(c25, up_flow6*0.625)\n        corr5 = self.corr(c15, warp5) \n        corr5 = self.leakyRELU(corr5)\n        x = torch.cat((corr5, c15, up_flow6, up_feat6), 1)\n        x = 
torch.cat((self.conv5_0(x), x),1)\n        x = torch.cat((self.conv5_1(x), x),1)\n        x = torch.cat((self.conv5_2(x), x),1)\n        x = torch.cat((self.conv5_3(x), x),1)\n        x = torch.cat((self.conv5_4(x), x),1)\n        flow5 = self.predict_flow5(x)\n        up_flow5 = self.deconv5(flow5)\n        up_feat5 = self.upfeat5(x)\n\n       \n        warp4 = self.warp(c24, up_flow5*1.25)\n        corr4 = self.corr(c14, warp4)  \n        corr4 = self.leakyRELU(corr4)\n        x = torch.cat((corr4, c14, up_flow5, up_feat5), 1)\n        x = torch.cat((self.conv4_0(x), x),1)\n        x = torch.cat((self.conv4_1(x), x),1)\n        x = torch.cat((self.conv4_2(x), x),1)\n        x = torch.cat((self.conv4_3(x), x),1)\n        x = torch.cat((self.conv4_4(x), x),1)\n        flow4 = self.predict_flow4(x)\n        up_flow4 = self.deconv4(flow4)\n        up_feat4 = self.upfeat4(x)\n\n\n        warp3 = self.warp(c23, up_flow4*2.5)\n        corr3 = self.corr(c13, warp3) \n        corr3 = self.leakyRELU(corr3)\n        \n\n        x = torch.cat((corr3, c13, up_flow4, up_feat4), 1)\n        x = torch.cat((self.conv3_0(x), x),1)\n        x = torch.cat((self.conv3_1(x), x),1)\n        x = torch.cat((self.conv3_2(x), x),1)\n        x = torch.cat((self.conv3_3(x), x),1)\n        x = torch.cat((self.conv3_4(x), x),1)\n        flow3 = self.predict_flow3(x)\n        up_flow3 = self.deconv3(flow3)\n        up_feat3 = self.upfeat3(x)\n\n\n        warp2 = self.warp(c22, up_flow3*5.0) \n        corr2 = self.corr(c12, warp2)\n        corr2 = self.leakyRELU(corr2)\n        x = torch.cat((corr2, c12, up_flow3, up_feat3), 1)\n        x = torch.cat((self.conv2_0(x), x),1)\n        x = torch.cat((self.conv2_1(x), x),1)\n        x = torch.cat((self.conv2_2(x), x),1)\n        x = torch.cat((self.conv2_3(x), x),1)\n        x = torch.cat((self.conv2_4(x), x),1)\n        flow2 = self.predict_flow2(x)\n \n        x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))\n        flow2 += 
self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))\n        \n        if self.training:\n            return flow2,flow3,flow4,flow5,flow6\n        else:\n            return flow2\n\n\n\nclass PWCDCNet_old(nn.Module):\n    \"\"\"\n    PWC-DC net. add dilation convolution and densenet connections\n\n    \"\"\"\n    def __init__(self, md=4):\n        \"\"\"\n        input: md --- maximum displacement (for correlation. default: 4), after warpping\n\n        \"\"\"\n        super(PWCDCNet_old,self).__init__()\n\n        self.conv1a  = conv(3,   16, kernel_size=3, stride=2)\n        self.conv1b  = conv(16,  16, kernel_size=3, stride=1)\n        self.conv2a  = conv(16,  32, kernel_size=3, stride=2)\n        self.conv2b  = conv(32,  32, kernel_size=3, stride=1)\n        self.conv3a  = conv(32,  64, kernel_size=3, stride=2)\n        self.conv3b  = conv(64,  64, kernel_size=3, stride=1)\n        self.conv4a  = conv(64,  96, kernel_size=3, stride=2)\n        self.conv4b  = conv(96,  96, kernel_size=3, stride=1)\n        self.conv5a  = conv(96, 128, kernel_size=3, stride=2)\n        self.conv5b  = conv(128,128, kernel_size=3, stride=1)\n        self.conv6a  = conv(128,196, kernel_size=3, stride=2)\n        self.conv6b  = conv(196,196, kernel_size=3, stride=1)\n\n        self.corr    = Correlation(pad_size=md, kernel_size=1, max_displacement=md, stride1=1, stride2=1, corr_multiply=1)\n        self.leakyRELU = nn.LeakyReLU(0.1)\n        \n        nd = (2*md+1)**2\n        dd = np.cumsum([128,128,96,64,32])\n\n        od = nd\n        self.conv6_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv6_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv6_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv6_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv6_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)        \n        self.predict_flow6 = predict_flow(od+dd[4])\n        self.deconv6 = deconv(2, 2, kernel_size=4, stride=2, 
padding=1) \n        self.upfeat6 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+128+4\n        self.conv5_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv5_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv5_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv5_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv5_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow5 = predict_flow(od+dd[4]) \n        self.deconv5 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        self.upfeat5 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+96+4\n        self.conv4_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv4_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv4_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv4_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv4_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow4 = predict_flow(od+dd[4]) \n        self.deconv4 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        self.upfeat4 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+64+4\n        self.conv3_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv3_1 = conv(od+dd[0],128, kernel_size=3, stride=1)\n        self.conv3_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv3_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv3_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow3 = predict_flow(od+dd[4]) \n        self.deconv3 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        self.upfeat3 = deconv(od+dd[4], 2, kernel_size=4, stride=2, padding=1) \n        \n        od = nd+32+4\n        self.conv2_0 = conv(od,      128, kernel_size=3, stride=1)\n        self.conv2_1 = conv(od+dd[0],128, kernel_size=3, 
stride=1)\n        self.conv2_2 = conv(od+dd[1],96,  kernel_size=3, stride=1)\n        self.conv2_3 = conv(od+dd[2],64,  kernel_size=3, stride=1)\n        self.conv2_4 = conv(od+dd[3],32,  kernel_size=3, stride=1)\n        self.predict_flow2 = predict_flow(od+dd[4]) \n        self.deconv2 = deconv(2, 2, kernel_size=4, stride=2, padding=1) \n        \n        self.dc_conv1 = conv(od+dd[4], 128, kernel_size=3, stride=1, padding=1,  dilation=1)\n        self.dc_conv2 = conv(128,      128, kernel_size=3, stride=1, padding=2,  dilation=2)\n        self.dc_conv3 = conv(128,      128, kernel_size=3, stride=1, padding=4,  dilation=4)\n        self.dc_conv4 = conv(128,      96,  kernel_size=3, stride=1, padding=8,  dilation=8)\n        self.dc_conv5 = conv(96,       64,  kernel_size=3, stride=1, padding=16, dilation=16)\n        self.dc_conv6 = conv(64,       32,  kernel_size=3, stride=1, padding=1,  dilation=1)\n        self.dc_conv7 = predict_flow(32)\n\n        for m in self.modules():\n            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):\n                nn.init.kaiming_normal(m.weight.data, mode='fan_in')\n                if m.bias is not None:\n                    m.bias.data.zero_()\n\n\n    def warp(self, x, flo):\n        \"\"\"\n        warp an image/tensor (im2) back to im1, according to the optical flow\n\n        x: [B, C, H, W] (im2)\n        flo: [B, 2, H, W] flow\n\n        \"\"\"\n        B, C, H, W = x.size()\n        # mesh grid \n        xx = torch.arange(0, W).view(1,-1).repeat(H,1)\n        yy = torch.arange(0, H).view(-1,1).repeat(1,W)\n        xx = xx.view(1,1,H,W).repeat(B,1,1,1)\n        yy = yy.view(1,1,H,W).repeat(B,1,1,1)\n        grid = torch.cat((xx,yy),1).float()\n\n        if x.is_cuda:\n            grid = grid.cuda()\n        vgrid = Variable(grid) + flo\n\n        # scale grid to [-1,1] \n        vgrid[:,0,:,:] = 2.0*vgrid[:,0,:,:]/max(W-1,1)-1.0\n        vgrid[:,1,:,:] = 2.0*vgrid[:,1,:,:]/max(H-1,1)-1.0\n\n       
 vgrid = vgrid.permute(0,2,3,1)        \n        output = nn.functional.grid_sample(x, vgrid)\n        mask = torch.autograd.Variable(torch.ones(x.size())).cuda()\n        mask = nn.functional.grid_sample(mask, vgrid)\n        \n        mask[mask<0.999] = 0\n        mask[mask>0] = 1\n        \n        return output*mask\n\n\n    def forward(self,x):\n        im1 = x[:,:3,:,:]\n        im2 = x[:,3:,:,:]\n        \n        c11 = self.conv1b(self.conv1a(im1))\n        c21 = self.conv1b(self.conv1a(im2))\n        c12 = self.conv2b(self.conv2a(c11))\n        c22 = self.conv2b(self.conv2a(c21))\n        c13 = self.conv3b(self.conv3a(c12))\n        c23 = self.conv3b(self.conv3a(c22))\n        c14 = self.conv4b(self.conv4a(c13))\n        c24 = self.conv4b(self.conv4a(c23))        \n        c15 = self.conv5b(self.conv5a(c14))\n        c25 = self.conv5b(self.conv5a(c24))\n        c16 = self.conv6b(self.conv6a(c15))\n        c26 = self.conv6b(self.conv6a(c25))\n        \n        corr6 = self.corr(c16, c26) \n        corr6 = self.leakyRELU(corr6)        \n        x = torch.cat((corr6, self.conv6_0(corr6)),1)\n        x = torch.cat((self.conv6_1(x), x),1)\n        x = torch.cat((x, self.conv6_2(x)),1)\n        x = torch.cat((x, self.conv6_3(x)),1)\n        x = torch.cat((x, self.conv6_4(x)),1)\n        flow6 = self.predict_flow6(x)\n        up_flow6 = self.deconv6(flow6)\n        up_feat6 = self.upfeat6(x)\n        \n        warp5 = self.warp(c25, up_flow6*0.625)\n        corr5 = self.corr(c15, warp5) \n        corr5 = self.leakyRELU(corr5)\n        x = torch.cat((corr5, c15, up_flow6, up_feat6), 1)\n        x = torch.cat((x, self.conv5_0(x)),1)\n        x = torch.cat((self.conv5_1(x), x),1)\n        x = torch.cat((x, self.conv5_2(x)),1)\n        x = torch.cat((x, self.conv5_3(x)),1)\n        x = torch.cat((x, self.conv5_4(x)),1)\n        flow5 = self.predict_flow5(x)\n        up_flow5 = self.deconv5(flow5)\n        up_feat5 = self.upfeat5(x)\n        \n        warp4 = 
self.warp(c24, up_flow5*1.25)\n        corr4 = self.corr(c14, warp4)  \n        corr4 = self.leakyRELU(corr4)\n        x = torch.cat((corr4, c14, up_flow5, up_feat5), 1)\n        x = torch.cat((x, self.conv4_0(x)),1)\n        x = torch.cat((self.conv4_1(x), x),1)\n        x = torch.cat((x, self.conv4_2(x)),1)\n        x = torch.cat((x, self.conv4_3(x)),1)\n        x = torch.cat((x, self.conv4_4(x)),1)\n        flow4 = self.predict_flow4(x)\n        up_flow4 = self.deconv4(flow4)\n        up_feat4 = self.upfeat4(x)\n\n        warp3 = self.warp(c23, up_flow4*2.5)\n        corr3 = self.corr(c13, warp3) \n        corr3 = self.leakyRELU(corr3)\n        x = torch.cat((corr3, c13, up_flow4, up_feat4), 1)\n        x = torch.cat((x, self.conv3_0(x)),1)\n        x = torch.cat((self.conv3_1(x), x),1)\n        x = torch.cat((x, self.conv3_2(x)),1)\n        x = torch.cat((x, self.conv3_3(x)),1)\n        x = torch.cat((x, self.conv3_4(x)),1)\n        flow3 = self.predict_flow3(x)\n        up_flow3 = self.deconv3(flow3)\n        up_feat3 = self.upfeat3(x)\n        \n        warp2 = self.warp(c22, up_flow3*5.0) \n        corr2 = self.corr(c12, warp2)\n        corr2 = self.leakyRELU(corr2)\n        x = torch.cat((corr2, c12, up_flow3, up_feat3), 1)\n        x = torch.cat((x, self.conv2_0(x)),1)\n        x = torch.cat((self.conv2_1(x), x),1)\n        x = torch.cat((x, self.conv2_2(x)),1)\n        x = torch.cat((x, self.conv2_3(x)),1)\n        x = torch.cat((x, self.conv2_4(x)),1)\n        flow2 = self.predict_flow2(x)\n \n        x = self.dc_conv4(self.dc_conv3(self.dc_conv2(self.dc_conv1(x))))\n        flow2 += self.dc_conv7(self.dc_conv6(self.dc_conv5(x)))\n        \n        if self.training:\n            return flow2,flow3,flow4,flow5,flow6\n        else:\n            return flow2\n\n\n\n\n\ndef pwc_dc_net(path=None):\n\n    model = PWCDCNet()\n    if path is not None:\n        data = torch.load(path)\n        if 'state_dict' in data.keys():\n            
model.load_state_dict(data['state_dict'])\n        else:\n            model.load_state_dict(data)\n    return model\n\n\n\n\ndef pwc_dc_net_old(path=None):\n\n    model = PWCDCNet_old()\n    if path is not None:\n        data = torch.load(path)\n        if 'state_dict' in data.keys():\n            model.load_state_dict(data['state_dict'])\n        else:\n            model.load_state_dict(data)\n    return model\n"
  },
  {
    "path": "PWCNet/models/__init__.py",
    "content": "from .PWCNet import *\n"
  },
  {
    "path": "README.md",
    "content": "# DAIN (Depth-Aware Video Frame Interpolation)\n[Project](https://sites.google.com/view/wenbobao/dain) **|** [Paper](http://arxiv.org/abs/1904.00830)\n\n[Wenbo Bao](https://sites.google.com/view/wenbobao/home),\n[Wei-Sheng Lai](http://graduatestudents.ucmerced.edu/wlai24/), \n[Chao Ma](https://sites.google.com/site/chaoma99/),\nXiaoyun Zhang, \nZhiyong Gao, \nand [Ming-Hsuan Yang](http://faculty.ucmerced.edu/mhyang/)\n\nIEEE Conference on Computer Vision and Pattern Recognition, Long Beach, CVPR 2019\n\nThis work is developed based on our TPAMI work [MEMC-Net](https://github.com/baowenbo/MEMC-Net), where we propose the adaptive warping layer. Please also consider referring to it.\n\n### Table of Contents\n1. [Introduction](#introduction)\n1. [Citation](#citation)\n1. [Requirements and Dependencies](#requirements-and-dependencies)\n1. [Installation](#installation)\n1. [Testing Pre-trained Models](#testing-pre-trained-models)\n1. [Downloading Results](#downloading-results)\n1. [Slow-motion Generation](#slow-motion-generation)\n1. [Training New Models](#training-new-models)\n1. [Google Colab Demo](#google-colab-demo)\n\n### Introduction\nWe propose the **D**epth-**A**ware video frame **IN**terpolation (**DAIN**) model to explicitly detect the occlusion by exploring the depth cue.\nWe develop a depth-aware flow projection layer to synthesize intermediate flows that preferably sample closer objects than farther ones.\nOur method achieves state-of-the-art performance on the Middlebury dataset. 
\nWe provide videos [here](https://www.youtube.com/watch?v=-f8f0igQi5I&t=5s).\n\n<!--![teaser](http://vllab.ucmerced.edu/wlai24/LapSRN/images/emma_text.gif)-->\n\n<!--[![teaser](https://img.youtube.com/vi/icJ0WbPsE20/0.jpg)](https://www.youtube.com/watch?v=icJ0WbPsE20&feature=youtu.be)\n<!--<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/icJ0WbPsE20\" frameborder=\"0\" allow=\"accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen></iframe>\n![teaser](http://vllab1.ucmerced.edu/~wenbobao/DAIN/kart-turn_compare.gif)\n\n\n<!--哈哈我是注释，不会在浏览器中显示。\nBeanbags\nhttps://drive.google.com/open?id=170vdxANGoNKO5_8MYOuiDvoIXzucv7HW\nDimentrodon\nhttps://drive.google.com/open?id=14n7xvb9hjTKqfcr7ZpEFyfMvx6E8NhD_\nDogDance\nhttps://drive.google.com/open?id=1YWAyAJ3T48fMFv2K8j8wIVcmQm39cRof\nGrove2\nhttps://drive.google.com/open?id=1sJLwdQdL6JYXSQo_Bev0aQMleWacxCsN\nGrove3\nhttps://drive.google.com/open?id=1jGj3UdGppoJO02Of8ZaNXqDH4fnXuQ8O\nHydrangea\nhttps://drive.google.com/open?id=1_4kVlhvrmCv54aXi7vZMk3-FtRQF7s0s\nMiniCooper\nhttps://drive.google.com/open?id=1pWHtyBSZsOTC7NTVdHTrv1W-dxa95BLo\nRubberWhale\nhttps://drive.google.com/open?id=1korbXsGpSgJn7THBHkLRVrJMtCt5YZPB\nUrban2\nhttps://drive.google.com/open?id=1v57RMm9x5vM36mCgPy5hresXDZWtw3Vs\nUrban3\nhttps://drive.google.com/open?id=1LMwSU0PrG4_GaDjWRI2v9hvWpYwzRKca\nVenus\nhttps://drive.google.com/open?id=1piPnEexuHaiAr4ZzWSAxGi1u1Xo_6vPp\nWalking\nhttps://drive.google.com/open?id=1CgCLmVC_WTVTAcA_IdWbLqR8MS18zHoa\n-->\n\n<p float=\"middle\">\n<img src=\"https://drive.google.com/uc?export=view&id=1YWAyAJ3T48fMFv2K8j8wIVcmQm39cRof\" width=\"200\"/>\n<img src=\"https://drive.google.com/uc?export=view&id=1CgCLmVC_WTVTAcA_IdWbLqR8MS18zHoa\" width=\"200\"/>\n<img src=\"https://drive.google.com/uc?export=view&id=1pWHtyBSZsOTC7NTVdHTrv1W-dxa95BLo\" width=\"200\"/>\n<img src=\"https://drive.google.com/uc?export=view&id=170vdxANGoNKO5_8MYOuiDvoIXzucv7HW\" 
width=\"200\"/>\n</p>\n\n<p float=\"middle\">\n<img src=\"https://drive.google.com/uc?export=view&id=1sJLwdQdL6JYXSQo_Bev0aQMleWacxCsN\" width=\"200\"/>\n<img src=\"https://drive.google.com/uc?export=view&id=1jGj3UdGppoJO02Of8ZaNXqDH4fnXuQ8O\" width=\"200\"/>\n<img src=\"https://drive.google.com/uc?export=view&id=1v57RMm9x5vM36mCgPy5hresXDZWtw3Vs\" width=\"200\"/>\n<img src=\"https://drive.google.com/uc?export=view&id=1LMwSU0PrG4_GaDjWRI2v9hvWpYwzRKca\" width=\"200\"/>\n</p>\n\n<p float=\"middle\">\n<img src=\"https://drive.google.com/uc?export=view&id=1piPnEexuHaiAr4ZzWSAxGi1u1Xo_6vPp\" width=\"200\"/>\n<img src=\"https://drive.google.com/uc?export=view&id=1korbXsGpSgJn7THBHkLRVrJMtCt5YZPB\" width=\"200\"/>\n<img src=\"https://drive.google.com/uc?export=view&id=1_4kVlhvrmCv54aXi7vZMk3-FtRQF7s0s\" width=\"200\"/>\n<img src=\"https://drive.google.com/uc?export=view&id=14n7xvb9hjTKqfcr7ZpEFyfMvx6E8NhD_\" width=\"200\"/>\n</p>\n\n### Citation\nIf you find the code and datasets useful in your research, please cite:\n\n    @inproceedings{DAIN,\n        author    = {Bao, Wenbo and Lai, Wei-Sheng and Ma, Chao and Zhang, Xiaoyun and Gao, Zhiyong and Yang, Ming-Hsuan}, \n        title     = {Depth-Aware Video Frame Interpolation}, \n        booktitle = {IEEE Conference on Computer Vision and Pattern Recognition},\n        year      = {2019}\n    }\n    @article{MEMC-Net,\n         title={MEMC-Net: Motion Estimation and Motion Compensation Driven Neural Network for Video Interpolation and Enhancement},\n         author={Bao, Wenbo and Lai, Wei-Sheng and Zhang, Xiaoyun and Gao, Zhiyong and Yang, Ming-Hsuan},\n         journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},\n         doi={10.1109/TPAMI.2019.2941941},\n         year={2018}\n    }\n\n### Requirements and Dependencies\n- Ubuntu (We test with Ubuntu = 16.04.5 LTS)\n- Python (We test with Python = 3.6.8 in Anaconda3 = 4.1.1)\n- Cuda & Cudnn (We test with Cuda = 9.0 and Cudnn = 7.0)\n- PyTorch 
(The customized depth-aware flow projection and other layers require ATen API in PyTorch = 1.0.0)\n- GCC (Compiling PyTorch 1.0.0 extension files (.c/.cu) requires gcc = 4.9.1 and nvcc = 9.0 compilers)\n- NVIDIA GPU (We use Titan X (Pascal) with compute = 6.1, but we support compute_50/52/60/61 devices. Should you have devices with higher compute capability, please revise [this](https://github.com/baowenbo/DAIN/blob/master/my_package/DepthFlowProjection/setup.py))\n\n### Installation\nDownload repository:\n\n    $ git clone https://github.com/baowenbo/DAIN.git\n\nBefore building PyTorch extensions, be sure you have `pytorch >= 1.0.0`:\n    \n    $ python -c \"import torch; print(torch.__version__)\"\n    \nGenerate our PyTorch extensions:\n    \n    $ cd DAIN\n    $ cd my_package \n    $ ./build.sh\n\nGenerate the Correlation package required by [PWCNet](https://github.com/NVlabs/PWC-Net/tree/master/PyTorch/external_packages/correlation-pytorch-master):\n    \n    $ cd ../PWCNet/correlation_package_pytorch1_0\n    $ ./build.sh\n\n\n### Testing Pre-trained Models\nMake model weights dir and Middlebury dataset dir:\n\n    $ cd DAIN\n    $ mkdir model_weights\n    $ mkdir MiddleBurySet\n    \nDownload pretrained models, \n\n    $ cd model_weights\n    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/best.pth\n    \nand Middlebury dataset:\n    \n    $ cd ../MiddleBurySet\n    $ wget http://vision.middlebury.edu/flow/data/comp/zip/other-color-allframes.zip\n    $ unzip other-color-allframes.zip\n    $ wget http://vision.middlebury.edu/flow/data/comp/zip/other-gt-interp.zip\n    $ unzip other-gt-interp.zip\n    $ cd ..\n\nBuild the PyTorch extensions (if you have not already done so during installation):\n\n    $ cd PWCNet/correlation_package_pytorch1_0\n    $ sh build.sh\n    $ cd ../../my_package\n    $ sh build.sh\n    $ cd ..\n\nNow we are good to go:\n\n    $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury.py\n\nThe interpolated results are under `MiddleBurySet/other-result-author/[random number]/`, where the `random number` is used to 
distinguish different runs. \n\n### Downloading Results\nOur DAIN model achieves state-of-the-art performance on the UCF101, Vimeo90K, and Middlebury ([*eval*](http://vision.middlebury.edu/flow/eval/results/results-n1.php) and *other*) datasets.\nDownload our interpolated results with:\n    \n    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/UCF101_DAIN.zip\n    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/Vimeo90K_interp_DAIN.zip\n    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/Middlebury_eval_DAIN.zip\n    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/Middlebury_other_DAIN.zip\n    \n    \n### Slow-motion Generation\nOur model is fully capable of generating slow-motion effects with minor modifications to the network architecture.\nRun the following code by specifying `time_step = 0.25` to generate a x4 slow-motion effect:\n\n    $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.25\n\nor set `time_step` to `0.125` or `0.1` as follows:\n\n    $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.125\n    $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.1\n\nto generate x8 and x10 slow-motion, respectively. Or, if you would like x100 slow-motion for a little fun:\n    \n    $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury_slowmotion.py --netName DAIN_slowmotion --time_step 0.01\n\nYou may also want to create gif animations by:\n    \n    $ cd MiddleBurySet/other-result-author/[random number]/Beanbags\n    $ convert -delay 1 *.png -loop 0 Beanbags.gif  # 1*10ms delay\n\nHave fun and enjoy yourself! 
\n\n\n### Training New Models\nDownload the Vimeo90K triplet dataset for the video frame interpolation task (see also [here](https://github.com/anchen1011/toflow/blob/master/download_dataset.sh) by [Xue et al., IJCV19](https://arxiv.org/abs/1711.09078)).\n    \n    $ cd DAIN\n    $ mkdir /path/to/your/dataset && cd /path/to/your/dataset \n    $ wget http://data.csail.mit.edu/tofu/dataset/vimeo_triplet.zip\n    $ unzip vimeo_triplet.zip\n    $ rm vimeo_triplet.zip\n\nDownload the pretrained MegaDepth and PWCNet models:\n    \n    $ cd MegaDepth/checkpoints/test_local\n    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/best_generalization_net_G.pth\n    $ cd ../../../PWCNet\n    $ wget http://vllab1.ucmerced.edu/~wenbobao/DAIN/pwc_net.pth.tar\n    $ cd  ..\n    \nRun the training script:\n\n    $ CUDA_VISIBLE_DEVICES=0 python train.py --datasetPath /path/to/your/dataset --batch_size 1 --save_which 1 --lr 0.0005 --rectify_lr 0.0005 --flow_lr_coe 0.01 --occ_lr_coe 0.0 --filter_lr_coe 1.0 --ctx_lr_coe 1.0 --alpha 0.0 1.0 --patience 4 --factor 0.2\n    \nThe optimized models will be saved to the `model_weights/[random number]` directory, where [random number] is generated for different runs.\n\nReplace the pre-trained `model_weights/best.pth` model with the newly trained `model_weights/[random number]/best.pth` model.\nThen test the new model by executing: \n\n    $ CUDA_VISIBLE_DEVICES=0 python demo_MiddleBury.py\n\n### Google Colab Demo\nThis is a modification of DAIN that runs on Google Colab and can do a full demo interpolation from a source video to a target video.\n\nThe original notebook file by btahir can be found [here](https://github.com/baowenbo/DAIN/issues/44).\n\nTo use the Colab, follow these steps:\n\n- Download the `Colab_DAIN.ipynb` file ([link](https://raw.githubusercontent.com/baowenbo/DAIN/master/Colab_DAIN.ipynb)).\n- Visit Google Colaboratory ([link](https://colab.research.google.com/))\n- Select the \"Upload\" option, and upload the 
`.ipynb` file\n- Start running the cells one by one, following the instructions.\n\nColab file authors: [Styler00Dollar](https://github.com/styler00dollar) and [Alpha](https://github.com/AlphaGit).\n\n### Contact\n[Wenbo Bao](mailto:bwb0813@gmail.com); [Wei-Sheng (Jason) Lai](mailto:phoenix104104@gmail.com)\n\n### License\nSee [MIT License](https://github.com/baowenbo/DAIN/blob/master/LICENSE)\n"
  },
  {
    "path": "Resblock/BasicBlock.py",
    "content": "import torch.nn as nn\nimport math\nimport torch.utils.model_zoo as model_zoo\nimport torch.nn.init as weight_init\nimport torch\n__all__ = ['MultipleBasicBlock','MultipleBasicBlock_4']\ndef conv3x3(in_planes, out_planes, dilation = 1, stride=1):\n    \"3x3 convolution with padding\"\n    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,\n                     padding=int(dilation*(3-1)/2), dilation=dilation, bias=False)\nclass BasicBlock(nn.Module):\n    expansion = 1\n\n    def __init__(self, inplanes, planes, dilation = 1, stride=1, downsample=None):\n        super(BasicBlock, self).__init__()\n        self.conv1 = conv3x3(inplanes, planes,dilation, stride)\n        # self.bn1 = nn.BatchNorm2d(planes)\n        self.relu = nn.ReLU(inplace=True)\n        self.conv2 = conv3x3(planes, planes)\n        # self.bn2 = nn.BatchNorm2d(planes)\n        self.downsample = downsample\n        self.stride = stride\n\n        for m in self.modules():\n            if isinstance(m, nn.Conv2d):\n                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels\n                m.weight.data.normal_(0, math.sqrt(2. 
/ n))\n                # weight_init.xavier_normal()\n            elif isinstance(m, nn.BatchNorm2d):\n                m.weight.data.fill_(1)\n                m.bias.data.zero_()\n\n    def forward(self, x):\n        residual = x\n\n        out = self.conv1(x)\n        # out = self.bn1(out)\n        out = self.relu(out)\n\n        out = self.conv2(out)\n        # out = self.bn2(out)\n\n        if self.downsample is not None:\n            residual = self.downsample(x)\n\n        out += residual\n        out = self.relu(out)\n\n        return out\nclass MultipleBasicBlock(nn.Module):\n\n    def __init__(self,input_feature,\n                 block, num_blocks,\n                 intermediate_feature = 64, dense = True):\n        super(MultipleBasicBlock, self).__init__()\n        self.dense = dense\n        self.num_block = num_blocks\n        self.intermediate_feature = intermediate_feature\n\n        self.block1= nn.Sequential(*[\n            nn.Conv2d(input_feature, intermediate_feature,\n                      kernel_size=7, stride=1, padding=3, bias=True),\n            nn.ReLU(inplace=True)\n        ])\n\n        # for i in range(1, num_blocks):\n        self.block2 = block(intermediate_feature, intermediate_feature, dilation = 1) if num_blocks>=2 else None\n        self.block3 = block(intermediate_feature, intermediate_feature, dilation = 1) if num_blocks>=3 else None\n        self.block4 = block(intermediate_feature, intermediate_feature, dilation = 1) if num_blocks>=4 else None\n        self.block5 = nn.Sequential(*[nn.Conv2d(intermediate_feature, 3 , (3, 3), 1, (1, 1))])\n\n        for m in self.modules():\n            if isinstance(m, nn.Conv2d):\n                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels\n                m.weight.data.normal_(0, math.sqrt(2. 
/ n))\n            elif isinstance(m, nn.BatchNorm2d):\n                m.weight.data.fill_(1)\n                m.bias.data.zero_()\n\n    def forward(self, x):\n        x = self.block1(x)\n        x = self.block2(x) if self.num_block>=2 else x\n        x = self.block3(x) if self.num_block>=3 else x\n        x = self.block4(x) if self.num_block== 4 else x\n        x = self.block5(x)\n        return x\n\ndef MultipleBasicBlock_4(input_feature,intermediate_feature = 64):\n    model = MultipleBasicBlock(input_feature,\n                               BasicBlock,4 ,\n                               intermediate_feature)\n    return model\n\n\nif __name__ == '__main__':\n\n    # x= Variable(torch.randn(2,3,224,448))\n    # model =    S2DF(BasicBlock,3,True)\n    # y = model(x)\n    model = MultipleBasicBlock(200, BasicBlock,4)\n    model = BasicBlock(64,64,1)\n    # y = model(x)\n    exit(0)"
  },
  {
    "path": "Resblock/__init__.py",
    "content": "from   .BasicBlock import *"
  },
  {
    "path": "S2D_models/S2DF.py",
    "content": "import torch.nn as nn\nimport math\nimport torch.utils.model_zoo as model_zoo\n\nimport torch\n# __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',\n           # 'resnet152','resnet18_conv1']\n__all__ = ['S2DF','S2DF_3dense','S2DF_3dense_nodilation',\n           'S2DF_3last','S2DF_2dense', 'BasicBlock']\n\nmodel_urls = {\n    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',\n    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',\n    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',\n    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',\n    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',\n}\n\n\ndef conv3x3(in_planes, out_planes, dilation = 1, stride=1):\n    \"3x3 convolution with padding\"\n    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,\n                     padding=int(dilation*(3-1)/2), dilation=dilation, bias=False)\n\n\nclass BasicBlock(nn.Module):\n    expansion = 1\n\n    def __init__(self, inplanes, planes, dilation = 1, stride=1, downsample=None):\n        super(BasicBlock, self).__init__()\n        self.conv1 = conv3x3(inplanes, planes,dilation, stride)\n        # self.bn1 = nn.BatchNorm2d(planes)\n        self.relu = nn.ReLU(inplace=True)\n        self.conv2 = conv3x3(planes, planes)\n        # self.bn2 = nn.BatchNorm2d(planes)\n        self.downsample = downsample\n        self.stride = stride\n\n    def forward(self, x):\n        residual = x\n\n        out = self.conv1(x)\n        # out = self.bn1(out)\n        out = self.relu(out)\n\n        out = self.conv2(out)\n        # out = self.bn2(out)\n\n        if self.downsample is not None:\n            residual = self.downsample(x)\n\n        out += residual\n        out = self.relu(out)\n\n        return out\n\n\nclass Bottleneck(nn.Module):\n    expansion = 4\n\n    def __init__(self, inplanes, planes, dilation = 1, 
stride=1, downsample=None):\n        super(Bottleneck, self).__init__()\n        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)\n        # self.bn1 = nn.BatchNorm2d(planes)\n        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,\n                               padding=int(dilation*(3-1)/2), dilation = dilation, bias=False)\n        # self.bn2 = nn.BatchNorm2d(planes)\n        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)\n        # self.bn3 = nn.BatchNorm2d(planes * 4)\n        self.relu = nn.ReLU(inplace=True)\n        self.downsample = downsample\n        self.stride = stride\n\n    def forward(self, x):\n        residual = x\n\n        out = self.conv1(x)\n        # out = self.bn1(out)\n        out = self.relu(out)\n\n        out = self.conv2(out)\n        # out = self.bn2(out)\n        out = self.relu(out)\n\n        out = self.conv3(out)\n        # out = self.bn3(out)\n\n        if self.downsample is not None:\n            residual = self.downsample(x)\n\n        out += residual\n        out = self.relu(out)\n\n        return out\n\n\nclass S2DF(nn.Module):\n\n    def __init__(self, block, num_blocks,dense = True,dilation=True):\n        self.inplanes = 64\n        super(S2DF, self).__init__()\n        self.dense = dense\n        self.num_block = num_blocks\n        assert(num_blocks>=1 and num_blocks<=4)\n        self.block1 = nn.Sequential(*[\n            nn.Conv2d(3, 64, kernel_size=7, stride=1, padding=3, bias=False),\n            nn.ReLU(inplace=True)\n        ])\n\n        self.dilation = dilation\n        # for i in range(1, num_blocks):\n        self.block2 = block(self.inplanes, 64, dilation = 4 if dilation else 1) if num_blocks>=2 else None\n        self.block3 = block(self.inplanes, 64, dilation = 8 if dilation else 1) if num_blocks>=3 else None\n        self.block4 = block(self.inplanes, 64, dilation = 16 if dilation else 1) if num_blocks>=4 else None\n\n        for m in 
self.modules():\n            if isinstance(m, nn.Conv2d):\n                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels\n                m.weight.data.normal_(0, math.sqrt(2. / n))\n            elif isinstance(m, nn.BatchNorm2d):\n                m.weight.data.fill_(1)\n                m.bias.data.zero_()\n\n    def forward(self, x):\n        y = []\n\n        y.append(x) #raw feature\n        x = self.block1(x)\n        if (self.num_block > 1 and self.dense) or self.num_block == 1:\n            y.append(x)\n\n        x = self.block2(x) if self.num_block>=2 else x\n        if (self.num_block > 2 and self.dense) or self.num_block == 2:\n            y.append(x)\n\n        x = self.block3(x) if self.num_block>=3 else x\n        if (self.num_block > 3 and self.dense) or self.num_block == 3:\n            y.append(x)\n\n        x = self.block4(x) if self.num_block== 4 else x\n        if self.num_block == 4 :\n            y.append(x)\n\n        return torch.cat(y,dim=1)\n\n\nclass S2DFsim(nn.Module):\n\n    def __init__(self, block, num_blocks,dense = True,dilation=True):\n        self.inplanes = 64\n        super(S2DFsim, self).__init__()\n        self.dense = dense\n        self.num_block = num_blocks\n        assert(num_blocks>=1 and num_blocks<=4)\n        self.block1 = nn.Sequential(*[\n            nn.Conv2d(3, 64, kernel_size=7, stride=1, padding=3, bias=False),\n            nn.ReLU(inplace=True),\n            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),\n        ])\n\n        self.dilation = dilation\n        # for i in range(1, num_blocks):\n        self.block2 = nn.Sequential(*[\n            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),\n            nn.ReLU(inplace=True),\n            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),\n        ]) if num_blocks >= 2 else None\n        self.block3 =  nn.Sequential(*[\n            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, 
bias=False),\n            nn.ReLU(inplace=True),\n            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),\n        ]) if num_blocks >= 3 else None\n        self.block4 = nn.Sequential(*[\n            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),\n            nn.ReLU(inplace=True),\n            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),\n        ]) if num_blocks >= 4 else None\n\n        # for m in self.modules():\n        #     if isinstance(m, nn.Conv2d):\n        #         n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels\n        #         m.weight.data.normal_(0, math.sqrt(2. / n))\n        #     elif isinstance(m, nn.BatchNorm2d):\n        #         m.weight.data.fill_(1)\n        #         m.bias.data.zero_()\n\n    def forward(self, x):\n        y = []\n\n        y.append(x) #raw feature\n        x = self.block1(x)\n        if (self.num_block > 1 and self.dense) or self.num_block == 1:\n            y.append(x)\n\n        x = self.block2(x) if self.num_block>=2 else x\n        if (self.num_block > 2 and self.dense) or self.num_block == 2:\n            y.append(x)\n\n        x = self.block3(x) if self.num_block>=3 else x\n        if (self.num_block > 3 and self.dense) or self.num_block == 3:\n            y.append(x)\n\n        x = self.block4(x) if self.num_block== 4 else x\n        if self.num_block == 4 :\n            y.append(x)\n\n        return torch.cat(y,dim=1)\ndef S2DF_3dense_nodilation():\n    model = S2DFsim(None,3,dense=True,dilation=False)\n    return model\ndef S2DF_3dense():\n    model = S2DF(BasicBlock,3,dense=True)\n    return model\ndef S2DF_3last():\n    model = S2DF(BasicBlock,3,dense=False)\n    return model\ndef S2DF_2dense():\n    model = S2DF(BasicBlock,2,dense=True)\n    return model\n\n\n\nfrom torch.autograd import Variable\n\nif __name__ == '__main__':\n\n    x= Variable(torch.randn(2,3,224,448))\n    # model =    S2DF(BasicBlock,3,True)\n    # y = 
model(x)\n\n    model = S2DF(BasicBlock,4,False)\n    y = model(x)\n    exit(0)\n"
  },
  {
    "path": "S2D_models/__init__.py",
    "content": "from .S2DF import *"
  },
  {
    "path": "Stack.py",
    "content": "\nclass Stack:\n    def __init__(self):\n        self.stack = []\n    def pop(self):\n        if self.is_empty():\n            return None\n        else:\n            return self.stack.pop()\n    def push(self,val):\n        return self.stack.append(val)\n    def peak(self):\n        if self.is_empty():\n            return None\n        else:\n            return self.stack[-1]\n    def size(self):\n        return len(self.stack)\n    def is_empty(self):\n        return self.size() == 0"
  },
  {
    "path": "balancedsampler.py",
    "content": "from torch.utils.data.sampler import Sampler\nimport torch\n\nclass RandomBalancedSampler(Sampler):\n    \"\"\"Samples elements randomly, with an arbitrary size, independant from dataset length.\n    this is a balanced sampling that will sample the whole dataset with a random permutation.\n\n    Arguments:\n        data_source (Dataset): dataset to sample from\n    \"\"\"\n\n    def __init__(self, data_source, epoch_size):\n        self.data_size = len(data_source)\n        self.epoch_size = epoch_size\n        self.index = 0\n\n    def __next__(self):\n        if self.index == 0:\n            #re-shuffle the sampler\n            self.indices = torch.randperm(self.data_size)\n        self.index = (self.index+1)%self.data_size\n        return self.indices[self.index]\n\n    def next(self):\n        return self.__next__()\n\n    def __iter__(self):\n        return self\n\n    def __len__(self):\n        return min(self.data_size,self.epoch_size) if self.epoch_size>0 else self.data_size\n\nclass SequentialBalancedSampler(Sampler):\n    \"\"\"Samples elements dequentially, with an arbitrary size, independant from dataset length.\n    this is a balanced sampling that will sample the whole dataset before resetting it.\n\n    Arguments:\n        data_source (Dataset): dataset to sample from\n    \"\"\"\n\n    def __init__(self, data_source, epoch_size):\n        self.data_size = len(data_source)\n        self.epoch_size = epoch_size\n        self.index = 0\n\n    def __next__(self):\n        self.index = (self.index+1)%self.data_size\n        return self.index\n\n    def next(self):\n        return self.__next__()\n\n    def __iter__(self):\n        return self\n\n    def __len__(self):\n        return min(self.data_size,self.epoch_size) if self.epoch_size>0 else self.data_size\n"
  },
  {
    "path": "colab_interpolate.py",
    "content": "import time\nimport os\nfrom torch.autograd import Variable\nimport torch\nimport numpy as np\nimport numpy\nimport networks\nfrom my_args import args\nfrom imageio import imread, imsave\nfrom AverageMeter import  *\nimport shutil\nimport datetime\ntorch.backends.cudnn.benchmark = True\n\nmodel = networks.__dict__[args.netName](\n                                    channel = args.channels,\n                                    filter_size = args.filter_size,\n                                    timestep = args.time_step,\n                                    training = False)\n\nif args.use_cuda:\n    model = model.cuda()\n\nmodel_path = './model_weights/best.pth'\nif not os.path.exists(model_path):\n    print(\"*****************************************************************\")\n    print(\"**** We couldn't load any trained weights ***********************\")\n    print(\"*****************************************************************\")\n    exit(1)\n\nif args.use_cuda:\n    pretrained_dict = torch.load(model_path)\nelse:\n    pretrained_dict = torch.load(model_path, map_location=lambda storage, loc: storage)\n\nmodel_dict = model.state_dict()\n# 1. filter out unnecessary keys\npretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}\n# 2. overwrite entries in the existing state dict\nmodel_dict.update(pretrained_dict)\n# 3. load the new state dict\nmodel.load_state_dict(model_dict)\n# 4. 
release the pretrained dict for saving memory\npretrained_dict = []\n\nmodel = model.eval() # deploy mode\n\nframes_dir = args.frame_input_dir\noutput_dir = args.frame_output_dir\n\ntimestep = args.time_step\ntime_offsets = [kk * timestep for kk in range(1, int(1.0 / timestep))]\n\ninput_frame = args.start_frame - 1\nloop_timer = AverageMeter()\n\nfinal_frame = args.end_frame\n\ntorch.set_grad_enabled(False)\n\n# we want to have input_frame between (start_frame-1) and (end_frame-2)\n# this is because at each step we read (frame) and (frame+1)\n# so the last iteration will actuall be (end_frame-1) and (end_frame)\nwhile input_frame < final_frame - 1:\n    input_frame += 1\n\n    start_time = time.time()\n\n    filename_frame_1 = os.path.join(frames_dir, f'{input_frame:0>5d}.png')\n    filename_frame_2 = os.path.join(frames_dir, f'{input_frame+1:0>5d}.png')\n\n    X0 = torch.from_numpy(np.transpose(imread(filename_frame_1), (2,0,1)).astype(\"float32\") / 255.0).type(args.dtype)\n    X1 = torch.from_numpy(np.transpose(imread(filename_frame_2), (2,0,1)).astype(\"float32\") / 255.0).type(args.dtype)\n\n    assert (X0.size(1) == X1.size(1))\n    assert (X0.size(2) == X1.size(2))\n\n    intWidth = X0.size(2)\n    intHeight = X0.size(1)\n    channels = X0.size(0)\n    if not channels == 3:\n        print(f\"Skipping {filename_frame_1}-{filename_frame_2} -- expected 3 color channels but found {channels}.\")\n        continue\n\n    if intWidth != ((intWidth >> 7) << 7):\n        intWidth_pad = (((intWidth >> 7) + 1) << 7)  # more than necessary\n        intPaddingLeft = int((intWidth_pad - intWidth) / 2)\n        intPaddingRight = intWidth_pad - intWidth - intPaddingLeft\n    else:\n        intPaddingLeft = 32\n        intPaddingRight= 32\n\n    if intHeight != ((intHeight >> 7) << 7):\n        intHeight_pad = (((intHeight >> 7) + 1) << 7)  # more than necessary\n        intPaddingTop = int((intHeight_pad - intHeight) / 2)\n        intPaddingBottom = intHeight_pad - 
intHeight - intPaddingTop\n    else:\n        intPaddingTop = 32\n        intPaddingBottom = 32\n\n    pader = torch.nn.ReplicationPad2d([intPaddingLeft, intPaddingRight, intPaddingTop, intPaddingBottom])\n\n    X0 = Variable(torch.unsqueeze(X0,0))\n    X1 = Variable(torch.unsqueeze(X1,0))\n    X0 = pader(X0)\n    X1 = pader(X1)\n\n    if args.use_cuda:\n        X0 = X0.cuda()\n        X1 = X1.cuda()\n\n    y_s, offset, filter = model(torch.stack((X0, X1),dim = 0))\n    y_ = y_s[args.save_which]\n\n    if args.use_cuda:\n        X0 = X0.data.cpu().numpy()\n        if not isinstance(y_, list):\n            y_ = y_.data.cpu().numpy()\n        else:\n            y_ = [item.data.cpu().numpy() for item in y_]\n        offset = [offset_i.data.cpu().numpy() for offset_i in offset]\n        filter = [filter_i.data.cpu().numpy() for filter_i in filter]  if filter[0] is not None else None\n        X1 = X1.data.cpu().numpy()\n    else:\n        X0 = X0.data.numpy()\n        if not isinstance(y_, list):\n            y_ = y_.data.numpy()\n        else:\n            y_ = [item.data.numpy() for item in y_]\n        offset = [offset_i.data.numpy() for offset_i in offset]\n        filter = [filter_i.data.numpy() for filter_i in filter]\n        X1 = X1.data.numpy()\n\n    X0 = np.transpose(255.0 * X0.clip(0,1.0)[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft: intPaddingLeft+intWidth], (1, 2, 0))\n    y_ = [np.transpose(255.0 * item.clip(0,1.0)[0, :, intPaddingTop:intPaddingTop+intHeight,\n                                intPaddingLeft:intPaddingLeft+intWidth], (1, 2, 0)) for item in y_]\n    offset = [np.transpose(offset_i[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft: intPaddingLeft+intWidth], (1, 2, 0)) for offset_i in offset]\n    filter = [np.transpose(\n        filter_i[0, :, intPaddingTop:intPaddingTop + intHeight, intPaddingLeft: intPaddingLeft + intWidth],\n        (1, 2, 0)) for filter_i in filter]  if filter is not None else None\n    X1 = 
np.transpose(255.0 * X1.clip(0,1.0)[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft: intPaddingLeft+intWidth], (1, 2, 0))\n\n    interpolated_frame_number = 0\n    shutil.copy(filename_frame_1, os.path.join(output_dir, f\"{input_frame:0>5d}{interpolated_frame_number:0>3d}.png\"))\n    for item, time_offset in zip(y_, time_offsets):\n        interpolated_frame_number += 1\n        output_frame_file_path = os.path.join(output_dir, f\"{input_frame:0>5d}{interpolated_frame_number:0>3d}.png\")\n        imsave(output_frame_file_path, np.round(item).astype(numpy.uint8))\n\n    end_time = time.time()\n    loop_timer.update(end_time - start_time)\n\n    frames_left = final_frame - input_frame\n    estimated_seconds_left = frames_left * loop_timer.avg\n    estimated_time_left = datetime.timedelta(seconds=estimated_seconds_left)\n    print(f\"****** Processed frame {input_frame} | Time per frame (avg): {loop_timer.avg:2.2f}s | Time left: {estimated_time_left} ******************\" )\n\n# Copying last frame\nlast_frame_filename = os.path.join(frames_dir, str(str(final_frame).zfill(5))+'.png')\nshutil.copy(last_frame_filename, os.path.join(output_dir, f\"{final_frame:0>5d}{0:0>3d}.png\"))\n\nprint(\"Finished processing images.\")\n"
  },
  {
    "path": "datasets/Vimeo_90K_interp.py",
    "content": "import os.path\nimport random\n# import glob\nimport math\nfrom .listdatasets import ListDataset,Vimeo_90K_loader\n\n\ndef make_dataset(root, list_file):\n    raw_im_list = open(os.path.join(root, list_file)).read().splitlines()\n    # the last line is invalid in test set.\n    # print(\"The last sample is : \" + raw_im_list[-1])\n    raw_im_list = raw_im_list[:-1]\n    assert len(raw_im_list) > 0\n    random.shuffle(raw_im_list)\n\n    return  raw_im_list\n\ndef Vimeo_90K_interp(root, split=1.0, single=False, task = 'interp' ):\n    train_list = make_dataset(root,\"tri_trainlist.txt\")\n    test_list = make_dataset(root,\"tri_testlist.txt\")\n    train_dataset = ListDataset(root, train_list, loader=Vimeo_90K_loader)\n    test_dataset = ListDataset(root, test_list, loader=Vimeo_90K_loader)\n    return train_dataset, test_dataset"
  },
  {
    "path": "datasets/__init__.py",
    "content": "from .Vimeo_90K_interp import Vimeo_90K_interp\n\n__all__ = (\n           'Vimeo_90K_interp',\n)\n\n# Vimeo_90K = \"/tmp4/wenbobao_data/vimeo_triplet\"\n"
  },
  {
    "path": "datasets/listdatasets.py",
    "content": "import torch.utils.data as data\nimport os\nimport os.path\nfrom scipy.ndimage import imread\nimport numpy as np\nimport random\n\ndef Vimeo_90K_loader(root, im_path, input_frame_size = (3, 256, 448), output_frame_size = (3, 256, 448), data_aug = True):\n\n\n    root = os.path.join(root,'sequences',im_path)\n\n    if data_aug and random.randint(0, 1):\n        path_pre2 = os.path.join(root,  \"im1.png\")\n        path_mid = os.path.join(root,  \"im2.png\")\n        path_pre1 = os.path.join(root,  \"im3.png\")\n    else:\n        path_pre1 = os.path.join(root,  \"im1.png\")\n        path_mid = os.path.join(root,  \"im2.png\")\n        path_pre2 = os.path.join(root,  \"im3.png\")\n\n    im_pre2 = imread(path_pre2)\n    im_pre1 = imread(path_pre1)\n    im_mid = imread(path_mid)\n\n    h_offset = random.choice(range(256 - input_frame_size[1] + 1))\n    w_offset = random.choice(range(448 - input_frame_size[2] + 1))\n\n    im_pre2 = im_pre2[h_offset:h_offset + input_frame_size[1], w_offset: w_offset + input_frame_size[2], :]\n    im_pre1 = im_pre1[h_offset:h_offset + input_frame_size[1], w_offset: w_offset + input_frame_size[2], :]\n    im_mid = im_mid[h_offset:h_offset + input_frame_size[1], w_offset: w_offset + input_frame_size[2], :]\n\n    if data_aug:\n        if random.randint(0, 1):\n            im_pre2 = np.fliplr(im_pre2)\n            im_mid = np.fliplr(im_mid)\n            im_pre1 = np.fliplr(im_pre1)\n        if random.randint(0, 1):\n            im_pre2 = np.flipud(im_pre2)\n            im_mid = np.flipud(im_mid)\n            im_pre1 = np.flipud(im_pre1)\n\n    X0 = np.transpose(im_pre1,(2,0,1))\n    X2 = np.transpose(im_pre2, (2, 0, 1))\n\n    y = np.transpose(im_mid, (2, 0, 1))\n    return X0.astype(\"float32\")/ 255.0, \\\n            X2.astype(\"float32\")/ 255.0,\\\n            y.astype(\"float32\")/ 255.0\n\n\n\nclass ListDataset(data.Dataset):\n    def __init__(self, root, path_list,  loader=Vimeo_90K_loader):\n\n        self.root = 
root\n        self.path_list = path_list\n        self.loader = loader\n\n    def __getitem__(self, index):\n        path = self.path_list[index]\n        # print(path)\n        image_0,image_2,image_1 = self.loader(self.root, path)\n        return image_0,image_2,image_1\n\n    def __len__(self):\n        return len(self.path_list)\n"
  },
  {
    "path": "demo_MiddleBury.py",
    "content": "import time\nimport os\nfrom torch.autograd import Variable\nimport math\nimport torch\n\nimport random\nimport numpy as np\nimport numpy\nimport networks\nfrom my_args import  args\n\nfrom scipy.misc import imread, imsave\nfrom AverageMeter import  *\n\ntorch.backends.cudnn.benchmark = True # to speed up the\n\n\nDO_MiddleBurryOther = True\nMB_Other_DATA = \"./MiddleBurySet/other-data/\"\nMB_Other_RESULT = \"./MiddleBurySet/other-result-author/\"\nMB_Other_GT = \"./MiddleBurySet/other-gt-interp/\"\nif not os.path.exists(MB_Other_RESULT):\n    os.mkdir(MB_Other_RESULT)\n\n\n\nmodel = networks.__dict__[args.netName](channel=args.channels,\n                            filter_size = args.filter_size ,\n                            timestep=args.time_step,\n                            training=False)\n\nif args.use_cuda:\n    model = model.cuda()\n\nargs.SAVED_MODEL = './model_weights/best.pth'\nif os.path.exists(args.SAVED_MODEL):\n    print(\"The testing model weight is: \" + args.SAVED_MODEL)\n    if not args.use_cuda:\n        pretrained_dict = torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage)\n        # model.load_state_dict(torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage))\n    else:\n        pretrained_dict = torch.load(args.SAVED_MODEL)\n        # model.load_state_dict(torch.load(args.SAVED_MODEL))\n\n    model_dict = model.state_dict()\n    # 1. filter out unnecessary keys\n    pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}\n    # 2. overwrite entries in the existing state dict\n    model_dict.update(pretrained_dict)\n    # 3. load the new state dict\n    model.load_state_dict(model_dict)\n    # 4. 
release the pretrained dict for saving memory\n    pretrained_dict = []\nelse:\n    print(\"*****************************************************************\")\n    print(\"**** We don't load any trained weights **************************\")\n    print(\"*****************************************************************\")\n\nmodel = model.eval() # deploy mode\n\n\nuse_cuda=args.use_cuda\nsave_which=args.save_which\ndtype = args.dtype\nunique_id =str(random.randint(0, 100000))\nprint(\"The unique id for current testing is: \" + str(unique_id))\n\ninterp_error = AverageMeter()\nif DO_MiddleBurryOther:\n    subdir = os.listdir(MB_Other_DATA)\n    gen_dir = os.path.join(MB_Other_RESULT, unique_id)\n    os.mkdir(gen_dir)\n\n    tot_timer = AverageMeter()\n    proc_timer = AverageMeter()\n    end = time.time()\n    for dir in subdir:\n        print(dir)\n        os.mkdir(os.path.join(gen_dir, dir))\n        arguments_strFirst = os.path.join(MB_Other_DATA, dir, \"frame10.png\")\n        arguments_strSecond = os.path.join(MB_Other_DATA, dir, \"frame11.png\")\n        arguments_strOut = os.path.join(gen_dir, dir, \"frame10i11.png\")\n        gt_path = os.path.join(MB_Other_GT, dir, \"frame10i11.png\")\n\n        X0 =  torch.from_numpy( np.transpose(imread(arguments_strFirst) , (2,0,1)).astype(\"float32\")/ 255.0).type(dtype)\n        X1 =  torch.from_numpy( np.transpose(imread(arguments_strSecond) , (2,0,1)).astype(\"float32\")/ 255.0).type(dtype)\n\n\n        y_ = torch.FloatTensor()\n\n        assert (X0.size(1) == X1.size(1))\n        assert (X0.size(2) == X1.size(2))\n\n        intWidth = X0.size(2)\n        intHeight = X0.size(1)\n        channel = X0.size(0)\n        if not channel == 3:\n            continue\n\n        if intWidth != ((intWidth >> 7) << 7):\n            intWidth_pad = (((intWidth >> 7) + 1) << 7)  # more than necessary\n            intPaddingLeft =int(( intWidth_pad - intWidth)/2)\n            intPaddingRight = intWidth_pad - intWidth - 
intPaddingLeft\n        else:\n            intWidth_pad = intWidth\n            intPaddingLeft = 32\n            intPaddingRight= 32\n\n        if intHeight != ((intHeight >> 7) << 7):\n            intHeight_pad = (((intHeight >> 7) + 1) << 7)  # more than necessary\n            intPaddingTop = int((intHeight_pad - intHeight) / 2)\n            intPaddingBottom = intHeight_pad - intHeight - intPaddingTop\n        else:\n            intHeight_pad = intHeight\n            intPaddingTop = 32\n            intPaddingBottom = 32\n\n        pader = torch.nn.ReplicationPad2d([intPaddingLeft, intPaddingRight , intPaddingTop, intPaddingBottom])\n\n        torch.set_grad_enabled(False)\n        X0 = Variable(torch.unsqueeze(X0,0))\n        X1 = Variable(torch.unsqueeze(X1,0))\n        X0 = pader(X0)\n        X1 = pader(X1)\n\n        if use_cuda:\n            X0 = X0.cuda()\n            X1 = X1.cuda()\n        proc_end = time.time()\n        y_s,offset,filter = model(torch.stack((X0, X1),dim = 0))\n        y_ = y_s[save_which]\n\n        proc_timer.update(time.time() -proc_end)\n        tot_timer.update(time.time() - end)\n        end  = time.time()\n        print(\"*****************current image process time \\t \" + str(time.time()-proc_end )+\"s ******************\" )\n        if use_cuda:\n            X0 = X0.data.cpu().numpy()\n            y_ = y_.data.cpu().numpy()\n            offset = [offset_i.data.cpu().numpy() for offset_i in offset]\n            filter = [filter_i.data.cpu().numpy() for filter_i in filter]  if filter[0] is not None else None\n            X1 = X1.data.cpu().numpy()\n        else:\n            X0 = X0.data.numpy()\n            y_ = y_.data.numpy()\n            offset = [offset_i.data.numpy() for offset_i in offset]\n            filter = [filter_i.data.numpy() for filter_i in filter]\n            X1 = X1.data.numpy()\n\n\n\n        X0 = np.transpose(255.0 * X0.clip(0,1.0)[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft: 
intPaddingLeft+intWidth], (1, 2, 0))\n        y_ = np.transpose(255.0 * y_.clip(0,1.0)[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft: intPaddingLeft+intWidth], (1, 2, 0))\n        offset = [np.transpose(offset_i[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft: intPaddingLeft+intWidth], (1, 2, 0)) for offset_i in offset]\n        filter = [np.transpose(\n            filter_i[0, :, intPaddingTop:intPaddingTop + intHeight, intPaddingLeft: intPaddingLeft + intWidth],\n            (1, 2, 0)) for filter_i in filter]  if filter is not None else None\n        X1 = np.transpose(255.0 * X1.clip(0,1.0)[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft: intPaddingLeft+intWidth], (1, 2, 0))\n\n\n        imsave(arguments_strOut, np.round(y_).astype(numpy.uint8))\n\n\n        rec_rgb =  imread(arguments_strOut)\n        gt_rgb = imread(gt_path)\n\n        diff_rgb = 128.0 + rec_rgb - gt_rgb\n        avg_interp_error_abs = np.mean(np.abs(diff_rgb - 128.0))\n\n        interp_error.update(avg_interp_error_abs, 1)\n\n        mse = numpy.mean((diff_rgb - 128.0) ** 2)\n\n        PIXEL_MAX = 255.0\n        psnr = 20 * math.log10(PIXEL_MAX / math.sqrt(mse))\n\n        print(\"interpolation error / PSNR : \" + str(round(avg_interp_error_abs,4)) + \" / \" + str(round(psnr,4)))\n        metrics = \"The average interpolation error / PSNR for all images are : \" + str(round(interp_error.avg, 4))\n        print(metrics)\n\n"
  },
  {
    "path": "demo_MiddleBury_slowmotion.py",
    "content": "import time\nimport os\nfrom torch.autograd import Variable\nimport torch\nimport random\nimport numpy as np\nimport numpy\nimport networks\nfrom my_args import  args\nfrom scipy.misc import imread, imsave\nfrom AverageMeter import  *\nimport shutil\n\ntorch.backends.cudnn.benchmark = True # to speed up the\n\nDO_MiddleBurryOther = True\nMB_Other_DATA = \"./MiddleBurySet/other-data/\"\nMB_Other_RESULT = \"./MiddleBurySet/other-result-author/\"\nMB_Other_GT = \"./MiddleBurySet/other-gt-interp/\"\nif not os.path.exists(MB_Other_RESULT):\n    os.mkdir(MB_Other_RESULT)\n\n\n\nmodel = networks.__dict__[args.netName](    channel=args.channels,\n                                    filter_size = args.filter_size ,\n                                    timestep=args.time_step,\n                                    training=False)\n\nif args.use_cuda:\n    model = model.cuda()\n\nargs.SAVED_MODEL = './model_weights/best.pth'\nif os.path.exists(args.SAVED_MODEL):\n    print(\"The testing model weight is: \" + args.SAVED_MODEL)\n    if not args.use_cuda:\n        pretrained_dict = torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage)\n        # model.load_state_dict(torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage))\n    else:\n        pretrained_dict = torch.load(args.SAVED_MODEL)\n        # model.load_state_dict(torch.load(args.SAVED_MODEL))\n\n    model_dict = model.state_dict()\n    # 1. filter out unnecessary keys\n    pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}\n    # 2. overwrite entries in the existing state dict\n    model_dict.update(pretrained_dict)\n    # 3. load the new state dict\n    model.load_state_dict(model_dict)\n    # 4. 
release the pretrained dict for saving memory\n    pretrained_dict = []\nelse:\n    print(\"*****************************************************************\")\n    print(\"**** We don't load any trained weights **************************\")\n    print(\"*****************************************************************\")\n\nmodel = model.eval() # deploy mode\n\nuse_cuda=args.use_cuda\nsave_which=args.save_which\ndtype = args.dtype\nunique_id =str(random.randint(0, 100000))\nprint(\"The unique id for current testing is: \" + str(unique_id))\n\ninterp_error = AverageMeter()\nif DO_MiddleBurryOther:\n    subdir = os.listdir(MB_Other_DATA)\n    gen_dir = os.path.join(MB_Other_RESULT, unique_id)\n    os.mkdir(gen_dir)\n\n    tot_timer = AverageMeter()\n    proc_timer = AverageMeter()\n    end = time.time()\n    for dir in subdir: \n        print(dir)\n        os.mkdir(os.path.join(gen_dir, dir))\n        arguments_strFirst = os.path.join(MB_Other_DATA, dir, \"frame10.png\")\n        arguments_strSecond = os.path.join(MB_Other_DATA, dir, \"frame11.png\")\n        gt_path = os.path.join(MB_Other_GT, dir, \"frame10i11.png\")\n\n        X0 =  torch.from_numpy( np.transpose(imread(arguments_strFirst) , (2,0,1)).astype(\"float32\")/ 255.0).type(dtype)\n        X1 =  torch.from_numpy( np.transpose(imread(arguments_strSecond) , (2,0,1)).astype(\"float32\")/ 255.0).type(dtype)\n\n\n        y_ = torch.FloatTensor()\n\n        assert (X0.size(1) == X1.size(1))\n        assert (X0.size(2) == X1.size(2))\n\n        intWidth = X0.size(2)\n        intHeight = X0.size(1)\n        channel = X0.size(0)\n        if not channel == 3:\n            continue\n\n        if intWidth != ((intWidth >> 7) << 7):\n            intWidth_pad = (((intWidth >> 7) + 1) << 7)  # more than necessary\n            intPaddingLeft =int(( intWidth_pad - intWidth)/2)\n            intPaddingRight = intWidth_pad - intWidth - intPaddingLeft\n        else:\n            intWidth_pad = intWidth\n            
intPaddingLeft = 32\n            intPaddingRight= 32\n\n        if intHeight != ((intHeight >> 7) << 7):\n            intHeight_pad = (((intHeight >> 7) + 1) << 7)  # more than necessary\n            intPaddingTop = int((intHeight_pad - intHeight) / 2)\n            intPaddingBottom = intHeight_pad - intHeight - intPaddingTop\n        else:\n            intHeight_pad = intHeight\n            intPaddingTop = 32\n            intPaddingBottom = 32\n\n        pader = torch.nn.ReplicationPad2d([intPaddingLeft, intPaddingRight , intPaddingTop, intPaddingBottom])\n\n        torch.set_grad_enabled(False)\n        X0 = Variable(torch.unsqueeze(X0,0))\n        X1 = Variable(torch.unsqueeze(X1,0))\n        X0 = pader(X0)\n        X1 = pader(X1)\n\n        if use_cuda:\n            X0 = X0.cuda()\n            X1 = X1.cuda()\n        proc_end = time.time()\n        y_s,offset,filter = model(torch.stack((X0, X1),dim = 0))\n        y_ = y_s[save_which]\n\n        proc_timer.update(time.time() -proc_end)\n        tot_timer.update(time.time() - end)\n        end  = time.time()\n        print(\"*****************current image process time \\t \" + str(time.time()-proc_end )+\"s ******************\" )\n        if use_cuda:\n            X0 = X0.data.cpu().numpy()\n            if not isinstance(y_, list):\n                y_ = y_.data.cpu().numpy()\n            else:\n                y_ = [item.data.cpu().numpy() for item in y_]\n            offset = [offset_i.data.cpu().numpy() for offset_i in offset]\n            filter = [filter_i.data.cpu().numpy() for filter_i in filter]  if filter[0] is not None else None\n            X1 = X1.data.cpu().numpy()\n        else:\n            X0 = X0.data.numpy()\n            if not isinstance(y_, list):\n                y_ = y_.data.numpy()\n            else:\n                y_ = [item.data.numpy() for item in y_]\n            offset = [offset_i.data.numpy() for offset_i in offset]\n            filter = [filter_i.data.numpy() for filter_i in 
filter]\n            X1 = X1.data.numpy()\n\n\n\n        X0 = np.transpose(255.0 * X0.clip(0,1.0)[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft: intPaddingLeft+intWidth], (1, 2, 0))\n        y_ = [np.transpose(255.0 * item.clip(0,1.0)[0, :, intPaddingTop:intPaddingTop+intHeight,\n                                  intPaddingLeft: intPaddingLeft+intWidth], (1, 2, 0)) for item in y_]\n        offset = [np.transpose(offset_i[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft: intPaddingLeft+intWidth], (1, 2, 0)) for offset_i in offset]\n        filter = [np.transpose(\n            filter_i[0, :, intPaddingTop:intPaddingTop + intHeight, intPaddingLeft: intPaddingLeft + intWidth],\n            (1, 2, 0)) for filter_i in filter]  if filter is not None else None\n        X1 = np.transpose(255.0 * X1.clip(0,1.0)[0, :, intPaddingTop:intPaddingTop+intHeight, intPaddingLeft: intPaddingLeft+intWidth], (1, 2, 0))\n\n        timestep = args.time_step\n        numFrames = int(1.0 / timestep) - 1\n        time_offsets = [kk * timestep for kk in range(1, 1 + numFrames, 1)]\n        # for item, time_offset  in zip(y_,time_offsets):\n        #     arguments_strOut = os.path.join(gen_dir, dir, \"frame10_i{:.3f}_11.png\".format(time_offset))\n        #\n        #     imsave(arguments_strOut, np.round(item).astype(numpy.uint8))\n        #\n        # # copy the first and second reference frame\n        # shutil.copy(arguments_strFirst, os.path.join(gen_dir, dir,  \"frame10_i{:.3f}_11.png\".format(0)))\n        # shutil.copy(arguments_strSecond, os.path.join(gen_dir, dir,  \"frame11_i{:.3f}_11.png\".format(1)))\n\n        count = 0\n        shutil.copy(arguments_strFirst, os.path.join(gen_dir, dir, \"{:0>4d}.png\".format(count)))\n        count  = count+1\n        for item, time_offset in zip(y_, time_offsets):\n            arguments_strOut = os.path.join(gen_dir, dir, \"{:0>4d}.png\".format(count))\n            count = count + 1\n            imsave(arguments_strOut, 
np.round(item).astype(numpy.uint8))\n        shutil.copy(arguments_strSecond, os.path.join(gen_dir, dir, \"{:0>4d}.png\".format(count)))\n        count = count + 1\n\n\n         "
  },
  {
    "path": "environment.yaml",
    "content": "name: pytorch1.0.0\nchannels:\n  - pytorch\n  - serge-sans-paille\n  - anaconda\n  - conda-forge\n  - defaults\ndependencies:\n  - ca-certificates=2019.1.23=0\n  - certifi=2018.11.29=py36_0\n  - cloudpickle=0.7.0=py_0\n  - cytoolz=0.9.0.1=py36h14c3975_1\n  - dask-core=1.1.1=py_0\n  - decorator=4.3.2=py36_0\n  - imageio=2.4.1=py36_0\n  - networkx=2.2=py36_1\n  - openssl=1.1.1=h7b6447c_0\n  - pywavelets=1.0.1=py36hdd07704_0\n  - scikit-image=0.14.1=py36he6710b0_0\n  - scipy=1.1.0=py36h7c811a0_0\n  - toolz=0.9.0=py36_0\n  - cycler=0.10.0=py_1\n  - expat=2.2.5=hf484d3e_1002\n  - fontconfig=2.13.1=h2176d3f_1000\n  - gettext=0.19.8.1=h9745a5d_1001\n  - glib=2.56.2=had28632_1001\n  - icu=58.2=hf484d3e_1000\n  - kiwisolver=1.0.1=py36h6bb024c_1002\n  - libiconv=1.15=h14c3975_1004\n  - libprotobuf=3.6.1=hdbcaa40_1000\n  - libuuid=2.32.1=h14c3975_1000\n  - libxcb=1.13=h14c3975_1002\n  - libxml2=2.9.8=h143f9aa_1005\n  - matplotlib=3.0.2=py36_1002\n  - matplotlib-base=3.0.2=py36h167e16e_1002\n  - protobuf=3.6.1=py36hf484d3e_1001\n  - pthread-stubs=0.4=h14c3975_1001\n  - pyparsing=2.3.1=py_0\n  - pyqt=5.6.0=py36h13b7fb3_1008\n  - python-dateutil=2.8.0=py_0\n  - sip=4.18.1=py36hf484d3e_1000\n  - tensorboardx=1.6=py_0\n  - tk=8.6.9=h84994c4_1000\n  - tornado=5.1.1=py36h14c3975_1000\n  - xorg-libxau=1.0.9=h14c3975_0\n  - xorg-libxdmcp=1.1.2=h14c3975_1007\n  - blas=1.0=mkl\n  - cffi=1.11.5=py36he75722e_1\n  - cudatoolkit=9.0=h13b8566_0\n  - dbus=1.13.2=h714fa37_1\n  - freetype=2.9.1=h8a8886c_1\n  - gst-plugins-base=1.14.0=hbbd80ab_1\n  - gstreamer=1.14.0=hb453b48_1\n  - intel-openmp=2019.1=144\n  - isl=0.12.2=0\n  - jpeg=9b=h024ee3a_2\n  - libedit=3.1.20181209=hc058e9b_0\n  - libffi=3.2.1=hd88cf55_4\n  - libgcc-ng=8.2.0=hdf63c60_1\n  - libgfortran-ng=7.3.0=hdf63c60_0\n  - libpng=1.6.36=hbc83047_0\n  - libstdcxx-ng=8.2.0=hdf63c60_1\n  - libtiff=4.0.10=h2733197_2\n  - mkl=2019.1=144\n  - mkl_fft=1.0.10=py36ha843d7b_0\n  - mkl_random=1.0.2=py36hd81dba3_0\n  - 
mpc=1.0.3=hf803216_4\n  - mpfr=3.1.5=h12ff648_1\n  - ncurses=6.1=he6710b0_1\n  - ninja=1.8.2=py36h6bb024c_1\n  - numpy=1.15.4=py36h7e9f1db_0\n  - numpy-base=1.15.4=py36hde5b4d6_0\n  - olefile=0.46=py36_0\n  - pcre=8.42=h439df22_0\n  - pillow=5.4.1=py36h34e0f95_0\n  - pip=19.0.1=py36_0\n  - pycparser=2.19=py36_0\n  - python=3.6.8=h0371630_0\n  - qt=5.6.3=h8bf5577_3\n  - readline=7.0=h7b6447c_5\n  - setuptools=40.8.0=py36_0\n  - six=1.12.0=py36_0\n  - sqlite=3.26.0=h7b6447c_0\n  - wheel=0.32.3=py36_0\n  - xz=5.2.4=h14c3975_4\n  - zlib=1.2.11=h7b6447c_3\n  - zstd=1.3.7=h0b5b093_0\n  - pytorch=1.0.1=py3.6_cuda9.0.176_cudnn7.4.2_2\n  - torchvision=0.2.1=py_2\n  - cloog=0.18.1=1\n  - gcc_49=4.9.1=6\n  - gmp=5.1.3=0\n  - pip:\n    - correlation-cuda==0.0.0\n    - dask==1.1.1\n    - depthflowprojection-cuda==0.0.0\n    - filterinterpolation-cuda==0.0.0\n    - flowprojection-cuda==0.0.0\n    - interpolation-cuda==0.0.0\n    - interpolationch-cuda==0.0.0\n    - mindepthflowprojection-cuda==0.0.0\n    - separableconv-cuda==0.0.0\n    - separableconvflow-cuda==0.0.0\n    - torch==1.0.1.post2\nprefix: /home/wenbobao/anaconda3_new/envs/pytorch1.0.0\n\n"
  },
  {
    "path": "loss_function.py",
    "content": "import sys\nimport os\n\nimport sys\nimport  threading\nimport torch\nfrom torch.autograd import Variable\nfrom lr_scheduler import *\nfrom torch.autograd import gradcheck\n\nimport numpy\n\n\n\n\ndef charbonier_loss(x,epsilon):\n    loss = torch.mean(torch.sqrt(x * x + epsilon * epsilon))\n    return loss\ndef negPSNR_loss(x,epsilon):\n    loss = torch.mean(torch.mean(torch.mean(torch.sqrt(x * x + epsilon * epsilon),dim=1),dim=1),dim=1)\n    return torch.mean(-torch.log(1.0/loss) /100.0)\n\ndef tv_loss(x,epsilon):\n    loss = torch.mean( torch.sqrt(\n        (x[:, :, :-1, :-1] - x[:, :, 1:, :-1]) ** 2 +\n        (x[:, :, :-1, :-1] - x[:, :, :-1, 1:]) ** 2 + epsilon *epsilon\n            )\n        )\n    return loss\n\n    \ndef gra_adap_tv_loss(flow, image, epsilon):\n    w = torch.exp( - torch.sum(\ttorch.abs(image[:,:,:-1, :-1] - image[:,:,1:, :-1]) + \n                            torch.abs(image[:,:,:-1, :-1] - image[:,:,:-1, 1:]), dim = 1))\t\t\n    tv = torch.sum(torch.sqrt((flow[:, :, :-1, :-1] - flow[:, :, 1:, :-1]) ** 2 + (flow[:, :, :-1, :-1] - flow[:, :, :-1, 1:]) ** 2 + epsilon *epsilon) ,dim=1)             \n    loss = torch.mean( w * tv )\n    return loss\t\n        \ndef smooth_loss(x,epsilon):\n    loss = torch.mean(\n        torch.sqrt(\n            (x[:,:,:-1,:-1] - x[:,:,1:,:-1]) **2 +\n            (x[:,:,:-1,:-1] - x[:,:,:-1,1:]) **2+ epsilon**2\n        )\n    )\n    return loss\n    \n    \ndef motion_sym_loss(offset, epsilon, occlusion = None):\n    if occlusion == None:\n        # return torch.mean(torch.sqrt( (offset[:,:2,...] + offset[:,2:,...])**2 + epsilon **2))\n        return torch.mean(torch.sqrt( (offset[0] + offset[1])**2 + epsilon **2))\n    else:\n        # TODO: how to design the occlusion aware offset symmetric loss?\n        # return torch.mean(torch.sqrt((offset[:,:2,...] 
+ offset[:,2:,...])**2 + epsilon **2))\n        return torch.mean(torch.sqrt((offset[0] + offset[1])**2 + epsilon **2))\n\n\n\n    \ndef part_loss(diffs, offsets, occlusions, images, epsilon, use_negPSNR=False):\n    if use_negPSNR:\n        pixel_loss = [negPSNR_loss(diff, epsilon) for diff in diffs]\n    else:\n        pixel_loss = [charbonier_loss(diff, epsilon) for diff in diffs]\n    #offset_loss = [tv_loss(offset[0], epsilon) + tv_loss(offset[1], epsilon) for offset in\n    #               offsets]\n\n    if offsets[0][0] is not None:\n        offset_loss = [gra_adap_tv_loss(offset[0],images[0], epsilon) + gra_adap_tv_loss(offset[1], images[1], epsilon) for offset in\n                   offsets]\n    else:\n        offset_loss = [Variable(torch.zeros(1).cuda())]\n    # print(torch.max(occlusions[0]))\n    # print(torch.min(occlusions[0]))\n    # print(torch.mean(occlusions[0]))\n\n    # occlusion_loss = [smooth_loss(occlusion, epsilon) + charbonier_loss(occlusion - 0.5, epsilon) for occlusion in occlusions]\n    # occlusion_loss = [smooth_loss(occlusion, epsilon) + charbonier_loss(occlusion[:, 0, ...] - occlusion[:, 1, ...], epsilon) for occlusion in occlusions]\n\n\n\n    sym_loss = [motion_sym_loss(offset,epsilon=epsilon) for offset in offsets]\n    # sym_loss = [ motion_sym_loss(offset,occlusion) for offset,occlusion in zip(offsets,occlusions)]\n    return pixel_loss, offset_loss, sym_loss\n\n"
  },
  {
    "path": "lr_scheduler.py",
    "content": "from bisect import bisect_right\r\nfrom torch.optim.optimizer import Optimizer\r\n\r\n\r\nclass _LRScheduler(object):\r\n    def __init__(self, optimizer, last_epoch=-1):\r\n        if not isinstance(optimizer, Optimizer):\r\n            raise TypeError('{} is not an Optimizer'.format(\r\n                type(optimizer).__name__))\r\n        self.optimizer = optimizer\r\n        if last_epoch == -1:\r\n            for group in optimizer.param_groups:\r\n                group.setdefault('initial_lr', group['lr'])\r\n        else:\r\n            for i, group in enumerate(optimizer.param_groups):\r\n                if 'initial_lr' not in group:\r\n                    raise KeyError(\"param 'initial_lr' is not specified \"\r\n                                   \"in param_groups[{}] when resuming an optimizer\".format(i))\r\n        self.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))\r\n        self.step(last_epoch + 1)\r\n        self.last_epoch = last_epoch\r\n\r\n    def get_lr(self):\r\n        raise NotImplementedError\r\n\r\n    def step(self, epoch=None):\r\n        if epoch is None:\r\n            epoch = self.last_epoch + 1\r\n        self.last_epoch = epoch\r\n        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):\r\n            param_group['lr'] = lr\r\n\r\n\r\nclass LambdaLR(_LRScheduler):\r\n    \"\"\"Sets the learning rate of each parameter group to the initial lr\r\n    times a given function. When last_epoch=-1, sets initial lr as lr.\r\n\r\n    Args:\r\n        optimizer (Optimizer): Wrapped optimizer.\r\n        lr_lambda (function or list): A function which computes a multiplicative\r\n            factor given an integer parameter epoch, or a list of such\r\n            functions, one for each group in optimizer.param_groups.\r\n        last_epoch (int): The index of last epoch. 
Default: -1.\r\n\r\n    Example:\r\n        >>> # Assuming optimizer has two groups.\r\n        >>> lambda1 = lambda epoch: epoch // 30\r\n        >>> lambda2 = lambda epoch: 0.95 ** epoch\r\n        >>> scheduler = LambdaLR(optimizer, lr_lambda=[lambda1, lambda2])\r\n        >>> for epoch in range(100):\r\n        >>>     scheduler.step()\r\n        >>>     train(...)\r\n        >>>     validate(...)\r\n    \"\"\"\r\n    def __init__(self, optimizer, lr_lambda, last_epoch=-1):\r\n        self.optimizer = optimizer\r\n        if not isinstance(lr_lambda, list) and not isinstance(lr_lambda, tuple):\r\n            self.lr_lambdas = [lr_lambda] * len(optimizer.param_groups)\r\n        else:\r\n            if len(lr_lambda) != len(optimizer.param_groups):\r\n                raise ValueError(\"Expected {} lr_lambdas, but got {}\".format(\r\n                    len(optimizer.param_groups), len(lr_lambda)))\r\n            self.lr_lambdas = list(lr_lambda)\r\n        self.last_epoch = last_epoch\r\n        super(LambdaLR, self).__init__(optimizer, last_epoch)\r\n\r\n    def get_lr(self):\r\n        return [base_lr * lmbda(self.last_epoch)\r\n                for lmbda, base_lr in zip(self.lr_lambdas, self.base_lrs)]\r\n\r\n\r\n\r\nclass StepLR(_LRScheduler):\r\n    \"\"\"Sets the learning rate of each parameter group to the initial lr\r\n    decayed by gamma every step_size epochs. When last_epoch=-1, sets\r\n    initial lr as lr.\r\n\r\n    Args:\r\n        optimizer (Optimizer): Wrapped optimizer.\r\n        step_size (int): Period of learning rate decay.\r\n        gamma (float): Multiplicative factor of learning rate decay.\r\n            Default: 0.1.\r\n        last_epoch (int): The index of last epoch. 
Default: -1.\r\n\r\n    Example:\r\n        >>> # Assuming optimizer uses lr = 0.5 for all groups\r\n        >>> # lr = 0.05     if epoch < 30\r\n        >>> # lr = 0.005    if 30 <= epoch < 60\r\n        >>> # lr = 0.0005   if 60 <= epoch < 90\r\n        >>> # ...\r\n        >>> scheduler = StepLR(optimizer, step_size=30, gamma=0.1)\r\n        >>> for epoch in range(100):\r\n        >>>     scheduler.step()\r\n        >>>     train(...)\r\n        >>>     validate(...)\r\n    \"\"\"\r\n\r\n    def __init__(self, optimizer, step_size, gamma=0.1, last_epoch=-1):\r\n        self.step_size = step_size\r\n        self.gamma = gamma\r\n        super(StepLR, self).__init__(optimizer, last_epoch)\r\n\r\n    def get_lr(self):\r\n        return [base_lr * self.gamma ** (self.last_epoch // self.step_size)\r\n                for base_lr in self.base_lrs]\r\n\r\n\r\n\r\nclass MultiStepLR(_LRScheduler):\r\n    \"\"\"Set the learning rate of each parameter group to the initial lr decayed\r\n    by gamma once the number of epoch reaches one of the milestones. When\r\n    last_epoch=-1, sets initial lr as lr.\r\n\r\n    Args:\r\n        optimizer (Optimizer): Wrapped optimizer.\r\n        milestones (list): List of epoch indices. Must be increasing.\r\n        gamma (float): Multiplicative factor of learning rate decay.\r\n            Default: 0.1.\r\n        last_epoch (int): The index of last epoch. 
Default: -1.\r\n\r\n    Example:\r\n        >>> # Assuming optimizer uses lr = 0.5 for all groups\r\n        >>> # lr = 0.05     if epoch < 30\r\n        >>> # lr = 0.005    if 30 <= epoch < 80\r\n        >>> # lr = 0.0005   if epoch >= 80\r\n        >>> scheduler = MultiStepLR(optimizer, milestones=[30,80], gamma=0.1)\r\n        >>> for epoch in range(100):\r\n        >>>     scheduler.step()\r\n        >>>     train(...)\r\n        >>>     validate(...)\r\n    \"\"\"\r\n\r\n    def __init__(self, optimizer, milestones, gamma=0.1, last_epoch=-1):\r\n        if not list(milestones) == sorted(milestones):\r\n            raise ValueError('Milestones should be a list of'\r\n                             ' increasing integers. Got {}', milestones)\r\n        self.milestones = milestones\r\n        self.gamma = gamma\r\n        super(MultiStepLR, self).__init__(optimizer, last_epoch)\r\n\r\n    def get_lr(self):\r\n        return [base_lr * self.gamma ** bisect_right(self.milestones, self.last_epoch)\r\n                for base_lr in self.base_lrs]\r\n\r\n\r\n\r\nclass ExponentialLR(_LRScheduler):\r\n    \"\"\"Set the learning rate of each parameter group to the initial lr decayed\r\n    by gamma every epoch. When last_epoch=-1, sets initial lr as lr.\r\n\r\n    Args:\r\n        optimizer (Optimizer): Wrapped optimizer.\r\n        gamma (float): Multiplicative factor of learning rate decay.\r\n        last_epoch (int): The index of last epoch. 
Default: -1.\r\n    \"\"\"\r\n\r\n    def __init__(self, optimizer, gamma, last_epoch=-1):\r\n        self.gamma = gamma\r\n        super(ExponentialLR, self).__init__(optimizer, last_epoch)\r\n\r\n    def get_lr(self):\r\n        return [base_lr * self.gamma ** self.last_epoch\r\n                for base_lr in self.base_lrs]\r\n\r\n\r\n\r\nclass ReduceLROnPlateau(object):\r\n    \"\"\"Reduce learning rate when a metric has stopped improving.\r\n    Models often benefit from reducing the learning rate by a factor\r\n    of 2-10 once learning stagnates. This scheduler reads a metrics\r\n    quantity and if no improvement is seen for a 'patience' number\r\n    of epochs, the learning rate is reduced.\r\n\r\n    Args:\r\n        optimizer (Optimizer): Wrapped optimizer.\r\n        mode (str): One of `min`, `max`. In `min` mode, lr will\r\n            be reduced when the quantity monitored has stopped\r\n            decreasing; in `max` mode it will be reduced when the\r\n            quantity monitored has stopped increasing. Default: 'min'.\r\n        factor (float): Factor by which the learning rate will be\r\n            reduced. new_lr = lr * factor. Default: 0.1.\r\n        patience (int): Number of epochs with no improvement after\r\n            which learning rate will be reduced. Default: 10.\r\n        verbose (bool): If True, prints a message to stdout for\r\n            each update. Default: False.\r\n        threshold (float): Threshold for measuring the new optimum,\r\n            to only focus on significant changes. Default: 1e-4.\r\n        threshold_mode (str): One of `rel`, `abs`. In `rel` mode,\r\n            dynamic_threshold = best * ( 1 + threshold ) in 'max'\r\n            mode or best * ( 1 - threshold ) in `min` mode.\r\n            In `abs` mode, dynamic_threshold = best + threshold in\r\n            `max` mode or best - threshold in `min` mode. 
Default: 'rel'.\r\n        cooldown (int): Number of epochs to wait before resuming\r\n            normal operation after lr has been reduced. Default: 0.\r\n        min_lr (float or list): A scalar or a list of scalars. A\r\n            lower bound on the learning rate of all param groups\r\n            or each group respectively. Default: 0.\r\n        eps (float): Minimal decay applied to lr. If the difference\r\n            between new and old lr is smaller than eps, the update is\r\n            ignored. Default: 1e-8.\r\n\r\n    Example:\r\n        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)\r\n        >>> scheduler = ReduceLROnPlateau(optimizer, 'min')\r\n        >>> for epoch in range(10):\r\n        >>>     train(...)\r\n        >>>     val_loss = validate(...)\r\n        >>>     # Note that step should be called after validate()\r\n        >>>     scheduler.step(val_loss)\r\n    \"\"\"\r\n\r\n    def __init__(self, optimizer, mode='min', factor=0.1, patience=10,\r\n                 verbose=False, threshold=1e-4, threshold_mode='rel',\r\n                 cooldown=0, min_lr=0, eps=1e-8):\r\n\r\n        if factor >= 1.0:\r\n            raise ValueError('Factor should be < 1.0.')\r\n        self.factor = factor\r\n\r\n        if not isinstance(optimizer, Optimizer):\r\n            raise TypeError('{} is not an Optimizer'.format(\r\n                type(optimizer).__name__))\r\n        self.optimizer = optimizer\r\n\r\n        if isinstance(min_lr, list) or isinstance(min_lr, tuple):\r\n            if len(min_lr) != len(optimizer.param_groups):\r\n                raise ValueError(\"expected {} min_lrs, got {}\".format(\r\n                    len(optimizer.param_groups), len(min_lr)))\r\n            self.min_lrs = list(min_lr)\r\n        else:\r\n            self.min_lrs = [min_lr] * len(optimizer.param_groups)\r\n\r\n        self.patience = patience\r\n        self.verbose = verbose\r\n        self.cooldown = cooldown\r\n        
self.cooldown_counter = 0\r\n        self.mode = mode\r\n        self.threshold = threshold\r\n        self.threshold_mode = threshold_mode\r\n        self.best = None\r\n        self.num_bad_epochs = None\r\n        self.mode_worse = None  # the worse value for the chosen mode\r\n        self.is_better = None\r\n        self.eps = eps\r\n        self.last_epoch = -1\r\n        self._init_is_better(mode=mode, threshold=threshold,\r\n                             threshold_mode=threshold_mode)\r\n        self._reset()\r\n\r\n    def _reset(self):\r\n        \"\"\"Resets num_bad_epochs counter and cooldown counter.\"\"\"\r\n        self.best = self.mode_worse\r\n        self.cooldown_counter = 0\r\n        self.num_bad_epochs = 0\r\n\r\n    def step(self, metrics, epoch=None):\r\n        current = metrics\r\n        if epoch is None:\r\n            epoch = self.last_epoch = self.last_epoch + 1\r\n        self.last_epoch = epoch\r\n\r\n        if self.is_better(current, self.best):\r\n            self.best = current\r\n            self.num_bad_epochs = 0\r\n        else:\r\n            self.num_bad_epochs += 1\r\n\r\n        if self.in_cooldown:\r\n            self.cooldown_counter -= 1\r\n            self.num_bad_epochs = 0  # ignore any bad epochs in cooldown\r\n\r\n        if self.num_bad_epochs > self.patience:\r\n            self._reduce_lr(epoch)\r\n            self.cooldown_counter = self.cooldown\r\n            self.num_bad_epochs = 0\r\n\r\n    def _reduce_lr(self, epoch):\r\n        for i, param_group in enumerate(self.optimizer.param_groups):\r\n            old_lr = float(param_group['lr'])\r\n            new_lr = max(old_lr * self.factor, self.min_lrs[i])\r\n            if old_lr - new_lr > self.eps:\r\n                param_group['lr'] = new_lr\r\n                if self.verbose:\r\n                    print('Epoch {:5d}: reducing learning rate'\r\n                          ' of group {} to {:.4e}.'.format(epoch, i, new_lr))\r\n\r\n    @property\r\n    def 
in_cooldown(self):\r\n        return self.cooldown_counter > 0\r\n\r\n    def _init_is_better(self, mode, threshold, threshold_mode):\r\n        if mode not in {'min', 'max'}:\r\n            raise ValueError('mode ' + mode + ' is unknown!')\r\n        if threshold_mode not in {'rel', 'abs'}:\r\n            raise ValueError('threshold mode ' + mode + ' is unknown!')\r\n        if mode == 'min' and threshold_mode == 'rel':\r\n            rel_epsilon = 1. - threshold\r\n            self.is_better = lambda a, best: a < best * rel_epsilon\r\n            self.mode_worse = float('Inf')\r\n        elif mode == 'min' and threshold_mode == 'abs':\r\n            self.is_better = lambda a, best: a < best - threshold\r\n            self.mode_worse = float('Inf')\r\n        elif mode == 'max' and threshold_mode == 'rel':\r\n            rel_epsilon = threshold + 1.\r\n            self.is_better = lambda a, best: a > best * rel_epsilon\r\n            self.mode_worse = -float('Inf')\r\n        else:  # mode == 'max' and epsilon_mode == 'abs':\r\n            self.is_better = lambda a, best: a > best + threshold\r\n            self.mode_worse = -float('Inf')"
  },
  {
    "path": "my_args.py",
    "content": "import os\nimport datetime\nimport argparse\nimport numpy\nimport networks\nimport  torch\nmodelnames =  networks.__all__\n# import datasets\ndatasetNames = ('Vimeo_90K_interp') #datasets.__all__\n\nparser = argparse.ArgumentParser(description='DAIN')\n\nparser.add_argument('--debug',action = 'store_true', help='Enable debug mode')\nparser.add_argument('--netName', type=str, default='DAIN',\n                    choices = modelnames,help = 'model architecture: ' +\n                        ' | '.join(modelnames) +\n                        ' (default: DAIN)')\n\nparser.add_argument('--datasetName', default='Vimeo_90K_interp',\n                    choices= datasetNames,nargs='+',\n                    help='dataset type : ' +\n                        ' | '.join(datasetNames) +\n                        ' (default: Vimeo_90K_interp)')\nparser.add_argument('--datasetPath',default='',help = 'the path of selected datasets')\nparser.add_argument('--dataset_split', type = int, default=97, help = 'Split a dataset into trainining and validation by percentage (default: 97)')\n\nparser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')\n\nparser.add_argument('--numEpoch', '-e', type = int, default=100, help= 'Number of epochs to train(default:150)')\n\nparser.add_argument('--batch_size', '-b',type = int ,default=1, help = 'batch size (default:1)' )\nparser.add_argument('--workers', '-w', type =int,default=8, help = 'parallel workers for loading training samples (default : 1.6*10 = 16)')\nparser.add_argument('--channels', '-c', type=int,default=3,choices = [1,3], help ='channels of images (default:3)')\nparser.add_argument('--filter_size', '-f', type=int, default=4, help = 'the size of filters used (default: 4)',\n                    choices=[2,4,6, 5,51]\n                    )\n\n\nparser.add_argument('--lr', type =float, default= 0.002, help= 'the basic learning rate for three subnetworks (default: 
0.002)')\nparser.add_argument('--rectify_lr', type=float, default=0.001, help  = 'the learning rate for rectify/refine subnetworks (default: 0.001)')\n\nparser.add_argument('--save_which', '-s', type=int, default=1, choices=[0,1], help='choose which result to save: 0 ==> interpolated, 1==> rectified')\nparser.add_argument('--time_step',  type=float, default=0.5, help='choose the time steps')\nparser.add_argument('--flow_lr_coe', type = float, default=0.01, help = 'relative learning rate w.r.t basic learning rate (default: 0.01)')\nparser.add_argument('--occ_lr_coe', type = float, default=1.0, help = 'relative learning rate w.r.t basic learning rate (default: 1.0)')\nparser.add_argument('--filter_lr_coe', type = float, default=1.0, help = 'relative learning rate w.r.t basic learning rate (default: 1.0)')\nparser.add_argument('--ctx_lr_coe', type = float, default=1.0, help = 'relative learning rate w.r.t basic learning rate (default: 1.0)')\nparser.add_argument('--depth_lr_coe', type = float, default=0.001, help = 'relative learning rate w.r.t basic learning rate (default: 0.01)')\n# parser.add_argument('--deblur_lr_coe', type = float, default=0.01, help = 'relative learning rate w.r.t basic learning rate (default: 0.01)')\n\nparser.add_argument('--alpha', type=float,nargs='+', default=[0.0, 1.0], help= 'the ration of loss for interpolated and rectified result (default: [0.0, 1.0])')\n\nparser.add_argument('--epsilon', type = float, default=1e-6, help = 'the epsilon for charbonier loss,etc (default: 1e-6)')\nparser.add_argument('--weight_decay', type = float, default=0, help = 'the weight decay for whole network ' )\nparser.add_argument('--patience', type=int, default=5, help = 'the patience of reduce on plateou')\nparser.add_argument('--factor', type = float, default=0.2, help = 'the factor of reduce on plateou')\n#\nparser.add_argument('--pretrained', dest='SAVED_MODEL', default=None, help ='path to the pretrained model weights')\nparser.add_argument('--no-date', 
action='store_true', help='don\\'t append date timestamp to folder' )\nparser.add_argument('--use_cuda', default= True, type = bool, help='use cuda or not')\nparser.add_argument('--use_cudnn',default=1,type=int, help = 'use cudnn or not')\nparser.add_argument('--dtype', default=torch.cuda.FloatTensor, choices = [torch.cuda.FloatTensor,torch.FloatTensor],help = 'tensor data type ')\n# parser.add_argument('--resume', default='', type=str, help='path to latest checkpoint (default: none)')\n\n\nparser.add_argument('--uid', type=str, default= None, help='unique id for the training')\nparser.add_argument('--force', action='store_true', help='force to override the given uid')\n\n# Colab version\nparser.add_argument('--start_frame', type = int, default = 1, help='first frame number to process')\nparser.add_argument('--end_frame', type = int, default = 100, help='last frame number to process')\nparser.add_argument('--frame_input_dir', type = str, default = '/content/DAIN/input_frames', help='frame input directory')\nparser.add_argument('--frame_output_dir', type = str, default = '/content/DAIN/output_frames', help='frame output directory')\n\nargs = parser.parse_args()\n\nimport shutil\n\nif args.uid == None:\n    unique_id = str(numpy.random.randint(0, 100000))\n    print(\"revise the unique id to a random numer \" + str(unique_id))\n    args.uid = unique_id\n    timestamp = datetime.datetime.now().strftime(\"%a-%b-%d-%H-%M\")\n    save_path = './model_weights/'+ args.uid  +'-' + timestamp\nelse:\n    save_path = './model_weights/'+ str(args.uid)\n\n# print(\"no pth here : \" + save_path + \"/best\"+\".pth\")\nif not os.path.exists(save_path + \"/best\"+\".pth\"):\n    # print(\"no pth here : \" + save_path + \"/best\" + \".pth\")\n    os.makedirs(save_path,exist_ok=True)\nelse:\n    if not args.force:\n        raise(\"please use another uid \")\n    else:\n        print(\"override this uid\" + args.uid)\n        for m in range(1,10):\n            if not 
os.path.exists(save_path+\"/log.txt.bk\" + str(m)):\n                shutil.copy(save_path+\"/log.txt\", save_path+\"/log.txt.bk\"+str(m))\n                shutil.copy(save_path+\"/args.txt\", save_path+\"/args.txt.bk\"+str(m))\n                break\n\n\n\nparser.add_argument('--save_path',default=save_path,help = 'the output dir of weights')\nparser.add_argument('--log', default = save_path+'/log.txt', help = 'the log file in training')\nparser.add_argument('--arg', default = save_path+'/args.txt', help = 'the args used')\n\nargs = parser.parse_args()\n\n\nwith open(args.log, 'w') as f:\n    f.close()\nwith open(args.arg, 'w') as f:\n    print(args)\n    print(args,file=f)\n    f.close()\nif args.use_cudnn:\n    print(\"cudnn is used\")\n    torch.backends.cudnn.benchmark = True  # to speed up the\nelse:\n    print(\"cudnn is not used\")\n    torch.backends.cudnn.benchmark = False  # to speed up the\n\n"
  },
  {
    "path": "my_package/DepthFlowProjection/DepthFlowProjectionLayer.py",
    "content": "# this is for wrapping the customized layer\nimport torch\nfrom torch.autograd import Function\n#import _ext.my_lib as my_lib\nimport depthflowprojection_cuda as my_lib\n\nclass DepthFlowProjectionLayer(Function):\n    def __init__(self,requires_grad):\n        super(DepthFlowProjectionLayer,self).__init__()\n        # self.requires_grad = requires_grad\n\n    @staticmethod\n    def forward(ctx, input1, input2, requires_grad):\n        # print(\"Depth Aware Flow Projection\")\n        assert(input1.is_contiguous())\n        assert(input2.is_contiguous())\n        # self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it\n        # self.input2 = input2.contiguous()\n        fillhole = 1 if requires_grad == False else 0\n        # if input1.is_cuda:\n        #     self.device = torch.cuda.current_device()\n        # else:\n        #     self.device = -1\n\n        # count = torch.zeros(input1.size(0),1,input1.size(2),input1.size(3)) # for accumulating the homography projections\n        # output = torch.zeros(input1.size())\n\n        if input1.is_cuda:\n            # output = output.cuda()\n            # count = count.cuda()\n            # print(\"correct\")\n            count = torch.cuda.FloatTensor().resize_(input1.size(0), 1, input1.size(2), input1.size(3)).zero_()\n            output = torch.cuda.FloatTensor().resize_(input1.size()).zero_()\n            err = my_lib.DepthFlowProjectionLayer_gpu_forward(input1,input2, count,output, fillhole)\n        else:\n            # output = torch.cuda.FloatTensor(input1.data.size())\n            count = torch.FloatTensor().resize_(input1.size(0), 1, input1.size(2), input1.size(3)).zero_()\n            output = torch.FloatTensor().resize_(input1.size()).zero_()\n            err = my_lib.DepthFlowProjectionLayer_cpu_forward(input1,input2, count, output,fillhole)\n        if err != 0:\n            print(err)\n        # output = output/count # to divide the counter\n\n    
    # self.count = count #to keep this\n        # self.output = output\n\n        ctx.save_for_backward(input1, input2,count,output)\n        ctx.fillhole = fillhole\n\n        # print(self.input1[0, 0, :10, :10])\n        # print(self.count[0, 0, :10, :10])\n        # print(self.input1[0, 0, -10:, -10:])\n        # print(self.count[0, 0, -10:, -10:])\n\n        # the function returns the output to its caller\n        return output\n\n    @staticmethod\n    def backward(ctx, gradoutput):\n        # print(\"Backward of Filter Interpolation Layer\")\n        # gradinput1 = input1.new().zero_()\n        # gradinput2 = input2.new().zero_()\n        # gradinput1 = torch.zeros(self.input1.size())\n\n        input1, input2, count, output = ctx.saved_tensors\n        # fillhole = ctx.fillhole\n\n        if input1.is_cuda:\n            # print(\"CUDA backward\")\n            # gradinput1 = gradinput1.cuda(self.device)\n            gradinput1 = torch.cuda.FloatTensor().resize_(input1.size()).zero_()\n            gradinput2 = torch.cuda.FloatTensor().resize_(input2.size()).zero_()\n\n            err = my_lib.DepthFlowProjectionLayer_gpu_backward(input1,input2,\n                                                               count, output,\n                                                               gradoutput, gradinput1,gradinput2)\n            # print(err)\n            if err != 0 :\n                print(err)\n\n        else:\n            # print(\"CPU backward\")\n            # print(gradoutput)\n            gradinput1 = torch.FloatTensor().resize_(input1.size()).zero_()\n            gradinput2 = torch.FloatTensor().resize_(input2.size()).zero_()\n            err = my_lib.DepthFlowProjectionLayer_cpu_backward(input1, input2,\n                                                               count, output,\n                                                               gradoutput, gradinput1,gradinput2)\n            # print(err)\n            if err != 0:\n                
print(err)\n            # print(gradinput1)\n            # print(gradinput2)\n\n        # print(gradinput1)\n\n        return gradinput1,gradinput2,None\n"
  },
  {
    "path": "my_package/DepthFlowProjection/DepthFlowProjectionModule.py",
    "content": "# my_package/DepthFlowProjection/DepthFlowProjectionModule.py\nfrom torch.nn.modules.module import Module\nfrom .DepthFlowProjectionLayer import DepthFlowProjectionLayer #, FlowFillholeLayer\n\n__all__ =['DepthFlowProjectionModule']\n\nclass DepthFlowProjectionModule(Module):\n    def __init__(self, requires_grad = True):\n        super(DepthFlowProjectionModule, self).__init__()\n        self.requires_grad = requires_grad\n        # self.f = DepthFlowProjectionLayer(requires_grad)\n\n    def forward(self, input1, input2):\n        return DepthFlowProjectionLayer.apply(input1, input2,self.requires_grad)\n\n# class FlowFillholeModule(Module):\n#     def __init__(self,hole_value = -10000.0):\n#         super(FlowFillholeModule, self).__init__()\n#         self.f = FlowFillholeLayer()\n#\n#     def forward(self, input1):\n#         return self.f(input1)\n\n    #we actually don't need to write the backward code for a module, since DepthFlowProjectionLayer (an autograd Function) already defines backward\n\n"
  },
  {
    "path": "my_package/DepthFlowProjection/__init__.py",
    "content": "from  .DepthFlowProjectionModule import *\n"
  },
  {
    "path": "my_package/DepthFlowProjection/depthflowprojection_cuda.cc",
    "content": "#include <torch/torch.h>\r\n#include <ATen/ATen.h>\r\n#include <stdio.h>\r\n#include <iostream>\r\n#include <ATen/cuda/CUDAContext.h> //works for 1.0.0\r\n\r\n#include \"depthflowprojection_cuda_kernel.cuh\"\r\n\r\n\r\nint DepthFlowProjectionLayer_gpu_forward(\r\n\t\tat::Tensor&  input1,\r\n        at::Tensor&  input2,\r\n        at::Tensor&  count,\r\n\t\tat::Tensor&  output,\r\n\t\tint fillhole\r\n\t\t)\r\n{\r\n\r\n\tint error = 1 ;\r\n\r\n\tint channel = input1.size( 1);\r\n\tif(channel!= 2) return error;\r\n\tint batch = input1.size(0);\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\r\n    if(input2.size(1) !=1 ) return error;\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint input2_w_stride = input2.stride(3);\r\n\r\n\tint count_b_stride = count.stride(0);\r\n\tint count_c_stride = count.stride(1);\r\n\tint count_h_stride = count.stride(2);\r\n\tint count_w_stride = count.stride(3);\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n\t//if(w_stride !=1) return error;\r\n\tif(input1_b_stride != output.stride(0)) return error;\r\n\tif(input1_c_stride != output.stride(1)) return error;\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, output);\r\n//    printf(\"In gpu forward\\n\");\r\n\terror = DepthFlowProjection_gpu_forward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement,w,h,channel,batch,fillhole,\r\n\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n            
count_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1,\r\n\t\t\tinput2,\r\n\t\t\tcount,\r\n\t\t\toutput);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\r\n\treturn error;\r\n\r\n}\r\n\r\nint DepthFlowProjectionLayer_gpu_backward(\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n        at::Tensor&  count,\r\n\t\tat::Tensor&  output,\r\n        at::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1,\r\n\t\tat::Tensor&  gradinput2\r\n\t\t)\r\n{\r\n\tint error = 1 ;\r\n\tint channel = input1.size( 1);\r\n\tif(channel!=2) return error;\r\n\tint batch = input1.size(0);\r\n\tif(count.size( 0) != batch) return error;\r\n\tif(count.size(1) != 1) return error;\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n    if(input2.size(1) !=1 ) return error;\r\n    if(count.size(2) != h) return error;// to add some checkpoint\r\n\tif(count.size(3) != w) return error;\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint input2_w_stride = input2.stride(3);\r\n\r\n\tint count_b_stride = count.stride(0);\r\n\tint count_c_stride = count.stride(1);\r\n\tint count_h_stride = count.stride(2);\r\n\tint count_w_stride = count.stride(3);\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n\t//if(w_stride !=1) return error;\r\n\tif(input1_b_stride != gradinput1.stride(0)) return error;\r\n\tif(input1_c_stride != gradinput1.stride(1)) return error;\r\n\r\n//    printf(\"GPU backward: %d,%d,%d,%d\\n\", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride);\r\n//    printf(\"GPU backward: %d,%d,%d,%d\\n\", count_b_stride,count_c_stride,count_h_stride,count_w_stride);\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, 
gradoutput);\r\n\r\n\terror  = DepthFlowProjection_gpu_backward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,batch,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n            count_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1,\r\n            input2,\r\n            count,\r\n            output,\r\n\t\t\tgradoutput,\r\n\t\t\tgradinput1,\r\n\t\t\tgradinput2\r\n\t\t\t);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\t  //printf(\"Am I good in backward function %d\",error);\r\n\r\n\treturn error;\r\n\r\n}\r\n\r\n\r\n\r\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\r\n  m.def(\"DepthFlowProjectionLayer_gpu_forward\", &DepthFlowProjectionLayer_gpu_forward, \"DepthFlowProjection forward (CUDA)\");\r\n  m.def(\"DepthFlowProjectionLayer_gpu_backward\", &DepthFlowProjectionLayer_gpu_backward, \"DepthFlowProjection backward (CUDA)\");\r\n}\r\n"
  },
  {
    "path": "my_package/DepthFlowProjection/depthflowprojection_cuda_kernel.cu",
    "content": "#include <stdio.h>\r\n\r\n#include \"depthflowprojection_cuda_kernel.cuh\"\r\n\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/NativeFunctions.h>\r\n#include <ATen/Dispatch.h>\r\n#include <ATen/cuda/CUDAApplyUtils.cuh>\r\n\r\n\r\n#define min(a,b) ((a<b)?(a):(b))\r\n#define max(a,b) ((a>b)?(a):(b))\r\n\r\n#define DEBUG (0)\r\n#ifndef BLOCKDIMX\r\n#define BLOCKDIMX (32)\r\n#endif\r\n#ifndef BLOCKDIMY\r\n#define BLOCKDIMY (16)\r\n#endif\r\nusing at::Half;\r\n\r\n\r\n\r\n\r\n//forward path of our layer\r\ntemplate <typename scalar_t>\r\n__global__ void DepthFlowProjection_gpu_forward_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__  input1,\tconst scalar_t* __restrict__  input2,\r\n\t\tscalar_t* count,\r\n\t\tscalar_t* output\r\n\t\t)\r\n{\r\n\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\t//only use one dimensioon of the grid and block\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n//\tconst float fillvalue =0.0f;\r\n\r\n\tif( withinXbounds && withinYbounds) {\r\n        float fx = input1[ off + 0 * input1_c_stride + h_i * 
input1_h_stride + w_i ];\r\n        float fy = input1[ off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ];\r\n\r\n        float x2 = (float) (w_i) + fx;\r\n        float y2 = (float) (h_i) + fy;\r\n        if(x2>=0.0f && y2 >= 0.0f &&x2 <= (float) ( w-1) && y2 <= (float) (h -1 ) ){\r\n            int ix2_L = (int) (x2);\r\n            int iy2_T = (int) (y2);\r\n            int ix2_R = min(ix2_L + 1, w - 1);\r\n            int iy2_B = min(iy2_T + 1, h - 1);\r\n\r\n            float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i];\r\n\r\n            atomicAdd(&output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] ,- temp * fx);\r\n            atomicAdd(&output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ],-temp * fx);\r\n            atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] ,-temp * fx);\r\n            atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ],-temp * fx);\r\n\r\n            atomicAdd(&output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] , -temp * fy);\r\n            atomicAdd(&output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R]  , -temp * fy);\r\n            atomicAdd(&output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L]  , -temp * fy);\r\n            atomicAdd(&output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]  , -temp * fy);\r\n\r\n            atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L], temp * 1);\r\n            atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] ,temp *  1);\r\n            atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] , temp * 1);\r\n            atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] ,temp *  1);\r\n        }\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\ntemplate <typename scalar_t>\r\n__global__ void 
DepthFlowProjectionAveraging_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__  input1,\tconst scalar_t* __restrict__  input2,\r\n\t\tscalar_t*  count,\r\n\t\tscalar_t* output\r\n\t\t)\r\n{\r\n\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\t//only use one dimensioon of the grid and block\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n//\tconst float fillvalue =0.0f;\r\n\r\n\tif( withinXbounds && withinYbounds) {\r\n\t    float temp =count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ;\r\n        if(temp > 0.0f){\r\n            output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] /= temp;\r\n            output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] /= temp;\r\n        }\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\ntemplate <typename scalar_t>\r\n__global__ void DepthFlowFillhole_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n      
  const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__  input1,\tconst scalar_t* __restrict__  input2,\r\n\t\tscalar_t*  count,\r\n\t\tscalar_t* output\r\n\t\t)\r\n{\r\n\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\t//only use one dimensioon of the grid and block\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n//\tconst float fillvalue =0.0f;\r\n\r\n\tif( withinXbounds && withinYbounds) {\r\n\t    float temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ;\r\n        if(temp <= 0.0f){\r\n            //search along the four directions,0/90/180/270, until finding at least one\r\n            int left_offset = w_i;            float left_temp = 0.0f;\r\n            while(left_temp == 0.0f && left_offset - 1 >= 0){\r\n                left_offset = left_offset - 1;\r\n                left_temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + left_offset] ;\r\n            }\r\n\r\n            int right_offset = w_i ;            float right_temp = 0.0f;\r\n            while(right_temp ==0.0f && right_offset + 1 <= w - 1 ){\r\n                right_offset  = right_offset + 1 ;\r\n                right_temp =  count[batch_i * count_b_stride + 0 + h_i * count_h_stride + right_offset] ;\r\n            }\r\n\r\n            int up_offset = h_i ;      
      float up_temp = 0.0f;\r\n            while(up_temp == 0.0f && up_offset - 1 >=0){\r\n                up_offset = up_offset - 1;\r\n                up_temp =  count[batch_i * count_b_stride + 0 + up_offset * count_h_stride + w_i ] ;\r\n            }\r\n\r\n            int down_offset = h_i;            float down_temp = 0.0f;\r\n            while(down_temp == 0.0f && down_offset + 1 <= h - 1 ){\r\n                down_offset = down_offset + 1;\r\n                down_temp =  count[batch_i * count_b_stride + 0 + down_offset * count_h_stride + w_i] ;\r\n            }\r\n\r\n            if(left_temp + right_temp + up_temp + down_temp <=0.0f){\r\n                //printf(\"Can't fill hole, find no neighbor vectors availabel\\n\");\r\n                return;\r\n            }\r\n\r\n            left_temp = (left_temp > 0.0f)?1:0;\r\n            right_temp = (right_temp > 0.0f)?1:0;\r\n            up_temp = (up_temp > 0.0f)?1:0;\r\n            down_temp = (down_temp > 0.0f)?1:0;\r\n\r\n            output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] = (\r\n                left_temp *  output[off + 0 * input1_c_stride + h_i * input1_h_stride + left_offset] +\r\n                right_temp *  output[off + 0 * input1_c_stride + h_i * input1_h_stride + right_offset]+\r\n                up_temp *  output[off + 0 * input1_c_stride + up_offset * input1_h_stride + w_i] +\r\n                down_temp *  output[off + 0 * input1_c_stride + down_offset * input1_h_stride + w_i]\r\n            )/(\r\n                left_temp + right_temp + up_temp + down_temp\r\n            ) ;\r\n\r\n\r\n            output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] =(\r\n                left_temp *  output[off + 1 * input1_c_stride + h_i * input1_h_stride + left_offset] +\r\n                right_temp *  output[off + 1 * input1_c_stride + h_i * input1_h_stride + right_offset]+\r\n                up_temp *  output[off + 1 * input1_c_stride + up_offset * input1_h_stride + 
w_i] +\r\n                down_temp *  output[off + 1 * input1_c_stride + down_offset * input1_h_stride + w_i]\r\n            )/(\r\n                left_temp + right_temp + up_temp + down_temp\r\n            ) ;\r\n        }\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\ntemplate <typename scalar_t>\r\n__global__ void DepthFlowProjection_gpu_backward_kernelfunc(\r\n\t\tconst int nElement,  \tconst int w, \tconst int h, const int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__  input1,\tconst scalar_t* __restrict__  input2,\r\n\t\tscalar_t*  count,\r\n\t\tscalar_t* output,\r\n\t\tconst scalar_t* __restrict__  gradoutput,\r\n\t\tscalar_t*  gradinput1,\r\n\t\tscalar_t*  gradinput2\r\n\t\t)\r\n{\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off  = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n\r\n\tif(withinXbounds && withinYbounds){\r\n        float fx = input1[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i] ;\r\n        float fy = input1[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i] ;\r\n\r\n        float x2 = (float) ( w_i ) + fx;\r\n        float y2 = (float) ( h_i ) + fy;\r\n        if( x2 >=0.0f && y2 >= 0.0f && x2 <= (float) 
(w -1) && y2 <= (float) (h-1)){\r\n            int ix2_L = (int)(x2);\r\n            int iy2_T = (int)(y2);\r\n            int ix2_R  = min(ix2_L + 1, w-1);\r\n            int iy2_B  = min(iy2_T + 1, h-1);\r\n\r\n            float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i];\r\n\r\n            int iu_offset = off + 0 * input1_c_stride + h_i * input1_h_stride + w_i;\r\n            gradinput1[iu_offset] += -  gradoutput[off +  0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] * temp /\r\n                                        count[batch_i * count_b_stride + 0+ iy2_T * count_h_stride + ix2_L]  ;\r\n            gradinput1[iu_offset] += -    gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ]  * temp /\r\n                                         count[batch_i * count_b_stride +0 + iy2_T * count_h_stride  + ix2_R]  ;\r\n            gradinput1[iu_offset ] += -  gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] * temp /\r\n                                         count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] ;\r\n            gradinput1[iu_offset ]  += -  gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] * temp /\r\n                                         count[batch_i * count_b_stride + 0+ iy2_B * count_h_stride + ix2_R] ;\r\n\r\n            int iv_offset = off + 1 * input1_c_stride + h_i * input1_h_stride + w_i;\r\n            gradinput1[iv_offset] += -  gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L]  * temp /\r\n                                         count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L]  ;\r\n            gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] * temp /\r\n                                         count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R]  ;\r\n            gradinput1[iv_offset] += -  gradoutput[off + 1 * 
input1_c_stride + iy2_B * input1_h_stride + ix2_L] * temp /\r\n                                    count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L]     ;\r\n            gradinput1[iv_offset] += -  gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] * temp /\r\n                                    count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R]   ;\r\n\r\n\r\n            int weight_offset = batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i;\r\n            gradinput2[weight_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] /\r\n                                            count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] *\r\n                                            (fx - output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] );\r\n            gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] /\r\n                                            count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] *\r\n                                            (fx - output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] );\r\n            gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] /\r\n                                            count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] *\r\n                                            (fx - output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] );\r\n            gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] /\r\n                                            count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] *\r\n                                            (fx - output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ] );\r\n\r\n            
gradinput2[weight_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] /\r\n                                            count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] *\r\n                                            (fy - output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] );\r\n            gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] /\r\n                                            count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] *\r\n                                            (fy - output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] );\r\n            gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] /\r\n                                            count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] *\r\n                                            (fy - output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] );\r\n            gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] /\r\n                                            count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] *\r\n                                            (fy - output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ] );\r\n        }\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\n\r\n\r\nint DepthFlowProjection_gpu_forward_kernel(\r\n\t\tcudaStream_t stream, \t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, \t\tconst int batch, const int fillhole,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int 
count_h_stride, const int count_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\tat::Tensor&  input2,\r\n\t\tat::Tensor&  count,\r\n\t\tat::Tensor&  output\r\n\t\t)\r\n{\r\n    int error = -1;\r\n\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\r\n\t//\t\tblockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n//    printf(\"I am here\\n\");\r\n\t//extract the data of CudaTensor and use kernel to calculate.\r\n\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"DepthFlowProjection_gpu_forward\", ([&] {\r\n\r\n\tDepthFlowProjection_gpu_forward_kernelfunc<<<grid,block,0, stream >>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n            count_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()\r\n\t\t\t);\r\n\t\t\t\r\n\t}));\r\n\t\t\t\r\n    cudaError_t err = cudaGetLastError();\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateOutput: %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n//    printf(\"I am there\\n\");\r\n\r\n\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"DepthFlowProjectionAveraging\", ([&] {\r\n\r\n    DepthFlowProjectionAveraging_kernelfunc<<<grid,block,0,stream>>>(\r\n    \t\tnElement, //to let the 
nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n            count_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()\r\n    );\r\n\t\t}));\r\n\r\n//    printf(\"I am kao\\n\");\r\n\r\n\t//\t\t\tTHCudaCheck(cudaGetLastError());\r\n    err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateOutput: %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n//    printf(\"I am dd\\n\");\r\n\r\n    if(fillhole){\r\n\r\n//        printf(\"use flow fill hole\\n\");\r\n\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"DepthFlowFillhole\", ([&] {\r\n\r\n        DepthFlowFillhole_kernelfunc<<<grid,block,0,stream>>>(\r\n    \t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n            count_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()\r\n        );\r\n\t\t}));\r\n\r\n    err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateOutput: %s\\n\", cudaGetErrorString(err));\r\n\t\treturn error;\r\n\t}\r\n\r\n    }\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n}\r\n\r\nint DepthFlowProjection_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int 
input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  count,        at::Tensor&  output,\r\n\t\tat::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1,\r\n\t\tat::Tensor&  gradinput2\r\n\t\t)\r\n{\r\n\r\n\tint error = -1;\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\t//blockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"DepthFlowProjection_gpu_backward\", ([&] {\r\n\r\n\tDepthFlowProjection_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n            count_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>(),\r\n\t\t\tgradoutput.data<scalar_t>(), gradinput1.data<scalar_t>(), gradinput2.data<scalar_t>()\r\n\t\t\t);\r\n\t\t\t\t\t}));\r\n\r\n//    printf(\"gpu I am there\\n\");\r\n\r\n\tcudaError_t err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpu error in BilinearSampler.updateGradInput %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n//    printf(\"gpu I am here\\n\");\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n\r\n}"
  },
  {
    "path": "my_package/DepthFlowProjection/depthflowprojection_cuda_kernel.cuh",
    "content": "#pragma once\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/Context.h>\r\n#include <cuda_runtime.h>\r\n\r\nint DepthFlowProjection_gpu_forward_kernel(\r\n\t\tcudaStream_t stream, \t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, \t\tconst int batch, const int fillhole,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  count,\r\n\t\tat::Tensor&  output\r\n\r\n\t\t);\r\n\r\nint DepthFlowProjection_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n        at::Tensor&  count,\r\n        at::Tensor&  output,\r\n\t\tat::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1,\r\n\t\tat::Tensor&  gradinput2\r\n\t\t);\r\n"
  },
  {
    "path": "my_package/DepthFlowProjection/setup.py",
    "content": "#!/usr/bin/env python3\nimport os\nimport torch\n\nfrom setuptools import setup, find_packages\nfrom torch.utils.cpp_extension import BuildExtension, CUDAExtension\n\nfrom compiler_args import nvcc_args, cxx_args\n\nsetup(\n    name='depthflowprojection_cuda',\n    ext_modules=[\n        CUDAExtension('depthflowprojection_cuda', [\n            'depthflowprojection_cuda.cc',\n            'depthflowprojection_cuda_kernel.cu'\n        ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args})\n    ],\n    cmdclass={\n        'build_ext': BuildExtension\n    })\n"
  },
  {
    "path": "my_package/FilterInterpolation/FilterInterpolationLayer.py",
    "content": "# this is for wrapping the customized layer\nimport torch\nfrom torch.autograd import Function\nimport filterinterpolation_cuda as my_lib\n\n#Please check how the STN FUNCTION is written :\n#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py\n#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py\n\nclass FilterInterpolationLayer(Function):\n    def __init__(self):\n        super(FilterInterpolationLayer,self).__init__()\n    @staticmethod\n    def forward(ctx, input1,input2,input3):\n\n        assert(input1.is_contiguous())\n        assert(input2.is_contiguous())\n        assert (input3.is_contiguous())\n        # self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it\n        # self.input2 = input2.contiguous() # TODO: Note that this is simply a shallow copy?\n        # self.input3 = input3.contiguous()\n\n        # if input1.is_cuda:\n        #     self.device = torch.cuda.current_device()\n        # else:\n        #     self.device = -1\n\n        # output =  torch.zeros(input1.size())\n\n\n        if input1.is_cuda :\n            # output = output.cuda()\n            output = torch.cuda.FloatTensor().resize_(input1.size()).zero_()\n            my_lib.FilterInterpolationLayer_gpu_forward(input1, input2, input3, output)\n        else:\n            output = torch.FloatTensor(input1.data.size())\n            my_lib.FilterInterpolationLayer_cpu_forward(input1, input2, input3, output)\n\n        ctx.save_for_backward(input1, input2,input3)\n        # the function returns the output to its caller\n        return output\n\n    @staticmethod\n    def backward(ctx, gradoutput):\n        # print(\"Backward of Filter Interpolation Layer\")\n        # gradinput1 = input1.new().zero_()\n        # gradinput2 = input2.new().zero_()\n        # gradinput1 = torch.zeros(self.input1.size())\n        # gradinput2 = torch.zeros(self.input2.size())\n        # gradinput3 = 
torch.zeros(self.input3.size())\n\n        input1, input2, input3= ctx.saved_tensors\n\n        gradinput1 = torch.cuda.FloatTensor().resize_(input1.size()).zero_()\n        gradinput2 = torch.cuda.FloatTensor().resize_(input2.size()).zero_()\n        gradinput3 = torch.cuda.FloatTensor().resize_(input3.size()).zero_()\n        if input1.is_cuda:\n            # print(\"CUDA backward\")\n            # gradinput1 = gradinput1.cuda(self.device)\n            # gradinput2 = gradinput2.cuda(self.device)\n            # gradinput3 = gradinput3.cuda(self.device)\n\n            err = my_lib.FilterInterpolationLayer_gpu_backward(input1,input2, input3, gradoutput, gradinput1, gradinput2, gradinput3)\n            if err != 0 :\n                print(err)\n\n        else:\n            # print(\"CPU backward\")\n            # print(gradoutput)\n            err = my_lib.FilterInterpolationLayer_cpu_backward(input1, input2, input3, gradoutput, gradinput1, gradinput2, gradinput3)\n            # print(err)\n            if err != 0 :\n                print(err)\n            # print(gradinput1)\n            # print(gradinput2)\n\n        # print(gradinput1)\n\n        return gradinput1, gradinput2,gradinput3\n\n# calculate the weights of flow         \nclass WeightLayer(Function):\n    def __init__(self, lambda_e = 10.0/255.0, lambda_v = 1.0, Nw = 3):\n        #lambda_e = 10.0 , lambda_v = 1.0,  Nw = 3,\n        super(WeightLayer,self).__init__()\n        self.lambda_e = lambda_e\n        self.lambda_v = lambda_v\n        self.Nw = Nw\n\n    # flow1_grad\n    def forward(self, input1,input2,input3):\n\n        # assert(input1.is_contiguous())\n        # assert(input2.is_contiguous())\n        self.input1 = input1.contiguous() # ref1 image\n        self.input2 = input2.contiguous() # ref2 image\n        self.input3 = input3.contiguous()\n        # self.flow1_grad = flow1_grad.contiguous() # ref1 flow's grad\n\n        if input1.is_cuda:\n            self.device = 
torch.cuda.current_device()\n        else:\n            self.device = -1\n\n        output =  torch.zeros(input1.size(0), 1 , input1.size(2), input1.size(3))\n\n        if input1.is_cuda :\n            output = output.cuda()\n            err = my_lib.WeightLayer_gpu_forward(input1, input2, input3,\n                                                 # flow1_grad,\n                                                 output,\n                 self.lambda_e,  self.lambda_v, self.Nw\n            )\n            if err != 0 :\n                print(err)\n        else:\n            # output = torch.cuda.FloatTensor(input1.data.size())\n            err = my_lib.WeightLayer_cpu_forward(input1, input2, input3,  output,\n                 self.lambda_e ,  self.lambda_v, self.Nw\n            )\n            if err != 0 :\n                print(err)\n\n        self.output = output # save this for fast back propagation\n        #  the function returns the output to its caller\n        return output\n\n    #TODO: if there are multiple outputs of this function, then the order should be well considered?\n    def backward(self, gradoutput):\n        # print(\"Backward of WeightLayer Layer\")\n        # gradinput1 = input1.new().zero_()\n        # gradinput2 = input2.new().zero_()\n        gradinput1 = torch.zeros(self.input1.size())\n        gradinput2 = torch.zeros(self.input2.size())\n        gradinput3 = torch.zeros(self.input3.size())\n        # gradflow1_grad = torch.zeros(self.flow1_grad.size())\n        if self.input1.is_cuda:\n            #print(\"CUDA backward\")\n            gradinput1 = gradinput1.cuda(self.device)\n            gradinput2 = gradinput2.cuda(self.device)\n            gradinput3 = gradinput3.cuda(self.device)\n            # gradflow1_grad = gradflow1_grad.cuda(self.device)\n\n            err = my_lib.WeightLayer_gpu_backward(\n                self.input1,self.input2,self.input3, self.output,\n                gradoutput,\n                gradinput1, gradinput2, 
gradinput3,\n                self.lambda_e,  self.lambda_v, self.Nw\n            )\n            if err != 0 :\n                print(err)\n\n        else:\n            #print(\"CPU backward\")\n            # print(gradoutput)\n            err = my_lib.WeightLayer_cpu_backward(\n                    self.input1, self.input2,self.input3, self.output,\n                gradoutput,\n                gradinput1, gradinput2, gradinput3,\n                self.lambda_e, self.lambda_v, self.Nw\n                )\n            # print(err)\n            if err != 0 :\n                print(err)\n            # print(gradinput1)\n            # print(gradinput2)\n        # print(\"from 1:\")\n        # print(gradinput3[0,0,...])\n\n        return gradinput1, gradinput2, gradinput3\n  \nclass PixelValueLayer(Function):\n    def __init__(self, sigma_d = 3, tao_r = 0.05, Prowindow = 2 ):\n        super(PixelValueLayer,self).__init__()\n     \n        self.sigma_d = sigma_d\n        self.tao_r = tao_r #maybe not useable\n        self.Prowindow = Prowindow\n\n    def forward(self, input1, input3, flow_weights):\n\n        # assert(input1.is_contiguous())\n        # assert(input2.is_contiguous())\n        self.input1 = input1.contiguous() # ref1 image\n        #self.input2 = input2.contiguous() # ref2 image\n        self.input3 = input3.contiguous() # ref1 flow\n        self.flow_weights = flow_weights.contiguous() # ref1 flow weights\n\n        if input1.is_cuda:\n            self.device = torch.cuda.current_device()\n        else:\n            self.device = -1\n\n        output = torch.zeros(input1.size())\n        \n\n        if input1.is_cuda:\n            output = output.cuda()            \n            err = my_lib.PixelValueLayer_gpu_forward(\n                input1,  input3, flow_weights,   output,\n                self.sigma_d,    self.tao_r ,  self.Prowindow\n            )\n            if err != 0 :\n                print(err)\n        else:\n            # output = 
torch.cuda.FloatTensor(input1.data.size())\n            err = my_lib.PixelValueLayer_cpu_forward(\n                input1,  input3, flow_weights, output,\n                self.sigma_d,    self.tao_r ,  self.Prowindow\n            )\n            if err != 0 :\n                print(err)\n\n        # the function returns the output to its caller\n        return output\n\n    #TODO: if there are multiple outputs of this function, then the order should be well considered?\n    def backward(self, gradoutput):\n        # print(\"Backward of PixelValueLayer Layer\")\n        # gradinput1 = input1.new().zero_()\n        # gradinput2 = input2.new().zero_()\n        gradinput1 = torch.zeros(self.input1.size())\n        #gradinput2 = torch.zeros(self.input2.size())\n        gradinput3 = torch.zeros(self.input3.size())\n        gradflow_weights = torch.zeros(self.flow_weights.size())\n\n        if self.input1.is_cuda:\n            # print(\"CUDA backward\")\n            gradinput1 = gradinput1.cuda(self.device)\n            #gradinput2 = gradinput2.cuda(self.device)\n            gradinput3 = gradinput3.cuda(self.device)\n            gradflow_weights = gradflow_weights.cuda(self.device)\n\n            err = my_lib.PixelValueLayer_gpu_backward(\n                self.input1,self.input3, self.flow_weights,\n                gradoutput,\n                gradinput1,  gradinput3, gradflow_weights,\n                self.sigma_d,    self.tao_r ,  self.Prowindow\n            )\n            if err != 0 :\n                print(err)\n\n        else:\n            #print(\"CPU backward\")\n            # print(gradoutput)\n            err = my_lib.PixelValueLayer_cpu_backward(\n                self.input1,  self.input3, self.flow_weights,\n                gradoutput,\n                gradinput1,   gradinput3, gradflow_weights,\n                self.sigma_d,    self.tao_r ,  self.Prowindow\n            )\n            # print(err)\n            if err != 0 :\n                print(err)\n         
   # print(gradinput1)\n            # print(gradinput2)\n        # print(\"from 2:\")\n        # print(gradinput3[0,0,...])\n        # print(\"Image grad:\")\n        # print(gradinput1[0,:,:4,:4])\n        # print(\"Flow grad:\")\n        # print(gradinput3[0,:,:4,:4])\n        # print(\"Flow_weights grad:\")\n        # print(gradflow_weights[0,:,:4,:4])\n        return gradinput1,  gradinput3, gradflow_weights\n\nclass PixelWeightLayer(Function):\n    def __init__(self,threshhold, sigma_d =3, tao_r =0.05, Prowindow = 2 ):\n        super(PixelWeightLayer,self).__init__()\n        self.threshhold  = threshhold\n        self.sigma_d = sigma_d\n        self.tao_r = tao_r #maybe not useable\n        self.Prowindow = Prowindow\n\n    def forward(self, input3, flow_weights):\n\n        # assert(input1.is_contiguous())\n        # assert(input2.is_contiguous())\n        #self.input1 = input1.contiguous() # ref1 image\n        #self.input2 = input2.contiguous() # ref2 image\n        self.input3 = input3.contiguous() # ref1 flow\n        self.flow_weights = flow_weights.contiguous() # ref1 flow weights\n\n        if input3.is_cuda:\n            self.device = torch.cuda.current_device()\n        else:\n            self.device = -1\n\n        output =  torch.zeros([input3.size(0), 1, input3.size(2), input3.size(3)])\n\n        if input3.is_cuda :\n            output = output.cuda()            \n            err = my_lib.PixelWeightLayer_gpu_forward(\n                input3, flow_weights,   output,\n                self.sigma_d,    self.tao_r ,  self.Prowindow\n            )\n            if err != 0 :\n                print(err)\n        else:\n            # output = torch.cuda.FloatTensor(input1.data.size())\n            err = my_lib.PixelWeightLayer_cpu_forward(\n                input3, flow_weights, output,\n                self.sigma_d,    self.tao_r ,  self.Prowindow\n            )\n            if err != 0 :\n                print(err)\n\n        self.output = output\n     
   # the function returns the output to its caller\n        return output\n\n    #TODO: if there are multiple outputs of this function, then the order should be well considered?\n    def backward(self, gradoutput):\n        # print(\"Backward of PixelWeightLayer Layer\")\n        # gradinput1 = input1.new().zero_()\n        # gradinput2 = input2.new().zero_()\n        #gradinput1 = torch.zeros(self.input1.size())\n        #gradinput2 = torch.zeros(self.input2.size())\n        gradinput3 = torch.zeros(self.input3.size())\n        gradflow_weights = torch.zeros(self.flow_weights.size())\n\n        if self.input3.is_cuda:\n            # print(\"CUDA backward\")\n            #gradinput1 = gradinput1.cuda(self.device)\n            #gradinput2 = gradinput2.cuda(self.device)\n            gradinput3 = gradinput3.cuda(self.device)\n            gradflow_weights = gradflow_weights.cuda(self.device)\n\n            err = my_lib.PixelWeightLayer_gpu_backward(\n                self.input3, self.flow_weights,  self.output,\n                gradoutput,\n                gradinput3, gradflow_weights,\n                self.threshhold,\n                self.sigma_d,    self.tao_r ,  self.Prowindow\n            )\n            if err != 0 :\n                print(err)\n\n        else:\n            # print(\"CPU backward\")\n            # print(gradoutput)\n            err = my_lib.PixelWeightLayer_cpu_backward(\n                self.input3, self.flow_weights, self.output,\n                gradoutput,\n                gradinput3, gradflow_weights,\n                self.threshhold,\n                self.sigma_d,    self.tao_r ,  self.Prowindow\n            )\n            # print(err)\n            if err != 0 :\n                print(err)\n            # print(gradinput1)\n            # print(gradinput2)\n        # print(\"from 3:\")\n        # print(gradinput3[0,0,...])\n\n        return gradinput3, gradflow_weights\n\t\t\n#class ReliableValueLayer(Function):\n#    def __init__(self, Nw =3, 
tao_r =0.05, Prowindow = 2 ):\n#        super(ReliableValueLayer,self).__init__()\n#     \n#        self.Nw = Nw\n#        self.tao_r = tao_r #maybe not useable\n#        self.Prowindow = Prowindow\n#\n#    def forward(self, input3, flow_weight1):\n#\n#        # assert(input1.is_contiguous())\n#        # assert(input2.is_contiguous())\n#        #self.input1 = input1.contiguous() # ref1 image\n#        #self.input2 = input2.contiguous() # ref2 image\n#        self.input3 = input3.contiguous() # ref1 flow\n#        self.flow_weight1 = flow_weight1.contiguous() # ref1 flow weights\n#\n#        if input3.is_cuda:\n#            self.device = torch.cuda.current_device()\n#        else:\n#            self.device = -1\n#\n#        output =  torch.zeros([intpu3.size(0), 1, input3.size(2), input3.size(3)])\n#        #output2 =  torch.zeros(input1.size())\n#        #weight1 =  torch.zeros(input1.size())\n#        #weight2 =  torch.zeros(input1.size())\n#        \n#\n#        if input1.is_cuda :\n#            output = output.cuda()            \n#            my_lib.ReliableValueLayer_gpu_forward(\n#                        input3, flow_weight1, output,\n#                        self.sigma_d,    self.tao_r ,  self.Prowindow )\n#        else:\n#            # output = torch.cuda.FloatTensor(input1.data.size())\n#            my_lib.ReliableValueLayer_cpu_forward(\n#                        input3, flow_weight1, output,\n#                        self.sigma_d,    self.tao_r ,  self.Prowindow )\n#\n#        # the function returns the output to its caller\n#        return output\n#\n#    #TODO: if there are multiple outputs of this function, then the order should be well considered?\n#    def backward(self, gradoutput):\n#        # print(\"Backward of Filter Interpolation Layer\")\n#        # gradinput1 = input1.new().zero_()\n#        # gradinput2 = input2.new().zero_()\n#        #gradinput1 = torch.zeros(self.input1.size())\n#        #gradinput2 = torch.zeros(self.input2.size())\n#     
   gradinput3 = torch.zeros(self.input3.size())\n#        gradflow_weight1 = torch.zeros(self.flow_weight1.size())\n#        \n#        if self.input1.is_cuda:\n#            # print(\"CUDA backward\")\n#            #gradinput1 = gradinput1.cuda(self.device)\n#            #gradinput2 = gradinput2.cuda(self.device)\n#            gradinput3 = gradinput3.cuda(self.device)\n#            gradflow_weight1 = gradflow_weight1.cuda(self.device)\n#\n#            err = my_lib.ReliableValueLayer_gpu_backward(\n#                     self.input3, self.flow_weight1, gradoutput, \n#                     gradinput3,    gradflow_weight1,                        \n#                    self.sigma_d,    self.tao_r ,  self.Prowindow )\n#            if err != 0 :\n#                print(err)\n#\n#        else: \n#            # print(\"CPU backward\")\n#            # print(gradoutput)\n#            err = my_lib.ReliableValueLayer_cpu_backward(\n#                    self.input3,self.flow_weight1, gradoutput, \n#                    gradinput3,    gradflow_weight1,        \n#                    self.sigma_d,    self.tao_r ,  self.Prowindow )\n#            # print(err)\n#            if err != 0 :\n#                print(err)\n#            # print(gradinput1)\n#            # print(gradinput2)\n#\n#        # print(gradinput1)\n#\n#        return gradinput3,gradflow_weight1    \nclass ReliableWeightLayer(Function):\n    def __init__(self, threshhold, sigma_d =3, tao_r =0.05, Prowindow = 2 ):\n        super(ReliableWeightLayer,self).__init__()\n\n        self.threshhold = threshhold\n        self.sigma_d = sigma_d\n        self.tao_r = tao_r #maybe not useable\n        self.Prowindow = Prowindow\n\n    def forward(self, input3):\n\n        # assert(input1.is_contiguous())\n        # assert(input2.is_contiguous())\n        #self.input1 = input1.contiguous() # ref1 image\n        #self.input2 = input2.contiguous() # ref2 image\n        self.input3 = input3.contiguous() # ref1 flow\n        
#self.flow_weight1 = flow_weight1.contiguous() # ref1 flow weights\n\n        if input3.is_cuda:\n            self.device = torch.cuda.current_device()\n        else:\n            self.device = -1\n\n        output =  torch.zeros([input3.size(0), 1, input3.size(2), input3.size(3)] )\n        #output2 =  torch.zeros(input1.size())\n        #weight1 =  torch.zeros(input1.size())\n        #weight2 =  torch.zeros(input1.size())\n\n        if input3.is_cuda :\n            output = output.cuda()            \n            err = my_lib.ReliableWeightLayer_gpu_forward(\n                input3, output,\n                self.sigma_d,    self.tao_r ,  self.Prowindow\n            )\n            if err != 0 :\n                print(err)\n        else:\n            # output = torch.cuda.FloatTensor(input1.data.size())\n            err = my_lib.ReliableWeightLayer_cpu_forward(\n                input3, output,\n                self.sigma_d,    self.tao_r ,  self.Prowindow\n            )\n            if err != 0 :\n                print(err)\n        self.output= output # used for inhibiting some unreliable gradients.\n        # the function returns the output to its caller\n        return output\n\n    #TODO: if there are multiple outputs of this function, then the order should be well considered?\n    def backward(self, gradoutput):\n        #print(\"Backward of ReliableWeightLayer Layer\")\n        # gradinput1 = input1.new().zero_()\n        # gradinput2 = input2.new().zero_()\n        #gradinput1 = torch.zeros(self.input1.size())\n        #gradinput2 = torch.zeros(self.input2.size())\n        gradinput3 = torch.zeros(self.input3.size())\n        #gradflow_weight1 = torch.zeros(self.flow_weight1.size())\n        \n        if self.input3.is_cuda:\n            #print(\"CUDA backward\")\n            #gradinput1 = gradinput1.cuda(self.device)\n            #gradinput2 = gradinput2.cuda(self.device)\n            gradinput3 = gradinput3.cuda(self.device)\n            #gradflow_weight1 
= gradflow_weight1.cuda(self.device)\n\n            err = my_lib.ReliableWeightLayer_gpu_backward(\n                 self.input3,   self.output,\n                 gradoutput,\n                 gradinput3,\n                 self.threshhold,\n                 self.sigma_d,    self.tao_r ,  self.Prowindow\n            )\n            if err != 0 :\n                print(err)\n\n        else:\n            # print(\"CPU backward\")\n            # print(gradoutput)\n            err = my_lib.ReliableWeightLayer_cpu_backward(\n                self.input3, self.output,\n                gradoutput,\n                gradinput3,\n                self.threshhold,\n                self.sigma_d,    self.tao_r ,  self.Prowindow\n            )\n            # print(err)\n            if err != 0 :\n                print(err)\n            # print(gradinput1)\n            # print(gradinput2)\n        # print(\"from 4:\")\n        # print(gradinput3[0,0,...])\n\n        return gradinput3"
  },
  {
    "path": "my_package/FilterInterpolation/FilterInterpolationModule.py",
    "content": "# modules/AdaptiveInterpolationLayer.py\nfrom torch.nn import Module\nimport torch\nfrom torch.autograd import Variable\nfrom torch.autograd import gradcheck\nfrom .FilterInterpolationLayer import FilterInterpolationLayer,WeightLayer, PixelValueLayer,PixelWeightLayer,ReliableWeightLayer\n\nclass FilterInterpolationModule(Module):\n    def __init__(self):\n        super(FilterInterpolationModule, self).__init__()\n        # self.f = FilterInterpolationLayer()\n\n    def forward(self, input1, input2, input3):\n        return FilterInterpolationLayer.apply(input1, input2, input3)\n\n    #we actually dont need to write the backward code for a module, since we have\n\n#class WeightModule(Module):\n#    def __init__(self):\n#        super(WeightModule, self).__init__()\n#        self.f = WeightLayer()\n#\n#    def forward(self, input1, input2, input3):\n#        return self.f(input1, input2, input3)\nclass AdaptiveWeightInterpolationModule(Module):\n    def __init__(self,  training = False, threshhold = 1e-6,\n                 lambda_e = 30.0/255.0, lambda_v = 1.0, Nw = 3.0,\n                 sigma_d =1.5,  tao_r = 0.05, Prowindow = 2 ):\n        super(AdaptiveWeightInterpolationModule, self).__init__()\n\n        self.calc_weight1 = WeightLayer(lambda_e, lambda_v, Nw )\n        self.padder1 = torch.nn.ReplicationPad2d([0, 1 , 0, 1])\n        self.interpolate1 = PixelValueLayer(sigma_d, tao_r , Prowindow)\n        self.interpolate1_1 = PixelWeightLayer(101* threshhold, sigma_d,tao_r, Prowindow)\n        #        self.interpolate_R1 = ReliableValueLayer(Nw, tao_r , Prowindow)\n        self.interpolate_R1_1 = ReliableWeightLayer(101* threshhold, sigma_d,tao_r, Prowindow)\n        \n        self.calc_weight2 = WeightLayer(lambda_e, lambda_v,Nw)\n        self.padder2 = torch.nn.ReplicationPad2d([0, 1 , 0, 1])\n        self.interpolate2 = PixelValueLayer(sigma_d, tao_r , Prowindow )\n        self.interpolate2_1 = PixelWeightLayer(101*threshhold,sigma_d,tao_r, 
Prowindow)\n        #self.interpolate_R2 = ReliableValueLayer(Nw, tao_r , Prowindow)\n        self.interpolate_R2_1 = ReliableWeightLayer(101*threshhold, sigma_d,tao_r, Prowindow)\n\n        self.training = training\n        self.threshold = threshhold\n        return\n        #self.lambda_e = lambda_e\n        #self.lambda_v = lambda_v\n        #self.sigma_d = sigma_d\n        #self.Nw = Nw\n        #self.tao_r = tao_r #maybe not useable\n        #self.Prowindow = Prowindow\n        #    lambda_e = self.lambda_e , lambda_v = self.lambda_v,Nw = self.Nw\n        #    sigma_d = self.sigma_d,  tao_r = self.tao_r , Prowindow = self.Prowindow \n        #self.sigma_d,    self.tao_r ,  self.Prowindow \n\n\n    # input1 ==> ref1 image\n    # #input2 ==> ref2 image\n    # input3 ==> ref1 flow\n    # input4 ==> ref2 flow\n    def forward(self, input1, input2, input3, input4):\n        epsilon = 1e-6\n        #flow1_grad = torch.sum(torch.sqrt(\n        #                    (input3[:, :, :-1, :-1] - input3[:, :, 1:, :-1]) ** 2 +\n        #                    (input3[:, :, :-1, :-1] - input3[:, :, :-1, 1:]) ** 2 + epsilon * epsilon\n        #                ), dim = 1,keepdim =True)\n        #flow1_grad = self.padder1(flow1_grad)\n        # if input1.is_cuda:\n        #     err = gradcheck(self.calc_weight1,(Variable(input1.data,requires_grad=True),\n        #                                        Variable(input2 .data,requires_grad=True),\n        #                                        Variable(input3.data,requires_grad= True),\n        #                                         # Variable(flow1_grad.data,requires_grad=True)\n        #                                        ), eps=1e-3)\n        #     print(err)\n            # pass\n            #input1.requires_grad = True\n            #input2.requires_grad = True\n\n        flow_weight1 = self.calc_weight1(input1,input2,input3 )\n        # if flow1_grad.is_cuda:\n            # err = 
gradcheck(self.interpolate1,(Variable(input1.data,requires_grad=True),\n            #                                    Variable(input3.data,requires_grad= True),\n            #                                     Variable(flow_weight1.data,requires_grad=True)), eps=1e-3)\n            # err = gradcheck(self.interpolate1_1, (Variable(input3.data,requires_grad=True),\n            #                                       Variable(flow_weight1.data, requires_grad =True)),eps=1e-3)\n            # err = gradcheck(self.interpolate_R1_1,(input3,),eps=1e-3)\n            # print(err)\n        # print(flow_weight1[0,:,50:100,50:100])\n        p1 = self.interpolate1(input1, input3, flow_weight1)\n        p1_r,p1_g,p1_b = torch.split(p1,1,dim=1)\n        pw1 = self.interpolate1_1(input3, flow_weight1)\n        i1_r,i1_g,i1_b = (p1_r)/(pw1+self.threshold),\\\n                         (p1_g)/(pw1+self.threshold), \\\n                         (p1_b)/(pw1+self.threshold)\n        #if not self.training:\n        #    i1_r[pw1<=10*self.threshold], i1_g[pw1<=10*self.threshold], i1_b[pw1<=10*self.threshold] = 0,0,0\n        #i1 = torch.cat((i1_r,i1_g,i1_b),dim=1\n        #r1 = self.interpolate_R1(input3, flow_weight1)\n        r1 = pw1\n        rw1 = self.interpolate_R1_1(input3)\n        w1 = (r1)/(rw1+self.threshold)\n        # if torch.sum(w1 <= 0).cpu().data.numpy()[0] > 0:\n        #   pass\n            # print(\"there are holes in i1 :\" )\n            # print(torch.sum(w1 <= 0))\n        #if not self.training:\n        #    w1[rw1 <=10*self.threshold] = 0\n\n        # flow2_grad = torch.sum(torch.sqrt(\n        #                     (input4[:, :, :-1, :-1] - input4[:, :, 1:, :-1]) ** 2 +\n        #                     (input4[:, :, :-1, :-1] - input4[:, :, :-1, 1:]) ** 2 + epsilon * epsilon\n        #                 ), dim = 1,keepdim=True)\n        # flow2_grad = self.padder2(flow2_grad)\n\n        flow_weight2 = self.calc_weight2(input2,input1,input4)\n        p2 = 
self.interpolate2(input2, input4, flow_weight2)\n        p2_r,p2_g,p2_b = torch.split(p2,1,dim=1)\n        pw2 = self.interpolate2_1(input4, flow_weight2)\n        i2_r,i2_g,i2_b = (p2_r)/(pw2+self.threshold),\\\n                         (p2_g)/(pw2+self.threshold), \\\n                         (p2_b)/(pw2+self.threshold)\n        #if not self.training:\n        #    i2_r[pw2<=10*self.threshold], i2_g[pw2<=10*self.threshold], i2_b[pw2<=10*self.threshold] = 0,0,0\n        #i2 = torch.cat((p2[:,0,...] /pw2, p2[:,1,...] /pw2, p2[:,2,...]/pw2),dim=1)\n        #r2 = self.interpolate_R2(input4, flow_weight2)\n        r2 = pw2\n        rw2 = self.interpolate_R2_1(input4)\n        w2 = (r2)/(rw2+self.threshold)\n        #if torch.sum(w2 <= 0).cpu().data.numpy()[0] > 0:\n        #    pass\n        #    print(\"there are holes in i2 :\" )\n        #    print(torch.sum(w2 <= 0))\n        #if not self.training:\n        #    w2[rw2 <= 10*self.threshold] = 0\n        # i = (i1 * w1 + i2 * w2 )/ (w1 + w2)\n\n        w = w1+w2\n        i_r = (i1_r * w1 + i2_r * w2)/ (w + self.threshold) #(w1 + w2)\n        i_g = (i1_g * w1 + i2_g * w2)/ (w + self.threshold) #(w1 + w2)\n        i_b = (i1_b * w1 + i2_b * w2)/ (w + self.threshold) #(w1 + w2)\n        #if torch.sum(w <= 0).cpu().data.numpy()[0] > 0:\n        #    print(\"there are holes in i :\")\n        #    print(torch.sum(w <= 0))\n        if not self.training:\n            i_r[w<= 10*self.threshold], i_g[w<=10*self.threshold], i_b[w<=10*self.threshold] = 0,0,0\n            w[w <= 10 *self.threshold] = 0\n        i = torch.cat((i_r,i_g,i_b),dim=1)\n        return i\n"
  },
  {
    "path": "my_package/FilterInterpolation/__init__.py",
    "content": "from .FilterInterpolationModule import *\n"
  },
  {
    "path": "my_package/FilterInterpolation/filterinterpolation_cuda.cc",
    "content": "#include <torch/torch.h>\r\n#include <ATen/ATen.h>\r\n#include <stdio.h>\r\n#include <iostream>\r\n#include <ATen/cuda/CUDAContext.h> //works for 1.0.0\r\n\r\n#include \"filterinterpolation_cuda_kernel.cuh\"\r\n\r\n\r\n\r\nint FilterInterpolationLayer_gpu_forward(\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  input3,\r\n\t\tat::Tensor&  output\r\n\r\n\t\t)\r\n\t\t{\r\n\tint error = 1 ;\r\n\r\n\tint channel = input1.size( 1);\r\n\t//if(channel!=3) return error;\r\n\tint batch = input1.size(0);\r\n\tif(input2.size( 0) != batch) return error;\r\n\tif(input2.size(1) != 2) return error;\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\tif(input2.size(2) != h) return error;// to add some checkpoint\r\n\tif(input2.size(3) != w) return error;\r\n\r\n    int filter_size2 = input3.size( 1);\r\n    int filter_size = (int) sqrt((float) filter_size2);\r\n//    printf(\"filter size is: %d,or %f\", filter_size, sqrt((float)filter_size2));\r\n\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint input2_w_stride = input2.stride(3);\r\n\r\n    int input3_b_stride = input3.stride(0);\r\n\tint input3_c_stride = input3.stride(1);\r\n\tint input3_h_stride = input3.stride(2);\r\n\tint input3_w_stride = input3.stride(3);\r\n//    printf(\"filter tensor shape: %d,%d,%d,%d\\n\", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride);\r\n\r\n\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n    if(input1_w_stride !=1) return error;\r\n\tif(input2_w_stride !=1) return error;\r\n    if(input3_w_stride !=1) return error;\r\n\tif(input1_b_stride != output.stride(0)) return error;\r\n\tif(input1_c_stride != output.stride(1)) return 
error;\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, output);\r\n\r\n\r\n\terror = FilterInterpolationLayer_gpu_forward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement,w,h,channel,batch, filter_size,\r\n\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\t\t\tinput3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,\r\n\r\n\r\n\t\t\tinput1,\r\n\t\t\tinput2,\r\n\t\t\tinput3,\r\n\t\t\toutput);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\r\n\treturn error;\r\n\r\n\t\t}\r\nint FilterInterpolationLayer_gpu_backward(\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  input3,\r\n\t\tat::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1,\r\n\t\tat::Tensor&  gradinput2,\r\n\t\tat::Tensor&  gradinput3\r\n\t\t)\r\n\t\t{\r\n\r\n\r\n    int error = 1 ;\r\n\tint channel = input1.size( 1);\r\n\t//if(channel!=3) return error;\r\n\tint batch = input1.size(0);\r\n\tif(input2.size( 0) != batch) return error;\r\n\tif(input2.size(1) != 2) return error;\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\tif(input2.size(2) != h) return error;// to add some checkpoint\r\n\tif(input2.size(3) != w) return error;\r\n\r\n\r\n    int filter_size2 = input3.size( 1);\r\n    int filter_size = (int) sqrt((float) filter_size2);\r\n//    printf(\"filter size is: %d,or %f\", filter_size, sqrt((float)filter_size2));\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint input2_w_stride = input2.stride(3);\r\n\r\n    int input3_b_stride 
= input3.stride(0);\r\n\tint input3_c_stride = input3.stride(1);\r\n\tint input3_h_stride = input3.stride(2);\r\n\tint input3_w_stride = input3.stride(3);\r\n//    printf(\"filter tensor shape: %d,%d,%d,%d\\n\", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride);\r\n\r\n\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n\tif(input1_w_stride !=1) return error;\r\n\tif(input2_w_stride !=1) return error;\r\n    if(input3_w_stride !=1) return error;\r\n    if(input1_b_stride != gradinput1.stride(0)) return error;\r\n\tif(input2_b_stride != gradinput2.stride(0)) return error;\r\n\tif(input1_c_stride != gradinput1.stride(1)) return error;\r\n\tif(input2_c_stride != gradinput2.stride(1)) return error;\r\n\tif(input3_c_stride != gradinput3.stride(1)) return error;\r\n\r\n//    printf(\"GPU backward: %d,%d,%d,%d\\n\", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride);\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, gradoutput);\r\n\r\n\terror  = FilterInterpolationLayer_gpu_backward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,batch, filter_size,\r\n\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\t\t\tinput3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,\r\n\r\n\t\t\tinput1,\r\n\t\t\tinput2,\r\n\t\t\tinput3,\r\n\t\t\tgradoutput,\r\n\t\t\tgradinput1,\r\n\t\t\tgradinput2,\r\n\t\t\tgradinput3\r\n\t\t\t);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\r\n\treturn error;\r\n}\r\n\r\n\r\n\r\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\r\n  m.def(\"FilterInterpolationLayer_gpu_forward\", &FilterInterpolationLayer_gpu_forward, \"FilterInterpolation forward (CUDA)\");\r\n  m.def(\"FilterInterpolationLayer_gpu_backward\", 
&FilterInterpolationLayer_gpu_backward, \"FilterInterpolation backward (CUDA)\");\r\n}\r\n"
  },
  {
    "path": "my_package/FilterInterpolation/filterinterpolation_cuda_kernel.cu",
    "content": "#include <stdio.h>\r\n\r\n#include \"filterinterpolation_cuda_kernel.cuh\"\r\n\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/NativeFunctions.h>\r\n#include <ATen/Dispatch.h>\r\n#include <ATen/cuda/CUDAApplyUtils.cuh>\r\n\r\n\r\n#define min(a,b) ((a<b)?(a):(b))\r\n#define max(a,b) ((a>b)?(a):(b))\r\n\r\n#define DEBUG (0)\r\n#ifndef BLOCKDIMX\r\n#define BLOCKDIMX (32)\r\n#endif\r\n#ifndef BLOCKDIMY\r\n#define BLOCKDIMY (16)\r\n#endif\r\nusing at::Half;\r\n\r\n\r\n\r\n\r\n//forward path of our layer\r\ntemplate <typename scalar_t>\r\n__global__ void FilterInterpolationLayer_gpu_forward_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, const int filter_size,\r\n\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__    input1,    \t\tconst scalar_t* __restrict__    input2,    \tconst scalar_t* __restrict__    input3, \tscalar_t*   output\r\n\r\n\t\t)\r\n{\r\n\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\t//only use one dimensioon of the grid and block\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off = batch_i * input1_b_stride;\r\n\r\n\r\n\t//    __syncthreads();\r\n//\tconst float fillvalue =0.0f;\r\n\r\n\tif( withinXbounds && withinYbounds) 
{\r\n\r\n\t\tfloat fx = input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i  ];\r\n\t\tfloat fy = input2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i  ];\r\n\r\n\t\tfloat x2 = (float)(w_i) + fx;\r\n\t\tfloat y2 = (float)(h_i) + fy;\r\n\r\n\r\n\t\tif(x2 >= 0.0f && y2 >=0.0f && x2 <= (float)(w -1) && y2 <= (float)(h-1)\r\n            && fabs(fx) < (float)(w)/2.0f && fabs(fy) < (float)(h)/2.0f){\r\n\t\t\tint ix2_L = int(x2) + 1 - (int)(filter_size / 2);\r\n\t\t\tint iy2_T = int(y2) + 1 - (int)(filter_size / 2);\r\n\t\t\tint ix2_R = ix2_L + filter_size;\r\n\t\t\tint iy2_B = iy2_T + filter_size;\r\n\r\n            float alpha = x2 - (int)(x2);\r\n            float beta = y2 - (int)(y2);\r\n\r\n\r\n\t\t\t//TODO: known bug: if iy2_B or ix2_R falls outside the border, there are not enough pixels to warp the target one.\r\n\t\t\tfor (int c_i = 0 ; c_i < channel ; c_i++){\r\n\r\n                float TL = 0.0f;\r\n                for(int filter_j = iy2_T; filter_j <= (int)(y2); filter_j ++){\r\n                    int _filter_j = min(max(0, filter_j), h - 1);\r\n                    for( int filter_i = ix2_L; filter_i <= (int) ( x2) ; filter_i ++ ){\r\n                    int _filter_i = min(max(0, filter_i ), w - 1);\r\n                    TL += input1[off + c_i *  input1_c_stride +  _filter_j * input1_h_stride + _filter_i ] *\r\n\t\t\t\t\t\t\tinput3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] ;\r\n                    }\r\n                }\r\n\r\n                float TR = 0.0f;\r\n                for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){\r\n                    int _filter_j = min(max(0, filter_j),h - 1); // only used for input1\r\n                for (int filter_i =  (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){\r\n                    int _filter_i = min(max(0, filter_i),w - 
1);// only used for input1\r\n                    TR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *\r\n                        input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];\r\n                }\r\n                }\r\n\r\n                float BL = 0.0f;\r\n                for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){\r\n                    int _filter_j = min(max(0, filter_j),h - 1); // only used for input1\r\n                for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){\r\n                    int _filter_i = min(max(0, filter_i),w - 1);// only used for input1\r\n                    BL += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *\r\n                        input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];\r\n                }\r\n                }\r\n\r\n                float BR = 0.0f;\r\n                for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){\r\n                    int _filter_j = min(max(0, filter_j),h - 1); // only used for input1\r\n                for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){\r\n                    int _filter_i = min(max(0, filter_i),w - 1);// only used for input1\r\n                    BR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *\r\n                        input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];\r\n                }\r\n                }\r\n\r\n                output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i ] =\r\n                            (1-alpha)*(1-beta)*TL +\r\n\t\t\t\t\t\t\talpha*(1-beta)*TR 
+\r\n\t\t\t\t\t\t\t(1-alpha)*beta*BL +\r\n\t\t\t\t\t\t\talpha*beta*BR;\r\n\r\n//\t\t\t\t\tfor( int filter_i = ix2_L; filter_i < ix2_R ; filter_i ++ ){\r\n//\t\t\t\t\t\tint _filter_i = min(max(0, filter_i),w - 1);\r\n//\t\t\t\t\t\toutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i ] +=\r\n//\t\t\t\t\t\t\tinput1[off + c_i *  input1_c_stride +  _filter_j * input1_h_stride + _filter_i ] *\r\n//\t\t\t\t\t\t\tinput3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] *\r\n////\t\t\t\t\t\t\texp( -(fabs((float) filter_j - y2) + fabs((float) filter_i - x2)) / (float)(filter_size)); // the distance weight\r\n//\t\t\t\t\t\t\texp( -(fabs((float) filter_j - y2) + fabs((float) filter_i - x2)) ); // the distance weight\r\n//\r\n////\t\t\t\t\t\t\tif(w_i == 141 && h_i == 316 && c_i == 0 ){\r\n////printf(\"gpu: %f, %f,%f,%f\\n\",input1[off + c_i *  input1_c_stride +  _filter_j * input1_h_stride + _filter_i ] ,\r\n////input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i],\r\n////exp( -(fabs((float) filter_j - y2) + fabs((float) filter_i - x2)) / (float)(filter_size)),\r\n////output[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i ]\r\n//// );\r\n////}\r\n//\r\n//\t\t\t\t\t}\r\n//\t\t\t\t}\r\n\t\t\t}\r\n\t\t} else{\r\n\t\t\t//the warping data is out of range, so we copy the input1 pixel through unchanged (no zero fill)\r\n\t\t\tfor(int c_i = 0 ;  c_i < channel; c_i ++){\r\n\t\t\t\toutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] = input1[off + c_i* input1_c_stride+ h_i * input1_h_stride + w_i];\r\n\t\t\t}\r\n\t\t}\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\n\r\ntemplate <typename scalar_t>\r\n__global__ void FilterInterpolationLayer_gpu_backward_kernelfunc(\r\n\t\tconst int nElement, \t   const int w, \t\tconst int h, \t\tconst int channel, \tconst int filter_size,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, 
const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__      input1,        \t\tconst scalar_t* __restrict__      input2,\t\tconst scalar_t* __restrict__      input3,\r\n\t\tscalar_t* gradoutput,    \t\tscalar_t*  gradinput1,  \t\tscalar_t*  gradinput2,  \t\tscalar_t*  gradinput3\r\n\t\t)\r\n\t\t{\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off  = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n\r\n\tif(withinXbounds && withinYbounds){\r\n\r\n\t\tfloat fx = input2[batch_i * input2_b_stride +  0 * input2_c_stride + h_i * input2_h_stride + w_i];\r\n\t\tfloat fy = input2[batch_i * input2_b_stride +  1 * input2_c_stride + h_i * input2_h_stride + w_i];\r\n\r\n\t\tfloat x2 = float(w_i) + fx;\r\n\t\tfloat y2 = float(h_i) + fy;\r\n\r\n\t\tif(x2 >= 0.0f  && y2 >= 0.0f && x2 <= (float)(w - 1) && y2 <= (float)(h -1)\r\n            && fabs(fx) < (float)(w)/2.0f && fabs(fy) < (float)(h)/2.0f){\r\n\t\t\tint ix2_L = int(x2) + 1 - (int) (filter_size/2);\r\n\t\t\tint iy2_T = int(y2) + 1 - (int) (filter_size/2);\r\n\t\t\tint ix2_R = ix2_L + filter_size;\r\n\t\t\tint iy2_B = iy2_T + filter_size;\r\n\r\n            float alpha = x2 - (int)(x2);\r\n            float beta = y2  - (int)(y2);\r\n\t\t\t/***\r\n\t\t\t  Step 1: calculate the 
gradients for input1, i.e. the input image;\r\n\t\t\t ***/\r\n            /***\r\n              STEP 3: calculate the gradients for input3, i.e. the filter\r\n             ***/\r\n             /***\r\n                Step 1 and Step 3 are simultaneously computed\r\n             ***/\r\n\t\t\tfor (int c_i = 0 ; c_i < channel; c_i++){\r\n\r\n\t\t\t\tfloat gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];\r\n\r\n                float TL_grad = gradoutput_value * (1-alpha ) * (1-beta);\r\n                for(int filter_j = iy2_T; filter_j <= (int) (y2) ; filter_j ++ ){\r\n                    int _filter_j = min(max(0, filter_j), h - 1);\r\n                    for (int filter_i = ix2_L   ; filter_i <= (int)(x2) ; filter_i ++){\r\n                    int _filter_i = min(max(0, filter_i), w - 1);\r\n                    atomicAdd( &gradinput1[off +c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ],\r\n                                TL_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) *\r\n                                                                input3_c_stride + h_i * input3_h_stride + w_i]);\r\n                    atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) *\r\n                                                                        input3_c_stride + h_i * input3_h_stride + w_i],\r\n                                TL_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]);\r\n\r\n                    }\r\n                }\r\n\r\n                float TR_grad= gradoutput_value * alpha * ( 1- beta);\r\n                for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){\r\n                    int _filter_j = min(max(0, filter_j),h - 1); // only used for input1\r\n                for (int filter_i =  (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){\r\n        
            int _filter_i = min(max(0, filter_i),w - 1);// only used for input1\r\n\r\n                    atomicAdd( &gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ],\r\n                                TR_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) *\r\n                                                                input3_c_stride + h_i * input3_h_stride + w_i]);\r\n                    atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) *\r\n                                                                        input3_c_stride + h_i * input3_h_stride + w_i],\r\n                                TR_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]);\r\n\r\n                    }\r\n                    }\r\n\r\n                   float BL_grad = gradoutput_value * ( 1 - alpha ) * beta;\r\n                   for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){\r\n                        int _filter_j = min(max(0, filter_j),h - 1); // only used for input1\r\n                        for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){\r\n                            int _filter_i = min(max(0, filter_i),w - 1);// only used for input1\r\n\r\n                        atomicAdd( &gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ],\r\n                                    BL_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) *\r\n                                                                    input3_c_stride + h_i * input3_h_stride + w_i]);\r\n                        atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) *\r\n                                                                            input3_c_stride + h_i * 
input3_h_stride + w_i],\r\n                                    BL_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]);\r\n\r\n                    }\r\n                    }\r\n\r\n                float BR_grad = gradoutput_value * alpha * beta;\r\n                 for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){\r\n                    int _filter_j = min(max(0, filter_j),h - 1); // only used for input1\r\n                    for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){\r\n                        int _filter_i = min(max(0, filter_i),w - 1);// only used for input1\r\n                        atomicAdd( &gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i ],\r\n                                    BR_grad * input3[batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) *\r\n                                                                    input3_c_stride + h_i * input3_h_stride + w_i]);\r\n                        atomicAdd( & gradinput3[batch_i * input3_b_stride + ((filter_j - iy2_T ) * filter_size + (filter_i - ix2_L)) *\r\n                                                                            input3_c_stride + h_i * input3_h_stride + w_i],\r\n                                    BR_grad * input1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i]);\r\n                        }\r\n                }\r\n//\t\t\t\tfor ( int filter_j = iy2_T; filter_j < iy2_B ; filter_j ++ ){\r\n//\t\t\t\t\tint _filter_j = min(max(0, filter_j),  h - 1);\r\n//\t\t\t\t\tfor( int filter_i = ix2_L; filter_i< ix2_R ; filter_i++){\r\n//\t\t\t\t\t\tint _filter_i = min(max(0,filter_i), w - 1);\r\n//\t\t\t\t\t\tatomicAdd( & gradinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i],\r\n//\t\t\t\t\t\t\t\tgradoutput_value *\r\n//\t\t\t\t\t\t\t\tinput3 [batch_i * input3_b_stride + ((filter_j  - iy2_T) * filter_size + 
(filter_i - ix2_L))* input3_c_stride + h_i * input3_h_stride + w_i] *\r\n////\t\t\t\t\t\t\t\texp( -(fabs((float)filter_j - y2) + fabs((float)filter_i - x2))/(float)filter_size)\r\n//                                exp( -(fabs((float)filter_j - y2) + fabs((float)filter_i - x2)))\r\n//\r\n//\t\t\t\t\t\t\t );\r\n//\t\t\t\t\t}\r\n//\t\t\t\t}\r\n\r\n\t\t\t}\r\n\r\n\t\t\t/***\r\n\t\t\t  Step 2: calculate the gradients for input2, i.e., the optical flow,\r\n\t\t\t  STEP 2.1: for the x/horizontal direction.\r\n\t\t\t ***/\r\n            float gamma  =  1.0f - beta; //iy2_B - y2;\r\n\t\t\tfloat bot_diff = 0.0f;\r\n\t\t\tfor(int c_i =0 ; c_i< channel; c_i ++ ){\r\n\t\t\t\tfloat gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];\r\n\r\n    float TL = 0.0f;\r\n                for(int filter_j = iy2_T; filter_j <= (int)(y2); filter_j ++){\r\n                    int _filter_j = min(max(0, filter_j), h - 1);\r\n                    for( int filter_i = ix2_L; filter_i <= (int) ( x2) ; filter_i ++ ){\r\n                    int _filter_i = min(max(0, filter_i ), w - 1);\r\n                    TL += input1[off + c_i *  input1_c_stride +  _filter_j * input1_h_stride + _filter_i ] *\r\n\t\t\t\t\t\t\tinput3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] ;\r\n                    }\r\n                }\r\n\r\n                float TR = 0.0f;\r\n                for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){\r\n                    int _filter_j = min(max(0, filter_j),h - 1); // only used for input1\r\n                for (int filter_i =  (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){\r\n                    int _filter_i = min(max(0, filter_i),w - 1);// only used for input1\r\n                    TR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *\r\n                        input3 [batch_i * input3_b_stride + 
((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];\r\n                }\r\n                }\r\n\r\n                float BL = 0.0f;\r\n                for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){\r\n                    int _filter_j = min(max(0, filter_j),h - 1); // only used for input1\r\n                for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){\r\n                    int _filter_i = min(max(0, filter_i),w - 1);// only used for input1\r\n                    BL += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *\r\n                        input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];\r\n                }\r\n                }\r\n\r\n                float BR = 0.0f;\r\n                for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){\r\n                    int _filter_j = min(max(0, filter_j),h - 1); // only used for input1\r\n                for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){\r\n                    int _filter_i = min(max(0, filter_i),w - 1);// only used for input1\r\n                    BR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *\r\n                        input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];\r\n                }\r\n                }\r\n\r\n\t            float temp = 0.0f;\r\n                temp += gamma * (TR - TL);\r\n                temp += (1-gamma) * (BR - BL);\r\n                bot_diff += gradoutput_value * temp;\r\n//\t\t\t\tfor( int filter_j = iy2_T; filter_j< iy2_B; filter_j++){\r\n//\t\t\t\t\tint _filter_j = min(max(0, filter_j) , h - 1);\r\n//\t\t\t\t\tfor( int filter_i = ix2_L; filter_i< ix2_R; filter_i 
++){\r\n//\t\t\t\t\t\tint _filter_i = min(max(0,filter_i), w-1);\r\n//\r\n//\t\t\t\t\t\tbot_diff +=\r\n//\t\t\t\t\t\t\tgradoutput_value *\r\n//\t\t\t\t\t\t\tinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *\r\n//\t\t\t\t\t\t\tinput3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L))* input3_c_stride + h_i * input3_h_stride + w_i   ] *\r\n////\t\t\t\t\t\t\texp( - ( fabs((float) filter_j - y2 ) + fabs((float) filter_i - x2))/ (float)filter_size) *\r\n////\t\t\t\t\t\t\t((float) filter_i > x2 ? 1.0f : -1.0f) / (float)filter_size;\r\n//                        \texp( - ( fabs((float) filter_j - y2 ) + fabs((float) filter_i - x2))) *\r\n//\t\t\t\t\t\t\t((float) filter_i > x2 ? 1.0f : -1.0f);\r\n//\t\t\t\t\t}\r\n//\t\t\t\t}\r\n\t\t\t}\r\n\t\t\t//the gradients of the x direction/ horizontal direction\r\n\t\t\tgradinput2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i] = bot_diff;\r\n\r\n\t\t\t/***\r\n\t\t\t  STEP 2.2: for the y/vertical direction.\r\n\t\t\t ***/\r\n            gamma =  1.0f - alpha; //ix2_R -x2;\r\n\t\t\tbot_diff = 0.0f;\r\n\t\t\tfor(int c_i = 0 ; c_i < channel; c_i ++ ){\r\n\t\t\t\tfloat gradoutput_value = gradoutput [ off + c_i * input1_c_stride + h_i * input1_h_stride +w_i];\r\n\r\n                float TL = 0.0f;\r\n                for(int filter_j = iy2_T; filter_j <= (int)(y2); filter_j ++){\r\n                    int _filter_j = min(max(0, filter_j), h - 1);\r\n                    for( int filter_i = ix2_L; filter_i <= (int) ( x2) ; filter_i ++ ){\r\n                    int _filter_i = min(max(0, filter_i ), w - 1);\r\n                    TL += input1[off + c_i *  input1_c_stride +  _filter_j * input1_h_stride + _filter_i ] *\r\n\t\t\t\t\t\t\tinput3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i] ;\r\n                    }\r\n                }\r\n\r\n                
float TR = 0.0f;\r\n                for (int filter_j = iy2_T; filter_j <= (int) (y2); filter_j ++ ){\r\n                    int _filter_j = min(max(0, filter_j),h - 1); // only used for input1\r\n                for (int filter_i =  (int) (x2) + 1 ; filter_i < ix2_R; filter_i ++ ){\r\n                    int _filter_i = min(max(0, filter_i),w - 1);// only used for input1\r\n                    TR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *\r\n                        input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];\r\n                }\r\n                }\r\n\r\n                float BL = 0.0f;\r\n                for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){\r\n                    int _filter_j = min(max(0, filter_j),h - 1); // only used for input1\r\n                for (int filter_i = ix2_L; filter_i <= (int) (x2); filter_i ++ ){\r\n                    int _filter_i = min(max(0, filter_i),w - 1);// only used for input1\r\n                    BL += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *\r\n                        input3 [batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];\r\n                }\r\n                }\r\n\r\n                float BR = 0.0f;\r\n                for (int filter_j = (int) (y2) + 1; filter_j < iy2_B; filter_j ++ ){\r\n                    int _filter_j = min(max(0, filter_j),h - 1); // only used for input1\r\n                for (int filter_i = (int) (x2) + 1; filter_i < ix2_R; filter_i ++ ){\r\n                    int _filter_i = min(max(0, filter_i),w - 1);// only used for input1\r\n                    BR += input1 [off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *\r\n                        input3 [batch_i * input3_b_stride + ((filter_j - 
iy2_T) * filter_size + (filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i];\r\n                }\r\n                }\r\n\r\n                float temp = 0.0f;\r\n                temp += gamma * (BL - TL);\r\n                temp += (1.0f - gamma) * ( BR - TR);\r\n                bot_diff += gradoutput_value * temp;\r\n\r\n//\t\t\t\tfor( int filter_j = iy2_T; filter_j < iy2_B; filter_j ++ ){\r\n//\t\t\t\t\tint _filter_j = min(max(0, filter_j), h - 1);\r\n//\t\t\t\t\tfor( int filter_i = ix2_L; filter_i < ix2_R; filter_i ++){\r\n//\t\t\t\t\t\tint _filter_i = min(max(0, filter_i), w - 1);\r\n//\r\n//\t\t\t\t\t\tbot_diff +=\r\n//\t\t\t\t\t\t\tgradoutput_value *\r\n//\t\t\t\t\t\t\tinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *\r\n//\t\t\t\t\t\t\tinput3 [batch_i * input3_b_stride +((filter_j - iy2_T) * filter_size + ( filter_i - ix2_L)) * input3_c_stride + h_i * input3_h_stride + w_i ] *\r\n////\t\t\t\t\t\t\texp( - (fabs((float) filter_j - y2) + fabs((float) filter_i - x2))/ (float)filter_size  ) *\r\n////\t\t\t\t\t\t\t((float) filter_j > y2 ? 1.0f : - 1.0f ) / (float)filter_size;\r\n//\t\t\t\t\t\t\texp( - (fabs((float) filter_j - y2) + fabs((float) filter_i - x2))  ) *\r\n//\t\t\t\t\t\t\t((float) filter_j > y2 ? 1.0f : - 1.0f );\r\n//\t\t\t\t\t}\r\n//\t\t\t\t}\r\n\t\t\t}\r\n\t\t\tgradinput2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i]= bot_diff;\r\n\t\t\t/***\r\n\t\t\t  STEP 3: calculate the gradients for input3, i.e. 
the filter\r\n\t\t\t ***/\r\n//\t\t\tfor(int c_i  = 0 ; c_i <channel ; c_i ++ ){\r\n//\t\t\t\tfloat gradoutput_value = gradoutput[ off + c_i * input1_c_stride + h_i * input1_h_stride + w_i ];\r\n//\t\t\t\tfor( int filter_j=  iy2_T ; filter_j < iy2_B; filter_j ++ ){\r\n//\t\t\t\t\tint _filter_j = min(max(0, filter_j), h -1 );\r\n//\t\t\t\t\tfor ( int filter_i  = ix2_L; filter_i < ix2_R; filter_i ++ ){\r\n//\t\t\t\t\t\tint _filter_i  = min(max(0, filter_i ), w - 1);\r\n//\r\n//\t\t\t\t\t\tgradinput3 [  batch_i * input3_b_stride + ((filter_j - iy2_T) * filter_size + (filter_i - ix2_L  ) ) * input3_c_stride + h_i * input3_h_stride + w_i] +=\r\n//\t\t\t\t\t\t\tgradoutput_value *\r\n//\t\t\t\t\t\t\tinput1[off + c_i * input1_c_stride + _filter_j * input1_h_stride + _filter_i] *\r\n////\t\t\t\t\t\t\texp( -(fabs((float) filter_j - y2 ) + fabs((float) filter_i - x2))/ (float)filter_size);\r\n//\t\t\t\t\t\t\texp( -(fabs((float) filter_j - y2 ) + fabs((float) filter_i - x2)));\r\n//\t\t\t\t\t}\r\n//\t\t\t\t}\r\n//\t\t\t}\r\n\t\t}\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\n\r\nint FilterInterpolationLayer_gpu_forward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, \t\tconst int batch, const  int filter_size,\r\n\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\r\n\t\tat::Tensor&  input1,    \t\tat::Tensor&  input2,    \tat::Tensor&  input3, \tat::Tensor&  output\r\n\r\n\t\t)\r\n{\r\n\tint error = 1 ;\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\r\n\t//\t\tblockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = 
dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n\t//extract the data of CudaTensor and use kernel to calculate.\r\n\t\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"DepthFlowProjection_gpu_backward\", ([&] {\r\nFilterInterpolationLayer_gpu_forward_kernelfunc<<<grid,block,0, stream >>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,filter_size,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\t\t\tinput3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),input2.data<scalar_t>(),input3.data<scalar_t>(), output.data<scalar_t>()\r\n\t\t\t);\r\n \t\t\t\t\t}));\r\n\r\n\t//\t\t\tTHCudaCheck(cudaGetLastError());\r\n\tcudaError_t err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateOutput: %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n}\r\n\r\nint FilterInterpolationLayer_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,    \t\tconst int h,    \t\tconst int channel,  \t\tconst int batch,    \t\tconst int filter_size,\r\n\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\r\n\t\tat::Tensor&  input1,        \t\tat::Tensor&  input2,\t\tat::Tensor&  input3,\r\n\r\n\t\tat::Tensor&  gradoutput,    \t\tat::Tensor&  
gradinput1,  \t\tat::Tensor&  gradinput2,  \t\tat::Tensor&  gradinput3\r\n\t\t)\r\n{\r\n\r\n\tint error = 1 ;\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\r\n\t//blockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n\r\n//    cudaMemset((void*)gradinput1, 0, input1_b_stride * batch * sizeof(float));\r\n//    cudaMemset((void*)gradinput2, 0, input2_b_stride * batch * sizeof(float));\r\n//    cudaMemset((void*)gradinput3, 0, input3_b_stride * batch * sizeof(float));\r\n\r\n\t\t\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"DepthFlowProjection_gpu_backward\", ([&] {\r\nFilterInterpolationLayer_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,filter_size,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\t\t\tinput3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,\r\n\r\n\r\n\t\t\tinput1.data<scalar_t>(), \t\t\tinput2.data<scalar_t>(),         input3.data<scalar_t>(),  \t\t\tgradoutput.data<scalar_t>(),\r\n\t\t\tgradinput1.data<scalar_t>(), \t\t\tgradinput2.data<scalar_t>(),     gradinput3.data<scalar_t>()\r\n\t\t\t);\r\n \t\t\t\t\t}));\r\n\r\n\tcudaError_t err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateGradInput %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n}\r\n"
  },
  {
    "path": "my_package/FilterInterpolation/filterinterpolation_cuda_kernel.cuh",
    "content": "#pragma once\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/Context.h>\r\n#include <cuda_runtime.h>\r\n\r\nint FilterInterpolationLayer_gpu_forward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, \t\tconst int batch, const  int filter_size,\r\n\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\r\n\t\tat::Tensor& input1,    \t\tat::Tensor& input2,    \tat::Tensor& input3, \tat::Tensor& output\r\n\r\n\t\t);\r\n\r\nint FilterInterpolationLayer_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,    \t\tconst int h,    \t\tconst int channel,  \t\tconst int batch,    \t\tconst int filter_size,\r\n\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\r\n\t\tat::Tensor& input1,        \t\tat::Tensor& input2,\t\tat::Tensor& input3,\r\n\r\n\t\tat::Tensor& gradoutput,    \t\tat::Tensor& gradinput1,  \t\tat::Tensor& gradinput2,  \t\tat::Tensor& gradinput3\r\n\t\t);\r\n"
  },
  {
    "path": "my_package/FilterInterpolation/setup.py",
    "content": "#!/usr/bin/env python3\nimport os\nimport torch\n\nfrom setuptools import setup, find_packages\nfrom torch.utils.cpp_extension import BuildExtension, CUDAExtension\n\nfrom compiler_args import nvcc_args, cxx_args\n\nsetup(\n    name='filterinterpolation_cuda',\n    ext_modules=[\n        CUDAExtension('filterinterpolation_cuda', [\n            'filterinterpolation_cuda.cc',\n            'filterinterpolation_cuda_kernel.cu'\n        ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args})\n    ],\n    cmdclass={\n        'build_ext': BuildExtension\n    })\n"
  },
  {
    "path": "my_package/FlowProjection/FlowProjectionLayer.py",
    "content": "# this is for wrapping the customized layer\nimport torch\nfrom torch.autograd import Function\nimport flowprojection_cuda as my_lib\n\n#Please check how the STN FUNCTION is written :\n#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py\n#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py\n\n# Autograd wrapper around the flowprojection_cuda extension.\n# forward/backward are static, so call it as\n#     FlowProjectionLayer.apply(input1, requires_grad)\n# forward returns the projected flow (same size as input1); backward returns\n# (gradinput1, None) because requires_grad is a plain flag without a gradient.\nclass FlowProjectionLayer(Function):\n    def __init__(self,requires_grad):\n        super(FlowProjectionLayer,self).__init__()\n        self.requires_grad = requires_grad\n\n    @staticmethod\n    def forward(ctx, input1, requires_grad):\n        assert(input1.is_contiguous())\n        # assert(input2.is_contiguous())\n        # self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it\n\n        # hole filling is only requested when no gradient is required\n        fillhole = 1 if requires_grad == False else 0\n        # if input1.is_cuda:\n        #     self.device = torch.cuda.current_device()\n        # else:\n        #     self.device = -1\n\n        # count = torch.zeros(input1.size(0),1,input1.size(2),input1.size(3)) # for accumulating the homography projections\n        # output = torch.zeros(input1.size())\n\n        if input1.is_cuda :\n            # output = output.cuda()\n            # count = count.cuda()\n            count = torch.cuda.FloatTensor().resize_(input1.size(0), 1, input1.size(2), input1.size(3)).zero_()\n            output = torch.cuda.FloatTensor().resize_(input1.size()).zero_()\n            err = my_lib.FlowProjectionLayer_gpu_forward(input1, count,output, fillhole)\n        else:\n            # count and output have to exist, zero-initialised and on the CPU,\n            # before they are handed to the CPU entry point.\n            # NOTE(review): confirm the extension actually exports the *_cpu_* functions.\n            count = torch.FloatTensor().resize_(input1.size(0), 1, input1.size(2), input1.size(3)).zero_()\n            output = torch.FloatTensor().resize_(input1.size()).zero_()\n            err = my_lib.FlowProjectionLayer_cpu_forward(input1, count, output, fillhole)\n        if err != 0:\n            print(err)\n        # output = output/count # to divide the counter\n\n        # exactly two tensors are saved; backward unpacks the same two\n        ctx.save_for_backward(input1, count)\n        ctx.fillhole = fillhole\n        # self.count = count #to keep this\n        # print(self.input1[0, 0, :10, :10])\n        # print(self.count[0, 0, :10, :10])\n        # print(self.input1[0, 0, -10:, -10:])\n        # print(self.count[0, 0, -10:, -10:])\n\n        # the function returns the output to its caller\n        return output\n\n    @staticmethod\n    def backward(ctx, gradoutput):\n        # print(\"Backward of Filter Interpolation Layer\")\n        # gradinput1 = input1.new().zero_()\n        # gradinput2 = input2.new().zero_()\n        # gradinput1 = torch.zeros(self.input1.size())\n\n        # forward saved (input1, count) only, so unpack exactly those two\n        input1, count = ctx.saved_tensors\n\n        if input1.is_cuda:\n            # print(\"CUDA backward\")\n            # gradinput1 = gradinput1.cuda(self.device)\n            gradinput1 = torch.cuda.FloatTensor().resize_(input1.size()).zero_()\n            err = my_lib.FlowProjectionLayer_gpu_backward(input1, count, gradoutput, gradinput1)\n            # print(err)\n            if err != 0 :\n                print(err)\n\n        else:\n            # print(\"CPU backward\")\n            # print(gradoutput)\n            gradinput1 = torch.FloatTensor().resize_(input1.size()).zero_()\n            err = my_lib.FlowProjectionLayer_cpu_backward(input1, count,  gradoutput, gradinput1)\n            # print(err)\n            if err != 0:\n                print(err)\n            # print(gradinput1)\n            # print(gradinput2)\n\n        # print(gradinput1)\n\n        # one gradient per forward argument: input1, requires_grad\n        return gradinput1, None\n\nclass FlowFillholelayer(Function):\n    def __init__(self):\n        super(FlowFillholelayer,self).__init__()\n\n    def forward(self, input1):\n        # assert(input1.is_contiguous())\n        # assert(input2.is_contiguous())\n        self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it\n\n        if input1.is_cuda:\n            self.device = torch.cuda.current_device()\n        else:\n            self.device = -1\n\n        # count = torch.zeros(input1.size(0),1,input1.size(2),input1.size(3)) # for accumulating the homography projections\n        output = torch.zeros(input1.size())\n\n        if input1.is_cuda :\n            output = output.cuda()\n            # count = count.cuda()\n            err = my_lib.FlowFillholelayer_gpu_forward(input1, output)\n        else:\n            # output = torch.cuda.FloatTensor(input1.data.size())\n            err = my_lib.FlowFillholelayer_cpu_forward(input1, output)\n        if err != 0:\n            print(err)\n        # output = output/count # to divide the counter\n\n        # self.count = count #to keep this\n        # print(self.input1[0, 0, :10, :10])\n        # print(self.count[0, 0, :10, :10])\n        # print(self.input1[0, 0, -10:, -10:])\n        # print(self.count[0, 0, -10:, -10:])\n\n        # the function returns the output to its caller\n        return output\n\n    #TODO: if there are multiple outputs of this function, then the order should be well considered?\n    # def backward(self, gradoutput):\n    #     # print(\"Backward of Filter Interpolation Layer\")\n    #     # gradinput1 = input1.new().zero_()\n    #     # gradinput2 = input2.new().zero_()\n    #     gradinput1 = torch.zeros(self.input1.size())\n    #     if self.input1.is_cuda:\n    #         # print(\"CUDA backward\")\n    #         gradinput1 = gradinput1.cuda(self.device)\n    #         err = my_lib.FlowProjectionLayer_gpu_backward(self.input1, self.count, gradoutput, gradinput1)\n    #         # print(err)\n    #         if err != 0 :\n    #             print(err)\n    #\n    #     else:\n    #         # print(\"CPU backward\")\n    #         # print(gradoutput)\n    #         err = my_lib.FlowProjectionLayer_cpu_backward(self.input1, self.count,  gradoutput, gradinput1)\n    #         # print(err)\n    #         if err != 0:\n    #             print(err)\n    #         # print(gradinput1)\n    #         # print(gradinput2)\n    #\n    #     # print(gradinput1)\n    #\n    #     return gradinput1"
  },
  {
    "path": "my_package/FlowProjection/FlowProjectionModule.py",
    "content": "# modules/FlowProjectionModule.py\nfrom torch.nn import Module\nfrom .FlowProjectionLayer import FlowProjectionLayer #, FlowFillholeLayer\n\n# nn.Module wrapper around the FlowProjectionLayer autograd Function.\n# requires_grad is forwarded to FlowProjectionLayer.forward, which derives the\n# fillhole flag from it (fillhole = 1 when requires_grad is False, else 0).\nclass FlowProjectionModule(Module):\n    def __init__(self, requires_grad = True):\n        super(FlowProjectionModule, self).__init__()\n        # FlowProjectionLayer declares static forward/backward, so it must be\n        # invoked through .apply(); only the flag is stored here.\n        self.requires_grad = requires_grad\n\n    def forward(self, input1):\n        # forward(ctx, input1, requires_grad) needs both arguments\n        return FlowProjectionLayer.apply(input1, self.requires_grad)\n\n# class FlowFillholeModule(Module):\n#     def __init__(self,hole_value = -10000.0):\n#         super(FlowFillholeModule, self).__init__()\n#         self.f = FlowFillholeLayer()\n#\n#     def forward(self, input1):\n#         return self.f(input1)\n\n    #we actually dont need to write the backward code for a module, since we have\n\n"
  },
  {
    "path": "my_package/FlowProjection/__init__.py",
    "content": "from  .FlowProjectionModule import *"
  },
  {
    "path": "my_package/FlowProjection/flowprojection_cuda.cc",
    "content": "#include <torch/torch.h>\r\n#include <ATen/ATen.h>\r\n#include <stdio.h>\r\n#include <iostream>\r\n#include <ATen/cuda/CUDAContext.h> //works for 1.0.0\r\n\r\n#include \"flowprojection_cuda_kernel.cuh\"\r\n\r\nint FlowProjectionLayer_gpu_forward(\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  count,\r\n\t\tat::Tensor&  output,\r\n\t\tint fillhole\r\n\t\t)\r\n{\r\n\r\n\tint error = 1 ;\r\n\r\n\tint channel = input1.size( 1);\r\n\tif(channel!= 2) return error;\r\n\tint batch = input1.size(0);\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint count_b_stride = count.stride(0);\r\n\tint count_c_stride = count.stride(1);\r\n\tint count_h_stride = count.stride(2);\r\n\tint count_w_stride = count.stride(3);\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n\t//if(w_stride !=1) return error;\r\n\tif(input1_b_stride != output.stride(0)) return error;\r\n\tif(input1_c_stride != output.stride(1)) return error;\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, output);\r\n//    printf(\"In gpu forward\\n\");\r\n\terror = FlowProjection_gpu_forward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement,w,h,channel,batch,fillhole,\r\n\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tcount_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1,\r\n\t\t\tcount,\r\n\t\t\toutput);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\r\n\treturn error;\r\n\r\n}\r\n\r\nint FlowProjectionLayer_gpu_backward(\r\n\t\tat::Tensor&  input1,\r\n        at::Tensor&  count,\r\n\t\tat::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1\r\n\t\t)\r\n{\r\n\tint error = 1 ;\r\n\tint channel 
= input1.size( 1);\r\n\tif(channel!=2) return error;\r\n\tint batch = input1.size(0);\r\n\tif(count.size(0) != batch) return error;\r\n\tif(count.size(1) != 1) return error;\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\tif(count.size(2) != h) return error;// to add some checkpoint\r\n\tif(count.size(3) != w) return error;\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint count_b_stride = count.stride(0);\r\n\tint count_c_stride = count.stride(1);\r\n\tint count_h_stride = count.stride(2);\r\n\tint count_w_stride = count.stride(3);\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n\t//if(w_stride !=1) return error;\r\n\tif(input1_b_stride != gradinput1.stride(0)) return error;\r\n\tif(input1_c_stride != gradinput1.stride(1)) return error;\r\n\r\n//    printf(\"GPU backward: %d,%d,%d,%d\\n\", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride);\r\n//    printf(\"GPU backward: %d,%d,%d,%d\\n\", count_b_stride,count_c_stride,count_h_stride,count_w_stride);\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, gradoutput);\r\n\r\n\terror  = FlowProjection_gpu_backward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,batch,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tcount_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1,\r\n\t\t\tcount,\r\n\t\t\tgradoutput,\r\n\t\t\tgradinput1\r\n\t\t\t);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\r\n\treturn error;\r\n\r\n}\r\n\r\n\r\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\r\n  m.def(\"FlowProjectionLayer_gpu_forward\", &FlowProjectionLayer_gpu_forward, \"FlowProjection forward (CUDA)\");\r\n  
m.def(\"FlowProjectionLayer_gpu_backward\", &FlowProjectionLayer_gpu_backward, \"FlowProjection backward (CUDA)\");\r\n}\r\n"
  },
  {
    "path": "my_package/FlowProjection/flowprojection_cuda_kernel.cu",
    "content": "#include <stdio.h>\r\n\r\n#include \"flowprojection_cuda_kernel.cuh\"\r\n\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/NativeFunctions.h>\r\n#include <ATen/Dispatch.h>\r\n#include <ATen/cuda/CUDAApplyUtils.cuh>\r\n\r\n\r\n#define min(a,b) ((a<b)?(a):(b))\r\n#define max(a,b) ((a>b)?(a):(b))\r\n\r\n#define DEBUG (0)\r\n#ifndef BLOCKDIMX\r\n#define BLOCKDIMX (32)\r\n#endif\r\n#ifndef BLOCKDIMY\r\n#define BLOCKDIMY (16)\r\n#endif\r\nusing at::Half;\r\n\r\n\r\n\r\n\r\n//forward path of our layer\r\ntemplate <typename scalar_t>\r\n__global__ void FlowProjection_gpu_forward_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__    input1,\r\n\t\tscalar_t*  count,\r\n\t\tscalar_t*  output\r\n\t\t)\r\n{\r\n\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\t//only use one dimensioon of the grid and block\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n//\tconst float fillvalue =0.0f;\r\n\r\n\tif( withinXbounds && withinYbounds) {\r\n        float fx = input1[ off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ];\r\n        float fy = input1[ off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ];\r\n\r\n        float x2 = (float) (w_i) + fx;\r\n       
 float y2 = (float) (h_i) + fy;\r\n        if(x2>=0.0f && y2 >= 0.0f &&x2 <= (float) ( w-1) && y2 <= (float) (h -1 ) ){\r\n            int ix2_L = (int) (x2);\r\n            int iy2_T = (int) (y2);\r\n            int ix2_R = min(ix2_L + 1, w - 1);\r\n            int iy2_B = min(iy2_T + 1, h - 1);\r\n\r\n            atomicAdd(&output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] ,-fx);\r\n            atomicAdd(&output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ],-fx);\r\n            atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] ,-fx);\r\n            atomicAdd(&output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ],-fx);\r\n\r\n            atomicAdd(&output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] , -fy);\r\n            atomicAdd(&output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R]  , -fy);\r\n            atomicAdd(&output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L]  , -fy);\r\n            atomicAdd(&output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]  , -fy);\r\n\r\n            atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L], 1);\r\n            atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] , 1);\r\n            atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] , 1);\r\n            atomicAdd(& count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] , 1);\r\n        }\r\n\t}\r\n\treturn ;\r\n\r\n}\r\ntemplate <typename scalar_t>\r\n__global__ void FlowProjectionAveraging_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tconst 
scalar_t* __restrict__      input1,\r\n\t\tscalar_t* count,\r\n\t\tscalar_t* output\r\n\t\t)\r\n{\r\n\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\t//only use one dimensioon of the grid and block\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n//\tconst float fillvalue =0.0f;\r\n\r\n\tif( withinXbounds && withinYbounds) {\r\n\t    float temp =count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ;\r\n        if(temp > 0.0f){\r\n            output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] /= temp;\r\n            output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] /= temp;\r\n        }\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\ntemplate <typename scalar_t>\r\n__global__ void FlowFillhole_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__ input1,\r\n\t\tscalar_t*\tcount,\r\n\t\tscalar_t*\toutput\r\n\t\t)\r\n{\r\n\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\t//only use one dimensioon of the grid and 
block\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n//\tconst float fillvalue =0.0f;\r\n\r\n\tif( withinXbounds && withinYbounds) {\r\n\t    float temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ;\r\n        if(temp <= 0.0f){\r\n            //search along the four directions,0/90/180/270, until finding at least one\r\n            int left_offset = w_i;            float left_temp = 0.0f;\r\n            while(left_temp == 0.0f && left_offset - 1 >= 0){\r\n                left_offset = left_offset - 1;\r\n                left_temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + left_offset] ;\r\n            }\r\n\r\n            int right_offset = w_i ;            float right_temp = 0.0f;\r\n            while(right_temp ==0.0f && right_offset + 1 <= w - 1 ){\r\n                right_offset  = right_offset + 1 ;\r\n                right_temp =  count[batch_i * count_b_stride + 0 + h_i * count_h_stride + right_offset] ;\r\n            }\r\n\r\n            int up_offset = h_i ;            float up_temp = 0.0f;\r\n            while(up_temp == 0.0f && up_offset - 1 >=0){\r\n                up_offset = up_offset - 1;\r\n                up_temp =  count[batch_i * count_b_stride + 0 + up_offset * count_h_stride + w_i ] ;\r\n            }\r\n\r\n            int down_offset = h_i;            float down_temp = 0.0f;\r\n            while(down_temp == 0.0f && down_offset + 1 <= h - 1 ){\r\n                down_offset = down_offset + 1;\r\n                down_temp =  count[batch_i * count_b_stride + 0 + down_offset * count_h_stride + w_i] ;\r\n            }\r\n\r\n            if(left_temp + right_temp + up_temp + down_temp <=0.0f){\r\n                //printf(\"Can't 
fill hole, find no neighbor vectors availabel\\n\");\r\n                return;\r\n            }\r\n\r\n            left_temp = (left_temp > 0.0f)?1:0;\r\n            right_temp = (right_temp > 0.0f)?1:0;\r\n            up_temp = (up_temp > 0.0f)?1:0;\r\n            down_temp = (down_temp > 0.0f)?1:0;\r\n\r\n            output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] = (\r\n                left_temp *  output[off + 0 * input1_c_stride + h_i * input1_h_stride + left_offset] +\r\n                right_temp *  output[off + 0 * input1_c_stride + h_i * input1_h_stride + right_offset]+\r\n                up_temp *  output[off + 0 * input1_c_stride + up_offset * input1_h_stride + w_i] +\r\n                down_temp *  output[off + 0 * input1_c_stride + down_offset * input1_h_stride + w_i]\r\n            )/(\r\n                left_temp + right_temp + up_temp + down_temp\r\n            ) ;\r\n\r\n\r\n            output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] =(\r\n                left_temp *  output[off + 1 * input1_c_stride + h_i * input1_h_stride + left_offset] +\r\n                right_temp *  output[off + 1 * input1_c_stride + h_i * input1_h_stride + right_offset]+\r\n                up_temp *  output[off + 1 * input1_c_stride + up_offset * input1_h_stride + w_i] +\r\n                down_temp *  output[off + 1 * input1_c_stride + down_offset * input1_h_stride + w_i]\r\n            )/(\r\n                left_temp + right_temp + up_temp + down_temp\r\n            ) ;\r\n        }\r\n\t}\r\n\treturn ;\r\n\r\n}\r\ntemplate <typename scalar_t>\r\n__global__ void FlowProjection_gpu_backward_kernelfunc(\r\n\t\tconst int nElement,  \tconst int w, \tconst int h, const int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tconst scalar_t* 
__restrict__        input1,\r\n\t\tconst scalar_t* __restrict__       count,\r\n\t\tconst scalar_t* __restrict__       gradoutput,\r\n\t\tscalar_t*   gradinput1\r\n\t\t)\r\n{\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off  = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n\r\n\tif(withinXbounds && withinYbounds){\r\n        float fx = input1[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i] ;\r\n        float fy = input1[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i] ;\r\n\r\n        float x2 = (float) ( w_i ) + fx;\r\n        float y2 = (float) ( h_i ) + fy;\r\n        if( x2 >=0.0f && y2 >= 0.0f && x2 <= (float) (w -1) && y2 <= (float) (h-1)){\r\n            int ix2_L = (int)(x2);\r\n            int iy2_T = (int)(y2);\r\n            int ix2_R  = min(ix2_L + 1, w-1);\r\n            int iy2_B  = min(iy2_T + 1, h-1);\r\n\r\n            int iu_offset = off + 0 * input1_c_stride + h_i * input1_h_stride + w_i;\r\n            gradinput1[iu_offset] += -  gradoutput[off +  0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L]/\r\n                                        count[batch_i * count_b_stride + 0+ iy2_T * count_h_stride + ix2_L]  ;\r\n            gradinput1[iu_offset] += -    gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ]/\r\n                                         count[batch_i * count_b_stride +0 + iy2_T * count_h_stride  + ix2_R]          ;\r\n            gradinput1[iu_offset ] += -  gradoutput[off + 0 * input1_c_stride + iy2_B * 
input1_h_stride + ix2_L]/\r\n                                         count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L]  ;\r\n            gradinput1[iu_offset ]  += -  gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]/\r\n                                         count[batch_i * count_b_stride + 0+ iy2_B * count_h_stride + ix2_R]   ;\r\n\r\n            int iv_offset = off + 1 * input1_c_stride + h_i * input1_h_stride + w_i;\r\n            gradinput1[iv_offset] += -  gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L]/\r\n                                         count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L]  ;\r\n            gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R]/\r\n                                         count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R]  ;\r\n            gradinput1[iv_offset] += -  gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L]/\r\n                                    count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L]     ;\r\n            gradinput1[iv_offset] += -  gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]/\r\n                                    count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R]   ;\r\n        }\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\n\r\nint FlowProjection_gpu_forward_kernel(\r\n\t\tcudaStream_t stream, \t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, \t\tconst int batch, const int fillhole,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  count,\r\n\t\tat::Tensor&  output\r\n\t\t)\r\n{\r\n    int error = 1 
;\r\n\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\r\n\t//\t\tblockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n//    printf(\"I am here\\n\");\r\n\t//extract the data of CudaTensor and use kernel to calculate.\r\n\r\n\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"FlowProjection_gpu_forward_kernelfunc\", ([&] {\r\n\tFlowProjection_gpu_forward_kernelfunc<<<grid,block,0, stream >>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tcount_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()\r\n\t\t\t);\r\n\t\t\t\t\t\t\t\t}));\r\n\r\n    cudaError_t err = cudaGetLastError();\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateOutput: %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n//    printf(\"I am there\\n\");\r\n\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"FlowProjectionAveraging_kernelfunc\", ([&] {\r\n\r\n    FlowProjectionAveraging_kernelfunc<<<grid,block,0,stream>>>(\r\n    \t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tcount_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()\r\n    );\t\t\t\t\r\n\t}));\r\n\r\n//    printf(\"I am kao\\n\");\r\n\r\n\t//\t\t\tTHCudaCheck(cudaGetLastError());\r\n    err = cudaGetLastError();\r\n\r\n\tif (err != 
cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateOutput: %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n//    printf(\"I am dd\\n\");\r\n\r\n    if(fillhole){\r\n\r\n//        printf(\"use flow fill hole\\n\");\r\n    \tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"FlowFillhole_kernelfunc\", ([&] {\r\n    FlowFillhole_kernelfunc<<<grid,block,0,stream>>>(\r\n    \t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tcount_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()\r\n        );\r\n\t\t\t\t\t}));\r\n\r\n    err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateOutput: %s\\n\", cudaGetErrorString(err));\r\n\t\treturn error;\r\n\t}\r\n\r\n    }\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n}\r\n\r\n\r\nint FlowProjection_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  count,\r\n\t\tat::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1\r\n\t\t)\r\n{\r\n\r\n\tint error = 1 ;\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\t//blockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX 
revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n    \tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"FlowProjection_gpu_backward_kernelfunc\", ([&] {\r\n\tFlowProjection_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tcount_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),\r\n\t\t\tcount.data<scalar_t>(),\r\n\t\t\tgradoutput.data<scalar_t>(),\r\n\t\t\tgradinput1.data<scalar_t>()\r\n\t\t\t);\r\n\t\t}));\r\n\r\n//    printf(\"gpu I am there\\n\");\r\n\r\n\tcudaError_t err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpu error in BilinearSampler.updateGradInput %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n//    printf(\"gpu I am here\\n\");\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n\r\n}\r\n"
  },
  {
    "path": "my_package/FlowProjection/flowprojection_cuda_kernel.cuh",
    "content": "#pragma once\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/Context.h>\r\n#include <cuda_runtime.h>\r\n\r\nint FlowProjection_gpu_forward_kernel(\r\n\t\tcudaStream_t stream, \t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, \t\tconst int batch, const int fillhole,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tat::Tensor& input1,\r\n\t\tat::Tensor& count,\r\n\t\tat::Tensor& output\r\n\r\n\t\t);\r\n\r\nint FlowProjection_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tat::Tensor& input1,\r\n\t\tat::Tensor& count,\r\n\t\tat::Tensor& gradoutput,\r\n\t\tat::Tensor& gradinput1\r\n\t\t);\r\n\r\n\r\n"
  },
  {
    "path": "my_package/FlowProjection/setup.py",
    "content": "#!/usr/bin/env python3\nimport os\nimport torch\n\nfrom setuptools import setup, find_packages\nfrom torch.utils.cpp_extension import BuildExtension, CUDAExtension\n\nfrom compiler_args import nvcc_args, cxx_args\n\nsetup(\n    name='flowprojection_cuda',\n    ext_modules=[\n        CUDAExtension('flowprojection_cuda', [\n            'flowprojection_cuda.cc',\n            'flowprojection_cuda_kernel.cu'\n        ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args})\n    ],\n    cmdclass={\n        'build_ext': BuildExtension\n    })\n"
  },
  {
    "path": "my_package/Interpolation/InterpolationLayer.py",
    "content": "# this is for wrapping the customized layer\r\nimport torch\r\nfrom torch.autograd import Function\r\nimport interpolation_cuda as my_lib\r\n\r\n#Please check how the STN FUNCTION is written :\r\n#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py\r\n#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py\r\n\r\nclass InterpolationLayer(Function):\r\n    def __init__(self):\r\n        super(InterpolationLayer,self).__init__()\r\n\r\n    @staticmethod\r\n    def forward(ctx, input1,input2):\r\n\r\n        assert(input1.is_contiguous())\r\n        assert(input2.is_contiguous())\r\n        # self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it\r\n        # self.input2 = input2.contiguous() # TODO: Note that this is simply a shallow copy?\r\n        # if input1.is_cuda:\r\n        #     self.device = torch.cuda.current_device()\r\n        # else:\r\n        #     self.device = -1\r\n\r\n        # output =  torch.zeros(input1.size())\r\n\r\n\r\n        if input1.is_cuda :\r\n            # output = output.cuda()\r\n            output = torch.cuda.FloatTensor().resize_(input1.size()).zero_()\r\n            my_lib.InterpolationLayer_gpu_forward(input1, input2, output)\r\n        else:\r\n            output = torch.cuda.FloatTensor(input1.data.size())\r\n            my_lib.InterpolationLayer_cpu_forward(input1, input2, output)\r\n        ctx.save_for_backward(input1, input2)\r\n\r\n        # the function returns the output to its caller\r\n        return output\r\n\r\n    @staticmethod\r\n    def backward(ctx, gradoutput):\r\n        # print(\"Backward of Interpolation Layer\")\r\n        # gradinput1 = input1.new().zero_()\r\n        # gradinput2 = input2.new().zero_()\r\n        # gradinput1 = torch.zeros(self.input1.size())\r\n        # gradinput2 = torch.zeros(self.input2.size())\r\n        input1, input2 = ctx.saved_tensors\r\n\r\n        if input1.is_cuda:\r\n      
      # print(\"CUDA backward\")\r\n            # gradinput1 = gradinput1.cuda(self.device)\r\n            # gradinput2 = gradinput2.cuda(self.device)\r\n            gradinput1 = torch.cuda.FloatTensor().resize_(input1.size()).zero_()\r\n            gradinput2 = torch.cuda.FloatTensor().resize_(input2.size()).zero_()\r\n\r\n            # the input1 image should not require any gradients\r\n            # print(\"Does input1 requires gradients? \" + str(self.input1.requires_grad))\r\n\r\n            err = my_lib.InterpolationLayer_gpu_backward(input1,input2,gradoutput,gradinput1,gradinput2)\r\n            if err != 0 :\r\n                print(err)\r\n        else:\r\n            # print(\"CPU backward\")\r\n            # print(gradoutput)\r\n            gradinput1 = torch.FloatTensor().resize_(input1.size()).zero_()\r\n            gradinput2 = torch.FloatTensor().resize_(input2.size()).zero_()\r\n            err = my_lib.InterpolationLayer_cpu_backward(input1, input2, gradoutput, gradinput1, gradinput2)\r\n            # print(err)\r\n        if err != 0 :\r\n            print(err)\r\n            # print(gradinput1)\r\n            # print(gradinput2)\r\n\r\n        # print(gradinput1)\r\n\r\n        return gradinput1, gradinput2"
  },
  {
    "path": "my_package/Interpolation/InterpolationModule.py",
    "content": "# InterpolationModule.py: nn.Module wrapper around the InterpolationLayer autograd Function\r\nfrom torch.nn import Module\r\nfrom .InterpolationLayer import InterpolationLayer\r\n\r\nclass InterpolationModule(Module):\r\n    def __init__(self):\r\n        super(InterpolationModule, self).__init__()\r\n        # self.f = InterpolationLayer()\r\n\r\n    def forward(self, input1, input2):\r\n        return InterpolationLayer.apply(input1, input2)\r\n\r\n    # We don't need to write backward code for this Module: autograd uses InterpolationLayer.backward.\r\n\r\n"
  },
  {
    "path": "my_package/Interpolation/__init__.py",
    "content": "from  .InterpolationModule import *"
  },
  {
    "path": "my_package/Interpolation/interpolation_cuda.cc",
    "content": "#include <torch/torch.h>\r\n#include <ATen/ATen.h>\r\n#include <stdio.h>\r\n#include <iostream>\r\n#include <ATen/cuda/CUDAContext.h> //works for 1.0.0\r\n\r\n#include \"interpolation_cuda_kernel.cuh\"\r\n\r\n\r\nint InterpolationLayer_gpu_forward(\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  output\r\n\t\t)\r\n\t\t{\r\n\tint error = 1 ;\r\n\r\n\tint channel = input1.size( 1);\r\n\tif(channel!=3) return error;\r\n\tint batch = input1.size(0);\r\n\tif(input2.size( 0) != batch) return error;\r\n\tif(input2.size(1) != 2) return error;\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\tif(input2.size(2) != h) return error;// to add some checkpoint\r\n\tif(input2.size(3) != w) return error;\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint input2_w_stride = input2.stride(3);\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n\t//if(w_stride !=1) return error;\r\n\tif(input1_b_stride != output.stride(0)) return error;\r\n\tif(input1_c_stride != output.stride(1)) return error;\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, output);\r\n\r\n\terror =InterpolationLayer_gpu_forward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(),\r\n\t\t\tnElement,w,h,channel,batch,\r\n\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\r\n\t\t\tinput1,\r\n\t\t\tinput2,\r\n\t\t\toutput);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\r\n\treturn error;\r\n\r\n}\r\n\r\n\r\nint InterpolationLayer_gpu_backward(\r\n\t\tat::Tensor&  
input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1,\r\n\t\tat::Tensor&  gradinput2\r\n\t\t)\r\n    {\r\n\tint error = 1 ;\r\n\tint channel = input1.size( 1);\r\n\tif(channel!=3) return error;\r\n\tint batch = input1.size(0);\r\n\tif(input2.size( 0) != batch) return error;\r\n\tif(input2.size(1) != 2) return error;\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\tif(input2.size(2) != h) return error;// to add some checkpoint\r\n\tif(input2.size(3) != w) return error;\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint input2_w_stride = input2.stride(3);\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n\t//if(w_stride !=1) return error;\r\n\tif(input1_b_stride != gradinput1.stride(0)) return error;\r\n\tif(input2_b_stride != gradinput2.stride(0)) return error;\r\n\tif(input1_c_stride != gradinput1.stride(1)) return error;\r\n\tif(input2_c_stride != gradinput2.stride(1)) return error;\r\n\r\n//    printf(\"GPU backward: %d,%d,%d,%d\\n\", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride);\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, gradoutput);\r\n\r\n\terror  = InterpolationLayer_gpu_backward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,batch,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\r\n\t\t\tinput1,\r\n\t\t\tinput2,\r\n\t\t\tgradoutput,\r\n\t\t\tgradinput1,\r\n\t\t\tgradinput2\r\n\t\t\t);\r\n\t  if (error) 
{AT_ERROR(\"CUDA call failed\");}\r\n\r\n\treturn error;\r\n\r\n}\r\n\r\n\r\n\r\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\r\n  m.def(\"InterpolationLayer_gpu_forward\", &InterpolationLayer_gpu_forward, \"Interpolation forward (CUDA)\");\r\n  m.def(\"InterpolationLayer_gpu_backward\", &InterpolationLayer_gpu_backward, \"Interpolation backward (CUDA)\");\r\n}\r\n"
  },
  {
    "path": "my_package/Interpolation/interpolation_cuda_kernel.cu",
    "content": "#include <stdio.h>\r\n\r\n#include \"interpolation_cuda_kernel.cuh\"\r\n\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/NativeFunctions.h>\r\n#include <ATen/Dispatch.h>\r\n#include <ATen/cuda/CUDAApplyUtils.cuh>\r\n\r\n\r\n#define min(a,b) ((a<b)?(a):(b))\r\n#define max(a,b) ((a>b)?(a):(b))\r\n\r\n#define DEBUG (0)\r\n#ifndef BLOCKDIMX\r\n#define BLOCKDIMX (32)\r\n#endif\r\n#ifndef BLOCKDIMY\r\n#define BLOCKDIMY (16)\r\n#endif\r\nusing at::Half;\r\n\r\n\r\n\r\n\r\n//forward path of our layer\r\ntemplate <typename scalar_t>\r\n__global__ void InterpolationLayer_gpu_forward_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__    input1,\r\n\t\tconst scalar_t* __restrict__    input2,\r\n\t\tscalar_t*   output\r\n\t\t)\r\n{\r\n\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\t//only use one dimensioon of the grid and block\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n\tconst float fillvalue =0.0f;\r\n\r\n\tif( withinXbounds && withinYbounds) {\r\n\r\n\t\tfloat fx = input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i  ];\r\n\t\tfloat fy = input2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * 
input2_h_stride + w_i  ];\r\n\r\n\t\tfloat x2 = (float)(w_i) + fx;\r\n\t\tfloat y2 = (float)(h_i) + fy;\r\n\r\n\t\tif(x2 >= 0.0f && y2 >=0.0f && x2 < (float)w && y2 < (float)h){\r\n\t\t\tint ix2_L = int(x2);\r\n\t\t\tint iy2_T = int(y2);\r\n\t\t\tint ix2_R = min(ix2_L + 1, w - 1);\r\n\t\t\tint iy2_B = min(iy2_T + 1, h - 1);\r\n\r\n\t\t\tfloat alpha = x2 - ix2_L;\r\n\t\t\tfloat beta = y2 - iy2_T;\r\n\r\n\t\t\tfor(int c_i = 0 ; c_i < channel ; c_i ++){\r\n\t\t\t\tfloat TL = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L];\r\n\t\t\t\tfloat TR = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R];\r\n\t\t\t\tfloat BL = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L];\r\n\t\t\t\tfloat BR = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R];\r\n\t\t\t\toutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] =\r\n\t\t\t\t\t(1- alpha ) *(1-beta) *TL + alpha *(1- beta) * TR + (1-alpha) *beta *BL + alpha *beta * BR;\r\n\t\t\t}\r\n\t\t} else{\r\n\t\t\t//the warping data is out of range, we fill it with zeros\r\n\t\t\tfor(int c_i = 0 ;  c_i < channel; c_i ++){\r\n\t\t\t\toutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] = fillvalue;\r\n\t\t\t}\r\n\t\t}\r\n\t}\r\n\r\n\treturn ;\r\n\r\n}\r\n \r\n\r\ntemplate <typename scalar_t>\r\n__global__ void InterpolationLayer_gpu_backward_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__    input1,\r\n\t\tconst scalar_t* __restrict__    input2,\r\n\t\tconst scalar_t* __restrict__    gradoutput,\r\n\t\tscalar_t*  gradinput1,\r\n\t\tscalar_t*  gradinput2\r\n\t\t)\r\n{\r\n\t//blockIdx.z : batch index from 
0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off  = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n\r\n\tif(withinXbounds && withinYbounds){\r\n\r\n\t\tfloat fx= input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ];\r\n\t\tfloat fy = input2[batch_i * input2_b_stride + 1* input2_c_stride + h_i * input2_h_stride + w_i];\r\n\r\n\t\tfloat x2 = float(w_i) + fx;\r\n\t\tfloat y2 = float(h_i) + fy;\r\n\r\n\t\tif(x2 >= 0.0f  && y2 >= 0.0f && x2 < (float)w && y2 < (float)h){\r\n\t\t\tint ix2_L = int(x2);\r\n\t\t\tint iy2_T = int(y2);\r\n\r\n\t\t\tint ix2_R  = min(ix2_L+ 1, w - 1);\r\n\t\t\tint iy2_B = min(iy2_T + 1, h - 1);\r\n\r\n\t\t\tfloat alpha = x2 - ix2_L;\r\n\t\t\tfloat beta = y2 - iy2_T;\r\n\r\n\t\t\tfor (int c_i = 0 ; c_i < channel; c_i++){\r\n\t\t\t\tfloat gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];\r\n\r\n\t\t\t\tatomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L], gradoutput_value * ( 1- alpha) * (1- beta));\r\n\t\t\t\tatomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R], gradoutput_value * alpha * (1-beta));\r\n\t\t\t\tatomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L], gradoutput_value * (1-alpha ) * beta);\r\n\t\t\t\tatomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R], gradoutput_value * alpha * beta);\r\n\r\n\t\t\t}\r\n\r\n\t\t\tfloat gamma  = iy2_B - y2;\r\n\r\n\t\t\tfloat bot_diff = 
0.0f;\r\n\t\t\tfor(int c_i =0 ; c_i< channel; c_i ++ ){\r\n\t\t\t\tfloat temp = 0;\r\n\t\t\t\ttemp += gamma * (input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride +ix2_R] -\r\n\t\t\t\t\t\tinput1[off + c_i* input1_c_stride+ iy2_T * input1_h_stride + ix2_L]);\r\n\t\t\t\ttemp += (1 - gamma) *( input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R] -\r\n\t\t\t\t\t\tinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L]);\r\n\r\n\t\t\t\tfloat warped_diff_value = gradoutput[off+ c_i * input1_c_stride+ h_i* input1_h_stride + w_i];\r\n\t\t\t\tbot_diff += warped_diff_value * temp  ;\r\n\r\n\r\n\t\t\t}\r\n\t\t\t//the gradients of the x direction/ horizontal direction\r\n\t\t\tgradinput2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i] = bot_diff;\r\n\r\n\t\t\tgamma = ix2_R- x2;\r\n\t\t\tbot_diff = 0.0f;\r\n\t\t\tfor(int c_i = 0 ; c_i < channel;c_i ++ ){\r\n\t\t\t\tfloat temp = 0.0f;\r\n\t\t\t\ttemp += gamma    * (input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L] -\r\n\t\t\t\t\t\tinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L]);\r\n\r\n\t\t\t\ttemp += (1-gamma) *( input1[off + c_i * input1_c_stride+ iy2_B* input1_h_stride+ix2_R] -\r\n\t\t\t\t\t\tinput1[off+ c_i* input1_c_stride+ iy2_T * input1_h_stride +ix2_R]);\r\n\r\n\t\t\t\tfloat warped_diff_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];\r\n\t\t\t\tbot_diff += warped_diff_value * temp;\r\n\r\n\r\n\t\t\t}\r\n\t\t\tgradinput2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i]= bot_diff;\r\n\r\n\t\t}\r\n\r\n\r\n\t}\r\n\treturn ;\r\n\r\n}\r\nint InterpolationLayer_gpu_forward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int 
input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  output\r\n\t\t)\r\n{\r\n\tint error = -1;\r\n\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\r\n\t//\t\tblockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n\t//extract the data of CudaTensor and use kernel to calculate.\r\n\t\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"DepthFlowProjection_gpu_forward\", ([&] {\r\n\r\n\tInterpolationLayer_gpu_forward_kernelfunc<<<grid,block,0, stream >>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),input2.data<scalar_t>(),output.data<scalar_t>()\r\n\t\t\t);\r\n\t}));\r\n\r\n\t//\t\t\tTHCudaCheck(cudaGetLastError());\r\n\tcudaError_t err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateOutput: %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n}\r\n\r\nint InterpolationLayer_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, 
const int input2_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1,\r\n\t\tat::Tensor&  gradinput2\r\n\t\t)\r\n{\r\n\tint error = -1;\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\t//blockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"DepthFlowProjection_gpu_forward\", ([&] {\r\n\tInterpolationLayer_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),\r\n\t\t\tinput2.data<scalar_t>(),\r\n\t\t\tgradoutput.data<scalar_t>(),\r\n\t\t\tgradinput1.data<scalar_t>(),\r\n\t\t\tgradinput2.data<scalar_t>()\r\n\t\t\t);\r\n\t}));\r\n\r\n\tcudaError_t err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpu error in BilinearSampler.updateGradInput %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n}\r\n"
  },
  {
    "path": "my_package/Interpolation/interpolation_cuda_kernel.cuh",
    "content": "#pragma once\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/Context.h>\r\n#include <cuda_runtime.h>\r\n\r\nint InterpolationLayer_gpu_forward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\r\n\t\tat::Tensor& input1,\r\n\t\tat::Tensor& input2,\r\n\t\tat::Tensor& output\r\n\r\n\t\t);\r\n\r\nint InterpolationLayer_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\r\n\t\tat::Tensor& input1,\r\n\t\tat::Tensor& input2,\r\n\t\tat::Tensor& gradoutput,\r\n\t\tat::Tensor& gradinput1,\r\n\t\tat::Tensor& gradinput2\r\n\t\t);\r\n"
  },
  {
    "path": "my_package/Interpolation/setup.py",
    "content": "#!/usr/bin/env python3\nimport os\nimport torch\n\nfrom setuptools import setup, find_packages\nfrom torch.utils.cpp_extension import BuildExtension, CUDAExtension\n\nfrom compiler_args import nvcc_args, cxx_args\n\nsetup(\n    name='interpolation_cuda',\n    ext_modules=[\n        CUDAExtension('interpolation_cuda', [\n            'interpolation_cuda.cc',\n            'interpolation_cuda_kernel.cu'\n        ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args})\n    ],\n    cmdclass={\n        'build_ext': BuildExtension\n    })\n"
  },
  {
    "path": "my_package/InterpolationCh/InterpolationChLayer.py",
    "content": "# this is for wrapping the customized layer\r\nimport torch\r\nfrom torch.autograd import Function\r\nimport interpolationch_cuda as my_lib\r\n\r\n#Please check how the STN FUNCTION is written :\r\n#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py\r\n#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py\r\n\r\nclass InterpolationChLayer(Function):\r\n    def __init__(self,ch):\r\n        super(InterpolationChLayer,self).__init__()\r\n        self.ch = ch\r\n\r\n    @staticmethod\r\n    def forward(ctx, input1,input2):\r\n\r\n        assert(input1.is_contiguous())\r\n        assert(input2.is_contiguous())\r\n        # self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it\r\n        # self.input2 = input2.contiguous() # TODO: Note that this is simply a shallow copy?\r\n\r\n        # if input1.is_cuda:\r\n        #     self.device = torch.cuda.current_device()\r\n        # else:\r\n        #     self.device = -1\r\n\r\n        # output =  torch.zeros(input1.size())\r\n\r\n        if input1.is_cuda :\r\n            # output = output.cuda()\r\n            output = torch.cuda.FloatTensor().resize_(input1.size()).zero_()\r\n            my_lib.InterpolationChLayer_gpu_forward(input1, input2, output)\r\n        else:\r\n            # output = torch.cuda.FloatTensor(input1.data.size())\r\n            output = torch.FloatTensor().resize_(input1.size()).zero_()\r\n            my_lib.InterpolationChLayer_cpu_forward(input1, input2, output)\r\n        ctx.save_for_backward(input1, input2)\r\n        # the function returns the output to its caller\r\n        return output\r\n\r\n    @staticmethod\r\n    def backward(ctx, gradoutput):\r\n        # print(\"Backward of Interpolation Layer\")\r\n        # gradinput1 = input1.new().zero_()\r\n        # gradinput2 = input2.new().zero_()\r\n        # gradinput1 = torch.zeros(self.input1.size())\r\n        # gradinput2 = 
torch.zeros(self.input2.size())\r\n\r\n        input1, input2 = ctx.saved_tensors\r\n\r\n        if input1.is_cuda:\r\n            # print(\"CUDA backward\")\r\n            # gradinput1 = gradinput1.cuda(self.device)\r\n            # gradinput2 = gradinput2.cuda(self.device)\r\n            gradinput1 = torch.cuda.FloatTensor().resize_(input1.size()).zero_()\r\n            gradinput2 = torch.cuda.FloatTensor().resize_(input2.size()).zero_()\r\n            # the input1 image should not require any gradients\r\n            # print(\"Does input1 requires gradients? \" + str(self.input1.requires_grad))\r\n\r\n            err = my_lib.InterpolationChLayer_gpu_backward(input1,input2,gradoutput,gradinput1,gradinput2)\r\n            if err != 0 :\r\n                print(err)\r\n\r\n        else:\r\n            # print(\"CPU backward\")\r\n            # print(gradoutput)\r\n            gradinput1 = torch.FloatTensor().resize_(input1.size()).zero_()\r\n            gradinput2 = torch.FloatTensor().resize_(input2.size()).zero_()\r\n\r\n            err = my_lib.InterpolationChLayer_cpu_backward(input1, input2, gradoutput, gradinput1, gradinput2)\r\n            # print(err)\r\n            if err != 0 :\r\n                print(err)\r\n            # print(gradinput1)\r\n            # print(gradinput2)\r\n\r\n        # print(gradinput1)\r\n\r\n        return gradinput1, gradinput2"
  },
  {
    "path": "my_package/InterpolationCh/InterpolationChModule.py",
    "content": "# my_package/InterpolationCh/InterpolationChModule.py\r\nfrom torch.nn import Module\r\nfrom .InterpolationChLayer import InterpolationChLayer\r\n\r\nclass InterpolationChModule(Module):\r\n    def __init__(self,ch):\r\n        super(InterpolationChModule, self).__init__()\r\n        self.ch = ch\r\n        # self.f = InterpolationChLayer(ch)\r\n\r\n    def forward(self, input1, input2):\r\n        return InterpolationChLayer.apply(input1, input2)\r\n\r\n    # We don't need to write backward code for the module: autograd uses the backward() defined on InterpolationChLayer.\r\n\r\n"
  },
  {
    "path": "my_package/InterpolationCh/__init__.py",
    "content": "from  .InterpolationChModule import *\n"
  },
  {
    "path": "my_package/InterpolationCh/interpolationch_cuda.cc",
    "content": "#include <torch/torch.h>\r\n#include <ATen/ATen.h>\r\n#include <stdio.h>\r\n#include <iostream>\r\n#include <ATen/cuda/CUDAContext.h> //works for 1.0.0\r\n\r\n#include \"interpolationch_cuda_kernel.cuh\"\r\n\r\n\r\nint InterpolationChLayer_gpu_forward(\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  output\r\n\t\t)\r\n\t\t{\r\n\tint error = 1 ;\r\n\r\n\tint channel = input1.size( 1);\r\n//\tif(channel!=3) return error;\r\n\tint batch = input1.size(0);\r\n\tif(input2.size( 0) != batch) return error;\r\n\tif(input2.size(1) != 2) return error;\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\tif(input2.size(2) != h) return error;// to add some checkpoint\r\n\tif(input2.size(3) != w) return error;\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint input2_w_stride = input2.stride(3);\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n\t//if(w_stride !=1) return error;\r\n\tif(input1_b_stride != output.stride(0)) return error;\r\n\tif(input1_c_stride != output.stride(1)) return error;\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, output);\r\n\r\n\terror =InterpolationChLayer_gpu_forward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement,w,h,channel,batch,\r\n\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\r\n\t\t\tinput1,\r\n\t\t\tinput2,\r\n\t\t\toutput);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\r\n\treturn error;\r\n\r\n}\r\n\r\n\r\nint 
InterpolationChLayer_gpu_backward(\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1,\r\n\t\tat::Tensor&  gradinput2\r\n\t\t)\r\n    {\r\n\tint error = 1 ;\r\n\tint channel = input1.size( 1);\r\n//\tif(channel!=3) return error;\r\n\tint batch = input1.size(0);\r\n\tif(input2.size( 0) != batch) return error;\r\n\tif(input2.size(1) != 2) return error;\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\tif(input2.size(2) != h) return error;// to add some checkpoint\r\n\tif(input2.size(3) != w) return error;\r\n\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint input2_w_stride = input2.stride(3);\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n\t//if(w_stride !=1) return error;\r\n\tif(input1_b_stride != gradinput1.stride(0)) return error;\r\n\tif(input2_b_stride != gradinput2.stride(0)) return error;\r\n\tif(input1_c_stride != gradinput1.stride(1)) return error;\r\n\tif(input2_c_stride != gradinput2.stride(1)) return error;\r\n\r\n//    printf(\"GPU backward: %d,%d,%d,%d\\n\", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride);\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, gradoutput);\r\n\r\n\terror  = InterpolationChLayer_gpu_backward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement, //to let the 
nummous\r\n\t\t\tw,h,channel,batch,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\r\n\t\t\tinput1,\r\n\t\t\tinput2,\r\n\t\t\tgradoutput,\r\n\t\t\tgradinput1,\r\n\t\t\tgradinput2\r\n\t\t\t);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\r\n\treturn error;\r\n\r\n}\r\n\r\n\r\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\r\n  m.def(\"InterpolationChLayer_gpu_forward\", &InterpolationChLayer_gpu_forward, \"InterpolationCh forward (CUDA)\");\r\n  m.def(\"InterpolationChLayer_gpu_backward\", &InterpolationChLayer_gpu_backward, \"InterpolationCh backward (CUDA)\");\r\n}\r\n"
  },
  {
    "path": "my_package/InterpolationCh/interpolationch_cuda_kernel.cu",
    "content": "#include <stdio.h>\r\n\r\n#include \"interpolationch_cuda_kernel.cuh\"\r\n\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/NativeFunctions.h>\r\n#include <ATen/Dispatch.h>\r\n#include <ATen/cuda/CUDAApplyUtils.cuh>\r\n\r\n\r\n#define min(a,b) ((a<b)?(a):(b))\r\n#define max(a,b) ((a>b)?(a):(b))\r\n\r\n#define DEBUG (0)\r\n#ifndef BLOCKDIMX\r\n#define BLOCKDIMX (32)\r\n#endif\r\n#ifndef BLOCKDIMY\r\n#define BLOCKDIMY (16)\r\n#endif\r\nusing at::Half;\r\n\r\n\r\n\r\n\r\n//forward path of our layer\r\ntemplate <typename scalar_t>\r\n__global__ void InterpolationChLayer_gpu_forward_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__ input1,\r\n\t\tconst scalar_t* __restrict__ input2,\r\n\t\tscalar_t* output\r\n\t\t)\r\n{\r\n\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\t//only use one dimensioon of the grid and block\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n\tconst float fillvalue =0.0f;\r\n\r\n\tif( withinXbounds && withinYbounds) {\r\n\r\n\t\tfloat fx = input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i  ];\r\n\t\tfloat fy = input2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + 
w_i  ];\r\n\r\n\t\tfloat x2 = (float)(w_i) + fx;\r\n\t\tfloat y2 = (float)(h_i) + fy;\r\n\r\n\t\tif(x2 >= 0.0f && y2 >=0.0f && x2 < (float)w && y2 < (float)h){\r\n\t\t\tint ix2_L = int(x2);\r\n\t\t\tint iy2_T = int(y2);\r\n\t\t\tint ix2_R = min(ix2_L + 1, w - 1);\r\n\t\t\tint iy2_B = min(iy2_T + 1, h - 1);\r\n\r\n\t\t\tfloat alpha = x2 - ix2_L;\r\n\t\t\tfloat beta = y2 - iy2_T;\r\n\r\n\t\t\tfor(int c_i = 0 ; c_i < channel ; c_i ++){\r\n\t\t\t\tfloat TL = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L];\r\n\t\t\t\tfloat TR = input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R];\r\n\t\t\t\tfloat BL = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L];\r\n\t\t\t\tfloat BR = input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R];\r\n\t\t\t\toutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] =\r\n\t\t\t\t\t(1- alpha ) *(1-beta) *TL + alpha *(1- beta) * TR + (1-alpha) *beta *BL + alpha *beta * BR;\r\n\t\t\t}\r\n\t\t} else{\r\n\t\t\t//the warping data is out of range, we fill it with zeros\r\n\t\t\tfor(int c_i = 0 ;  c_i < channel; c_i ++){\r\n\t\t\t\toutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i] = fillvalue;\r\n\t\t\t}\r\n\t\t}\r\n\t}\r\n\r\n\treturn ;\r\n\r\n}\r\n\r\ntemplate <typename scalar_t>\r\n__global__ void InterpolationChLayer_gpu_backward_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__  input1,\r\n\t\tconst scalar_t* __restrict__  input2,\r\n\t\tconst scalar_t* __restrict__  gradoutput,\r\n\t\tscalar_t*  gradinput1,\r\n\t\tscalar_t*  gradinput2\r\n\t\t)\r\n{\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch 
index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off  = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n\r\n\tif(withinXbounds && withinYbounds){\r\n\r\n\t\tfloat fx= input2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i ];\r\n\t\tfloat fy = input2[batch_i * input2_b_stride + 1* input2_c_stride + h_i * input2_h_stride + w_i];\r\n\r\n\t\tfloat x2 = float(w_i) + fx;\r\n\t\tfloat y2 = float(h_i) + fy;\r\n\r\n\t\tif(x2 >= 0.0f  && y2 >= 0.0f && x2 < (float)w && y2 < (float)h){\r\n\t\t\tint ix2_L = int(x2);\r\n\t\t\tint iy2_T = int(y2);\r\n\r\n\t\t\tint ix2_R  = min(ix2_L+ 1, w - 1);\r\n\t\t\tint iy2_B = min(iy2_T + 1, h - 1);\r\n\r\n\t\t\tfloat alpha = x2 - ix2_L;\r\n\t\t\tfloat beta = y2 - iy2_T;\r\n\r\n\t\t\tfor (int c_i = 0 ; c_i < channel; c_i++){\r\n\t\t\t\tfloat gradoutput_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];\r\n\r\n\t\t\t\tatomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L], gradoutput_value * ( 1- alpha) * (1- beta));\r\n\t\t\t\tatomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_R], gradoutput_value * alpha * (1-beta));\r\n\t\t\t\tatomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L], gradoutput_value * (1-alpha ) * beta);\r\n\t\t\t\tatomicAdd( & gradinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R], gradoutput_value * alpha * beta);\r\n\r\n\t\t\t}\r\n\r\n\t\t\tfloat gamma  = iy2_B - y2;\r\n\r\n\t\t\tfloat bot_diff = 0.0f;\r\n\t\t\tfor(int c_i =0 ; c_i< channel; c_i 
++ ){\r\n\t\t\t\tfloat temp = 0;\r\n\t\t\t\ttemp += gamma * (input1[off + c_i * input1_c_stride + iy2_T * input1_h_stride +ix2_R] -\r\n\t\t\t\t\t\tinput1[off + c_i* input1_c_stride+ iy2_T * input1_h_stride + ix2_L]);\r\n\t\t\t\ttemp += (1 - gamma) *( input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_R] -\r\n\t\t\t\t\t\tinput1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L]);\r\n\r\n\t\t\t\tfloat warped_diff_value = gradoutput[off+ c_i * input1_c_stride+ h_i* input1_h_stride + w_i];\r\n\t\t\t\tbot_diff += warped_diff_value * temp  ;\r\n\r\n\r\n\t\t\t}\r\n\t\t\t//the gradients of the x direction/ horizontal direction\r\n\t\t\tgradinput2[batch_i * input2_b_stride + 0 * input2_c_stride + h_i * input2_h_stride + w_i] = bot_diff;\r\n\r\n\t\t\tgamma = ix2_R- x2;\r\n\t\t\tbot_diff = 0.0f;\r\n\t\t\tfor(int c_i = 0 ; c_i < channel;c_i ++ ){\r\n\t\t\t\tfloat temp = 0.0f;\r\n\t\t\t\ttemp += gamma    * (input1[off + c_i * input1_c_stride + iy2_B * input1_h_stride + ix2_L] -\r\n\t\t\t\t\t\tinput1[off + c_i * input1_c_stride + iy2_T * input1_h_stride + ix2_L]);\r\n\r\n\t\t\t\ttemp += (1-gamma) *( input1[off + c_i * input1_c_stride+ iy2_B* input1_h_stride+ix2_R] -\r\n\t\t\t\t\t\tinput1[off+ c_i* input1_c_stride+ iy2_T * input1_h_stride +ix2_R]);\r\n\r\n\t\t\t\tfloat warped_diff_value = gradoutput[off + c_i * input1_c_stride + h_i * input1_h_stride + w_i];\r\n\t\t\t\tbot_diff += warped_diff_value * temp;\r\n\r\n\r\n\t\t\t}\r\n\t\t\tgradinput2[batch_i * input2_b_stride + 1 * input2_c_stride + h_i * input2_h_stride + w_i]= bot_diff;\r\n\r\n\t\t}\r\n\r\n\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\n\r\n\r\nint InterpolationChLayer_gpu_forward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int 
input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  output\r\n\t\t)\r\n{\r\n\tint error = 1 ;\r\n\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\r\n\t//\t\tblockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n\t//extract the data of CudaTensor and use kernel to calculate.\r\n\t\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"InterpolationChLayer_gpu_forward_kernelfunc\", ([&] {\r\n\tInterpolationChLayer_gpu_forward_kernelfunc<<<grid,block,0, stream >>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),input2.data<scalar_t>(),output.data<scalar_t>()\r\n\t\t\t);\r\n \t\t\t\t\t}));\r\n\r\n\t//\t\t\tTHCudaCheck(cudaGetLastError());\r\n\tcudaError_t err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateOutput: %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n}\r\n\r\nint InterpolationChLayer_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int 
input2_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1,\r\n\t\tat::Tensor&  gradinput2\r\n\t\t)\r\n{\r\n\tint error = 1 ;\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\t//blockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n\t\t\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"InterpolationChLayer_gpu_backward_kernelfunc\", ([&] {\r\nInterpolationChLayer_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),\r\n\t\t\tinput2.data<scalar_t>(),\r\n\t\t\tgradoutput.data<scalar_t>(),\r\n\t\t\tgradinput1.data<scalar_t>(),\r\n\t\t\tgradinput2.data<scalar_t>()\r\n\t\t\t);\r\n \t\t\t\t\t}));\r\n\r\n\tcudaError_t err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpu error in BilinearSampler.updateGradInput %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n}\r\n"
  },
  {
    "path": "my_package/InterpolationCh/interpolationch_cuda_kernel.cuh",
    "content": "#pragma once\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/Context.h>\r\n#include <cuda_runtime.h>\r\n\r\n\r\nint InterpolationChLayer_gpu_forward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\r\n\t\tat::Tensor& input1,\r\n\t\tat::Tensor& input2,\r\n\t\tat::Tensor& output\r\n\r\n\t\t);\r\n \r\nint InterpolationChLayer_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\r\n\t\tat::Tensor& input1,\r\n\t\tat::Tensor& input2,\r\n\t\tat::Tensor& gradoutput,\r\n\t\tat::Tensor& gradinput1,\r\n\t\tat::Tensor& gradinput2\r\n\t\t);\r\n"
  },
  {
    "path": "my_package/InterpolationCh/setup.py",
    "content": "#!/usr/bin/env python3\nimport os\nimport torch\n\nfrom setuptools import setup, find_packages\nfrom torch.utils.cpp_extension import BuildExtension, CUDAExtension\n\nfrom compiler_args import nvcc_args, cxx_args\n\nsetup(\n    name='interpolationch_cuda',\n    ext_modules=[\n        CUDAExtension('interpolationch_cuda', [\n            'interpolationch_cuda.cc',\n            'interpolationch_cuda_kernel.cu'\n        ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args})\n    ],\n    cmdclass={\n        'build_ext': BuildExtension\n    })\n"
  },
  {
    "path": "my_package/MinDepthFlowProjection/__init__.py",
    "content": "from  .minDepthFlowProjectionModule import *\n"
  },
  {
    "path": "my_package/MinDepthFlowProjection/minDepthFlowProjectionLayer.py",
    "content": "# this is for wrapping the customized layer\nimport torch\nfrom torch.autograd import Function\n#import _ext.my_lib as my_lib\nimport mindepthflowprojection_cuda as my_lib\n\nclass minDepthFlowProjectionLayer(Function):\n    def __init__(self,requires_grad):\n        super(minDepthFlowProjectionLayer,self).__init__()\n        # self.requires_grad = requires_grad\n\n    @staticmethod\n    def forward(ctx, input1, input2, requires_grad):\n        # print(\"Depth Aware Flow Projection\")\n        assert(input1.is_contiguous())\n        assert(input2.is_contiguous())\n        # self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it\n        # self.input2 = input2.contiguous()\n        fillhole = 1 if requires_grad == False else 0\n        # if input1.is_cuda:\n        #     self.device = torch.cuda.current_device()\n        # else:\n        #     self.device = -1\n\n        # count = torch.zeros(input1.size(0),1,input1.size(2),input1.size(3)) # for accumulating the homography projections\n        # output = torch.zeros(input1.size())\n\n        if input1.is_cuda:\n            # output = output.cuda()\n            # count = count.cuda()\n            # print(\"correct\")\n            count = torch.cuda.FloatTensor().resize_(input1.size(0), 1, input1.size(2), input1.size(3)).zero_()\n            output = torch.cuda.FloatTensor().resize_(input1.size()).zero_()\n            err = my_lib.minDepthFlowProjectionLayer_gpu_forward(input1,input2, count,output, fillhole)\n        else:\n            # output = torch.cuda.FloatTensor(input1.data.size())\n            count = torch.FloatTensor().resize_(input1.size(0), 1, input1.size(2), input1.size(3)).zero_()\n            output = torch.FloatTensor().resize_(input1.size()).zero_()\n            err = my_lib.minDepthFlowProjectionLayer_cpu_forward(input1,input2, count, output,fillhole)\n        if err != 0:\n            print(err)\n        # output = output/count # to divide the 
counter\n\n        # self.count = count #to keep this\n        # self.output = output\n\n        ctx.save_for_backward(input1, input2,count,output)\n        ctx.fillhole = fillhole\n\n        # print(self.input1[0, 0, :10, :10])\n        # print(self.count[0, 0, :10, :10])\n        # print(self.input1[0, 0, -10:, -10:])\n        # print(self.count[0, 0, -10:, -10:])\n\n        # the function returns the output to its caller\n        return output\n\n    @staticmethod\n    def backward(ctx, gradoutput):\n        # print(\"Backward of Filter Interpolation Layer\")\n        # gradinput1 = input1.new().zero_()\n        # gradinput2 = input2.new().zero_()\n        # gradinput1 = torch.zeros(self.input1.size())\n\n        input1, input2, count, output = ctx.saved_tensors\n        # fillhole = ctx.fillhole\n\n        if input1.is_cuda:\n            # print(\"CUDA backward\")\n            # gradinput1 = gradinput1.cuda(self.device)\n            gradinput1 = torch.cuda.FloatTensor().resize_(input1.size()).zero_()\n            gradinput2 = torch.cuda.FloatTensor().resize_(input2.size()).zero_()\n\n            err = my_lib.minDepthFlowProjectionLayer_gpu_backward(input1,input2,\n                                                               count, output,\n                                                               gradoutput, gradinput1,gradinput2)\n            # print(err)\n            if err != 0 :\n                print(err)\n\n        else:\n            # print(\"CPU backward\")\n            # print(gradoutput)\n            gradinput1 = torch.FloatTensor().resize_(input1.size()).zero_()\n            gradinput2 = torch.FloatTensor().resize_(input2.size()).zero_()\n            err = my_lib.minDepthFlowProjectionLayer_cpu_backward(input1, input2,\n                                                               count, output,\n                                                               gradoutput, gradinput1,gradinput2)\n            # print(err)\n            if err != 
0:\n                print(err)\n            # print(gradinput1)\n            # print(gradinput2)\n\n        # print(gradinput1)\n\n        return gradinput1,gradinput2,None\n"
  },
  {
    "path": "my_package/MinDepthFlowProjection/minDepthFlowProjectionModule.py",
    "content": "# modules/FlowProjectionModule.py\nfrom torch.nn.modules.module import Module\nfrom .minDepthFlowProjectionLayer import minDepthFlowProjectionLayer #, FlowFillholeLayer\n\n__all__ =['minDepthFlowProjectionModule']\n\nclass minDepthFlowProjectionModule(Module):\n    def __init__(self, requires_grad = True):\n        super(minDepthFlowProjectionModule, self).__init__()\n        self.requires_grad = requires_grad\n        # self.f = minDepthFlowProjectionLayer(requires_grad)\n\n    def forward(self, input1, input2):\n        return minDepthFlowProjectionLayer.apply(input1, input2,self.requires_grad)\n\n# class FlowFillholeModule(Module):\n#     def __init__(self,hole_value = -10000.0):\n#         super(FlowFillholeModule, self).__init__()\n#         self.f = FlowFillholeLayer()\n#\n#     def forward(self, input1):\n#         return self.f(input1)\n\n    # We don't need to write backward code for the module: autograd uses the backward() defined on minDepthFlowProjectionLayer.\n\n"
  },
  {
    "path": "my_package/MinDepthFlowProjection/mindepthflowprojection_cuda.cc",
    "content": "#include <torch/extension.h>\r\n#include <ATen/ATen.h>\r\n#include <stdio.h>\r\n#include <iostream>\r\n#include <ATen/cuda/CUDAContext.h> //works for 1.0.0\r\n\r\n#include \"mindepthflowprojection_cuda_kernel.cuh\"\r\n\r\n\r\nint minDepthFlowProjectionLayer_gpu_forward(\r\n\t\tat::Tensor&  input1,\r\n        at::Tensor&  input2,\r\n        at::Tensor&  count,\r\n\t\tat::Tensor&  output,\r\n\t\tint fillhole\r\n\t\t)\r\n{\r\n\r\n\tint error = 1 ;\r\n\r\n\tint channel = input1.size( 1);\r\n\tif(channel!= 2) return error;\r\n\tint batch = input1.size(0);\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\r\n    if(input2.size(1) !=1 ) return error;\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint input2_w_stride = input2.stride(3);\r\n\r\n\tint count_b_stride = count.stride(0);\r\n\tint count_c_stride = count.stride(1);\r\n\tint count_h_stride = count.stride(2);\r\n\tint count_w_stride = count.stride(3);\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n\t//if(w_stride !=1) return error;\r\n\tif(input1_b_stride != output.stride(0)) return error;\r\n\tif(input1_c_stride != output.stride(1)) return error;\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, output);\r\n//    printf(\"In gpu forward\\n\");\r\n\terror = minDepthFlowProjection_gpu_forward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement,w,h,channel,batch,fillhole,\r\n\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n            
count_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1,\r\n\t\t\tinput2,\r\n\t\t\tcount,\r\n\t\t\toutput);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\r\n\treturn error;\r\n\r\n}\r\n\r\nint minDepthFlowProjectionLayer_gpu_backward(\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n        at::Tensor&  count,\r\n\t\tat::Tensor&  output,\r\n        at::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1,\r\n\t\tat::Tensor&  gradinput2\r\n\t\t)\r\n{\r\n\tint error = 1 ;\r\n\tint channel = input1.size( 1);\r\n\tif(channel!=2) return error;\r\n\tint batch = input1.size(0);\r\n\tif(count.size( 0) != batch) return error;\r\n\tif(count.size(1) != 1) return error;\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n    if(input2.size(1) !=1 ) return error;\r\n    if(count.size(2) != h) return error;// to add some checkpoint\r\n\tif(count.size(3) != w) return error;\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint input2_w_stride = input2.stride(3);\r\n\r\n\tint count_b_stride = count.stride(0);\r\n\tint count_c_stride = count.stride(1);\r\n\tint count_h_stride = count.stride(2);\r\n\tint count_w_stride = count.stride(3);\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n\t//if(w_stride !=1) return error;\r\n\tif(input1_b_stride != gradinput1.stride(0)) return error;\r\n\tif(input1_c_stride != gradinput1.stride(1)) return error;\r\n\r\n//    printf(\"GPU backward: %d,%d,%d,%d\\n\", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride);\r\n//    printf(\"GPU backward: %d,%d,%d,%d\\n\", count_b_stride,count_c_stride,count_h_stride,count_w_stride);\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, 
gradoutput);\r\n\r\n\terror  = minDepthFlowProjection_gpu_backward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,batch,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n            count_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1,\r\n            input2,\r\n            count,\r\n            output,\r\n\t\t\tgradoutput,\r\n\t\t\tgradinput1,\r\n\t\t\tgradinput2\r\n\t\t\t);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\t  //printf(\"Am I good in backward function %d\",error);\r\n\r\n\treturn error;\r\n\r\n}\r\n\r\n\r\n\r\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\r\n  m.def(\"minDepthFlowProjectionLayer_gpu_forward\", &minDepthFlowProjectionLayer_gpu_forward, \"minDepthFlowProjection forward (CUDA)\");\r\n  m.def(\"minDepthFlowProjectionLayer_gpu_backward\", &minDepthFlowProjectionLayer_gpu_backward, \"minDepthFlowProjection backward (CUDA)\");\r\n}\r\n"
  },
  {
    "path": "my_package/MinDepthFlowProjection/mindepthflowprojection_cuda_kernel.cu",
    "content": "#include <stdio.h>\r\n\r\n#include \"mindepthflowprojection_cuda_kernel.cuh\"\r\n\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/NativeFunctions.h>\r\n#include <ATen/Dispatch.h>\r\n#include <ATen/cuda/CUDAApplyUtils.cuh>\r\n\r\n\r\n#define min(a,b) ((a<b)?(a):(b))\r\n#define max(a,b) ((a>b)?(a):(b))\r\n\r\n#define DEBUG (0)\r\n#ifndef BLOCKDIMX\r\n#define BLOCKDIMX (32)\r\n#endif\r\n#ifndef BLOCKDIMY\r\n#define BLOCKDIMY (16)\r\n#endif\r\nusing at::Half;\r\n\r\n\r\n\r\n\r\n//forward path of our layer\r\ntemplate <typename scalar_t>\r\n__global__ void minDepthFlowProjection_gpu_forward_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__  input1,\tconst scalar_t* __restrict__  input2,\r\n\t\tscalar_t* count,\r\n\t\tscalar_t* output\r\n\t\t)\r\n{\r\n\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\t//only use one dimensioon of the grid and block\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n//\tconst float fillvalue =0.0f;\r\n\r\n\tif( withinXbounds && withinYbounds) {\r\n        float fx = input1[ off + 0 * input1_c_stride + 
h_i * input1_h_stride + w_i ];\r\n        float fy = input1[ off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ];\r\n\r\n        float x2 = (float) (w_i) + fx;\r\n        float y2 = (float) (h_i) + fy;\r\n        if(x2>=0.0f && y2 >= 0.0f &&x2 <= (float) ( w-1) && y2 <= (float) (h -1 ) ){\r\n            int ix2_L = (int) (x2);\r\n            int iy2_T = (int) (y2);\r\n            int ix2_R = min(ix2_L + 1, w - 1);\r\n            int iy2_B = min(iy2_T + 1, h - 1);\r\n\r\n            float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i];\r\n            float old_exist = 0;\r\n\r\n            //while(1){\r\n            old_exist = count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L];\r\n            if(temp > old_exist){\r\n                output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] = -  fx; //update the new vector\r\n                output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L]  = -  fy;\r\n                count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] =   temp; // update to the best weight\r\n                //if ( count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] == temp){\r\n                //break;\r\n                //}\r\n            }\r\n            //}\r\n\r\n           // old_exist = count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] ;\r\n           // if(temp > old_exist){\r\n            //    output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ]= - fx;\r\n            //    output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] = - fy;\r\n            //    count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R]= temp ;\r\n           // }\r\n\r\n           // old_exist = count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L];\r\n           // if(temp > old_exist){\r\n            //    output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] 
= - fx;\r\n           //     output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L]  = - fy;\r\n           //     count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L]  = temp;\r\n           // }\r\n\r\n           // old_exist = count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R];\r\n           // if(temp> old_exist){\r\n            //    output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ] = - fx;\r\n            //    output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R]  = - fy;\r\n            //    count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] = temp;\r\n           // }\r\n        }\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\ntemplate <typename scalar_t>\r\n__global__ void minDepthFlowFillhole_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__  input1,\tconst scalar_t* __restrict__  input2,\r\n\t\tscalar_t*  count,\r\n\t\tscalar_t* output\r\n\t\t)\r\n{\r\n\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\t//only use one dimensioon of the grid and block\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off = batch_i * 
input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n//\tconst float fillvalue =0.0f;\r\n\r\n\tif( withinXbounds && withinYbounds) {\r\n\t    float temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + w_i] ;\r\n        if(temp <= 0.0f){\r\n            //search along the four directions,0/90/180/270, until finding at least one\r\n            int left_offset = w_i;            float left_temp = 0.0f;\r\n            while(left_temp == 0.0f && left_offset - 1 >= 0){\r\n                left_offset = left_offset - 1;\r\n                left_temp = count[batch_i * count_b_stride + 0 + h_i * count_h_stride + left_offset] ;\r\n            }\r\n\r\n            int right_offset = w_i ;            float right_temp = 0.0f;\r\n            while(right_temp ==0.0f && right_offset + 1 <= w - 1 ){\r\n                right_offset  = right_offset + 1 ;\r\n                right_temp =  count[batch_i * count_b_stride + 0 + h_i * count_h_stride + right_offset] ;\r\n            }\r\n\r\n            int up_offset = h_i ;            float up_temp = 0.0f;\r\n            while(up_temp == 0.0f && up_offset - 1 >=0){\r\n                up_offset = up_offset - 1;\r\n                up_temp =  count[batch_i * count_b_stride + 0 + up_offset * count_h_stride + w_i ] ;\r\n            }\r\n\r\n            int down_offset = h_i;            float down_temp = 0.0f;\r\n            while(down_temp == 0.0f && down_offset + 1 <= h - 1 ){\r\n                down_offset = down_offset + 1;\r\n                down_temp =  count[batch_i * count_b_stride + 0 + down_offset * count_h_stride + w_i] ;\r\n            }\r\n\r\n            if(left_temp + right_temp + up_temp + down_temp <=0.0f){\r\n                //printf(\"Can't fill hole, find no neighbor vectors availabel\\n\");\r\n                return;\r\n            }\r\n\r\n            left_temp = (left_temp > 0.0f)?1:0;\r\n            right_temp = (right_temp > 0.0f)?1:0;\r\n            up_temp = (up_temp > 0.0f)?1:0;\r\n            down_temp = 
(down_temp > 0.0f)?1:0;\r\n\r\n            output[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i ] = (\r\n                left_temp *  output[off + 0 * input1_c_stride + h_i * input1_h_stride + left_offset] +\r\n                right_temp *  output[off + 0 * input1_c_stride + h_i * input1_h_stride + right_offset]+\r\n                up_temp *  output[off + 0 * input1_c_stride + up_offset * input1_h_stride + w_i] +\r\n                down_temp *  output[off + 0 * input1_c_stride + down_offset * input1_h_stride + w_i]\r\n            )/(\r\n                left_temp + right_temp + up_temp + down_temp\r\n            ) ;\r\n\r\n\r\n            output[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i ] =(\r\n                left_temp *  output[off + 1 * input1_c_stride + h_i * input1_h_stride + left_offset] +\r\n                right_temp *  output[off + 1 * input1_c_stride + h_i * input1_h_stride + right_offset]+\r\n                up_temp *  output[off + 1 * input1_c_stride + up_offset * input1_h_stride + w_i] +\r\n                down_temp *  output[off + 1 * input1_c_stride + down_offset * input1_h_stride + w_i]\r\n            )/(\r\n                left_temp + right_temp + up_temp + down_temp\r\n            ) ;\r\n        }\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\ntemplate <typename scalar_t>\r\n__global__ void minDepthFlowProjection_gpu_backward_kernelfunc(\r\n\t\tconst int nElement,  \tconst int w, \tconst int h, const int channel,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__  input1,\tconst scalar_t* __restrict__  input2,\r\n\t\tscalar_t*  count,\r\n\t\tscalar_t* output,\r\n\t\tconst scalar_t* __restrict__  
gradoutput,\r\n\t\tscalar_t*  gradinput1,\r\n\t\tscalar_t*  gradinput2\r\n\t\t)\r\n{\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w;\r\n\tconst bool withinYbounds = h_i < h;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\tconst int off  = batch_i * input1_b_stride;\r\n\r\n\t//    __syncthreads();\r\n\r\n\tif(withinXbounds && withinYbounds){\r\n        float fx = input1[off + 0 * input1_c_stride + h_i * input1_h_stride + w_i] ;\r\n        float fy = input1[off + 1 * input1_c_stride + h_i * input1_h_stride + w_i] ;\r\n\r\n        float x2 = (float) ( w_i ) + fx;\r\n        float y2 = (float) ( h_i ) + fy;\r\n        if( x2 >=0.0f && y2 >= 0.0f && x2 <= (float) (w -1) && y2 <= (float) (h-1)){\r\n            int ix2_L = (int)(x2);\r\n            int iy2_T = (int)(y2);\r\n            int ix2_R  = min(ix2_L + 1, w-1);\r\n            int iy2_B  = min(iy2_T + 1, h-1);\r\n\r\n            float temp = input2[batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i];\r\n\r\n            int iu_offset = off + 0 * input1_c_stride + h_i * input1_h_stride + w_i;\r\n                        int iv_offset = off + 1 * input1_c_stride + h_i * input1_h_stride + w_i;\r\n            if(temp == count[batch_i * count_b_stride + 0+ iy2_T * count_h_stride + ix2_L] ){\r\n                gradinput1[iu_offset] += - gradoutput[off +  0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L];\r\n                gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] ;\r\n            }\r\n            if(temp == count[batch_i * count_b_stride +0 + iy2_T * count_h_stride  + ix2_R] ){\r\n    
            gradinput1[iu_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] ;\r\n                gradinput1[iv_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] ;\r\n            }\r\n            if(temp==count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] ){\r\n                gradinput1[iu_offset ] += -  gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] ;\r\n                gradinput1[iv_offset]  += -  gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] ;\r\n            }\r\n            if(temp == count[batch_i * count_b_stride + 0+ iy2_B * count_h_stride + ix2_R] ){\r\n                gradinput1[iu_offset ]  += -  gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] ;\r\n                gradinput1[iv_offset]   += -  gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R];\r\n            }\r\n\r\n\r\n            //int weight_offset = batch_i * input2_b_stride + 0 + h_i * input2_h_stride + w_i;\r\n            //gradinput2[weight_offset] += - gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] /\r\n            //                                count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] *\r\n            //                                (fx - output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] );\r\n            //gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] /\r\n            //                                count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] *\r\n            //                                (fx - output[off + 0 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] );\r\n            //gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] /\r\n            //                                count[batch_i * 
count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] *\r\n            //                                (fx - output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] );\r\n            //gradinput2[weight_offset] += -gradoutput[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] /\r\n            //                                count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] *\r\n            //                                (fx - output[off + 0 * input1_c_stride + iy2_B * input1_h_stride + ix2_R ] );\r\n\r\n            //gradinput2[weight_offset] += - gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L] /\r\n            //                                count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_L] *\r\n            //                                (fy - output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_L ] );\r\n            //gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R] /\r\n            //                                count[batch_i * count_b_stride + 0 + iy2_T * count_h_stride + ix2_R] *\r\n            //                                (fy - output[off + 1 * input1_c_stride + iy2_T * input1_h_stride + ix2_R ] );\r\n            //gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L] /\r\n            //                                count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_L] *\r\n            //                                (fy - output[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_L ] );\r\n            //gradinput2[weight_offset] += -gradoutput[off + 1 * input1_c_stride + iy2_B * input1_h_stride + ix2_R] /\r\n            //                                count[batch_i * count_b_stride + 0 + iy2_B * count_h_stride + ix2_R] *\r\n             //                               (fy - output[off + 1 * input1_c_stride + iy2_B * 
input1_h_stride + ix2_R ] );\r\n        }\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\n\r\n\r\nint minDepthFlowProjection_gpu_forward_kernel(\r\n\t\tcudaStream_t stream, \t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, \t\tconst int batch, const int fillhole,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\tat::Tensor&  input2,\r\n\t\tat::Tensor&  count,\r\n\t\tat::Tensor&  output\r\n\t\t)\r\n{\r\n    int error = -1;\r\n\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\r\n\t//\t\tblockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n//    printf(\"I am here\\n\");\r\n\t//extract the data of CudaTensor and use kernel to calculate.\r\n\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"minDepthFlowProjection_gpu_forward\", ([&] {\r\n\r\n\tminDepthFlowProjection_gpu_forward_kernelfunc<<<grid,block,0, stream >>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n            count_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()\r\n\t\t\t);\r\n\t\t\t\r\n\t}));\r\n\t\t\t\r\n    
cudaError_t err = cudaGetLastError();\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateOutput: %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n//    printf(\"I am there\\n\");\r\n\r\n\r\n\r\n\t//\t\t\tTHCudaCheck(cudaGetLastError());\r\n    err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateOutput: %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n//    printf(\"I am dd\\n\");\r\n\r\n    if(fillhole){\r\n\r\n//        printf(\"use flow fill hole\\n\");\r\n\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"minDepthFlowFillhole\", ([&] {\r\n\r\n        minDepthFlowFillhole_kernelfunc<<<grid,block,0,stream>>>(\r\n    \t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n            input2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n            count_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>()\r\n        );\r\n\t\t}));\r\n\r\n    err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateOutput: %s\\n\", cudaGetErrorString(err));\r\n\t\treturn error;\r\n\t}\r\n\r\n    }\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n}\r\n\r\nint minDepthFlowProjection_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int 
count_h_stride, const int count_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  count,        at::Tensor&  output,\r\n\t\tat::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1,\r\n\t\tat::Tensor&  gradinput2\r\n\t\t)\r\n{\r\n\r\n\tint error = -1;\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\t//blockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w + BLOCKDIMX - 1)/ BLOCKDIMX, (h + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"minDepthFlowProjection_gpu_backward\", ([&] {\r\n\r\n\tminDepthFlowProjection_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n            count_b_stride,count_c_stride,count_h_stride,count_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),input2.data<scalar_t>(),count.data<scalar_t>(),output.data<scalar_t>(),\r\n\t\t\tgradoutput.data<scalar_t>(), gradinput1.data<scalar_t>(), gradinput2.data<scalar_t>()\r\n\t\t\t);\r\n\t\t\t\t\t}));\r\n\r\n//    printf(\"gpu I am there\\n\");\r\n\r\n\tcudaError_t err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpu error in BilinearSampler.updateGradInput %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n//    printf(\"gpu I am here\\n\");\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n\r\n}"
  },
  {
    "path": "my_package/MinDepthFlowProjection/mindepthflowprojection_cuda_kernel.cuh",
    "content": "#pragma once\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/Context.h>\r\n#include <cuda_runtime.h>\r\n\r\nint minDepthFlowProjection_gpu_forward_kernel(\r\n\t\tcudaStream_t stream, \t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, \t\tconst int batch, const int fillhole,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  count,\r\n\t\tat::Tensor&  output\r\n\r\n\t\t);\r\n\r\nint minDepthFlowProjection_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,\r\n\t\tconst int h,\r\n\t\tconst int channel,\r\n\t\tconst int batch,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n        const int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n        const int count_b_stride, const int count_c_stride, const int count_h_stride, const int count_w_stride,\r\n\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n        at::Tensor&  count,\r\n        at::Tensor&  output,\r\n\t\tat::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1,\r\n\t\tat::Tensor&  gradinput2\r\n\t\t);\r\n"
  },
  {
    "path": "my_package/MinDepthFlowProjection/setup.py",
    "content": "#!/usr/bin/env python3\nimport os\nimport torch\n\nfrom setuptools import setup, find_packages\nfrom torch.utils.cpp_extension import BuildExtension, CUDAExtension\n\nfrom compiler_args import nvcc_args, cxx_args\n\nsetup(\n    name='mindepthflowprojection_cuda',\n    ext_modules=[\n        CUDAExtension('mindepthflowprojection_cuda', [\n            'mindepthflowprojection_cuda.cc',\n            'mindepthflowprojection_cuda_kernel.cu'\n        ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args})\n    ],\n    cmdclass={\n        'build_ext': BuildExtension\n    })\n"
  },
  {
    "path": "my_package/SeparableConv/SeparableConvLayer.py",
    "content": "# this is for wrapping the customized layer\r\nimport torch\r\nfrom torch.autograd import Function\r\nimport _ext.my_lib as my_lib\r\n\r\n#Please check how the STN FUNCTION is written :\r\n#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py\r\n#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py\r\n\r\nclass SeparableConvLayer(Function):\r\n    def __init__(self,filtersize):\r\n        self.filtersize = filtersize\r\n        super(SeparableConvLayer,self).__init__()\r\n\r\n    def forward(self, input1,input2,input3):\r\n        intBatches = input1.size(0)\r\n        intInputDepth = input1.size(1)\r\n        intInputHeight = input1.size(2)\r\n        intInputWidth = input1.size(3)\r\n        intFilterSize = min(input2.size(1), input3.size(1))\r\n        intOutputHeight = min(input2.size(2), input3.size(2))\r\n        intOutputWidth = min(input2.size(3), input3.size(3))\r\n\r\n        assert(intInputHeight - self.filtersize == intOutputHeight - 1)\r\n        assert(intInputWidth - self.filtersize == intOutputWidth - 1)\r\n        assert(intFilterSize == self.filtersize)\r\n\r\n        assert(input1.is_contiguous() == True)\r\n        assert(input2.is_contiguous() == True)\r\n        assert(input3.is_contiguous() == True)\r\n\r\n        output = input1.new().resize_(intBatches, intInputDepth, intOutputHeight, intOutputWidth).zero_()\r\n\r\n        # assert(input1.is_contiguous())\r\n        # assert(input2.is_contiguous())\r\n        self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it\r\n        self.input2 = input2.contiguous() # TODO: Note that this is simply a shallow copy?\r\n        self.input3 = input3.contiguous()\r\n        if input1.is_cuda:\r\n            self.device = torch.cuda.current_device()\r\n        else:\r\n            self.device = -1\r\n\r\n        if input1.is_cuda :\r\n            output = output.cuda()\r\n            err = 
my_lib.SeparableConvLayer_gpu_forward(input1, input2,input3, output)\r\n\r\n        else:\r\n            # output = torch.cuda.FloatTensor(input1.data.size())\r\n            err = my_lib.SeparableConvLayer_cpu_forward(input1, input2,input3, output)\r\n        if err != 0:\r\n            print(err)\r\n        # the function returns the output to its caller\r\n        return output\r\n\r\n    #TODO: if there are multiple outputs of this function, then the order should be well considered?\r\n    def backward(self, gradoutput):\r\n        # print(\"Backward of Interpolation Layer\")\r\n        # gradinput1 = input1.new().zero_()\r\n        # gradinput2 = input2.new().zero_()\r\n        gradinput1 = torch.zeros(self.input1.size())\r\n        gradinput2 = torch.zeros(self.input2.size())\r\n        gradinput3 = torch.zeros(self.input3.size())\r\n        if self.input1.is_cuda:\r\n            # print(\"CUDA backward\")\r\n            gradinput1 = gradinput1.cuda(self.device)\r\n            gradinput2 = gradinput2.cuda(self.device)\r\n            gradinput3 = gradinput3.cuda(self.device)\r\n\r\n            # the input1 image should not require any gradients\r\n            # print(\"Does input1 requires gradients? \" + str(self.input1.requires_grad))\r\n\r\n            err = my_lib.SeparableConvLayer_gpu_backward(self.input1,self.input2,self.input3, gradoutput,gradinput1,gradinput2,gradinput3)\r\n            if err != 0 :\r\n                print(err)\r\n\r\n        else:\r\n            # print(\"CPU backward\")\r\n            # print(gradoutput)\r\n            err = my_lib.SeparableConvLayer_cpu_backward(self.input1, self.input2, self.input3, gradoutput, gradinput1, gradinput2, gradinput3)\r\n            # print(err)\r\n            if err != 0 :\r\n                print(err)\r\n            # print(gradinput1)\r\n            # print(gradinput2)\r\n\r\n        # print(gradinput1)\r\n\r\n        return gradinput1, gradinput2,gradinput3"
  },
  {
    "path": "my_package/SeparableConv/SeparableConvModule.py",
    "content": "# modules/InterpolationLayer.py\r\nfrom torch.nn import Module\r\nfrom functions.SeparableConvLayer import SeparableConvLayer\r\n\r\nclass SeparableConvModule(Module):\r\n    def __init__(self,filtersize):\r\n        super(SeparableConvModule, self).__init__()\r\n        self.f = SeparableConvLayer(filtersize)\r\n\r\n    def forward(self, input1, input2, input3):\r\n        return self.f(input1, input2, input3)\r\n\r\n    #we actually dont need to write the backward code for a module, since we have \r\n\r\n"
  },
  {
    "path": "my_package/SeparableConv/__init__.py",
    "content": "from  .SeparableConvModule import *\n"
  },
  {
    "path": "my_package/SeparableConv/separableconv_cuda.cc",
    "content": "#include <torch/torch.h>\r\n#include <ATen/ATen.h>\r\n#include <stdio.h>\r\n#include <iostream>\r\n#include <ATen/cuda/CUDAContext.h> //works for 1.0.0\r\n\r\n#include \"separableconv_cuda_kernel.cuh\"\r\n\r\n\r\nint SeparableConvLayer_gpu_forward(\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  input3,\r\n\t\tat::Tensor&  output\r\n\r\n\t\t)\r\n\t\t{\r\n\tint error = 1 ;\r\n\r\n\tint channel = input1.size( 1);\r\n\tif(channel!=3) return error;\r\n\tint batch = input1.size(0);\r\n\tif(input2.size( 0) != batch) return error;\r\n\tif(input2.size(1) != input3.size(1)) return error; //change by zhenghe, am I right?\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\tif(input2.size(2) != h - input2.size(1) + 1) return error;// to add some checkpoint\r\n\tif(input2.size(3) != w - input2.size(1) + 1) return error;\r\n\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint input2_w_stride = input2.stride(3);\r\n\r\n    int input3_b_stride = input3.stride(0);\r\n\tint input3_c_stride = input3.stride(1);\r\n\tint input3_h_stride = input3.stride(2);\r\n\tint input3_w_stride = input3.stride(3);\r\n\r\n    int output_b_stride = output.stride(0);\r\n\tint output_c_stride = output.stride(1);\r\n\tint output_h_stride = output.stride(2);\r\n\tint output_w_stride = output.stride(3);\r\n//    printf(\"filter tensor shape: %d,%d,%d,%d\\n\", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride);\r\n\r\n\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n    if(input1_w_stride !=1) return error;\r\n\tif(input2_w_stride !=1) return error;\r\n    if(input3_w_stride !=1) return error;\r\n    if(output_w_stride !=1) return 
error;\r\n\r\n\tif(input2_b_stride != input3_b_stride) return error;\r\n\tif(input2_c_stride != input3_c_stride) return error;\r\n\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, output);\r\n\r\n\r\n\terror = SeparableConvLayer_gpu_forward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement,w,h,channel,batch,  input2.size(1),\r\n\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\t\t\tinput3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,\r\n\t\t\toutput_b_stride,output_c_stride,output_h_stride,output_w_stride,\r\n\r\n\r\n\r\n\t\t\tinput1,\r\n\t\t\tinput2,\r\n\t\t\tinput3,\r\n\t\t\toutput);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\r\n\treturn error;\r\n\r\n\t\t}\r\nint SeparableConvLayer_gpu_backward(\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  input3,\r\n\t\tat::Tensor&  gradoutput,\r\n\t\tat::Tensor&  gradinput1,\r\n\t\tat::Tensor&  gradinput2,\r\n\t\tat::Tensor&  gradinput3\r\n\t\t)\r\n\t\t{\r\n\r\n\r\n    int error = 1 ;\r\n\tint channel = input1.size( 1);\r\n\tif(channel!=3) return error;\r\n\tint batch = input1.size(0);\r\n\tif(input2.size( 0) != batch) return error;\r\n\tif(input2.size(1) != input2.size(1)) return error;\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\tif(input2.size(2) != h - input2.size(1) + 1) return error;// to add some checkpoint\r\n\tif(input2.size(3) != w - input2.size(1) + 1) return error;\r\n\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint 
input2_w_stride = input2.stride(3);\r\n\r\n    int input3_b_stride = input3.stride(0);\r\n\tint input3_c_stride = input3.stride(1);\r\n\tint input3_h_stride = input3.stride(2);\r\n\tint input3_w_stride = input3.stride(3);\r\n\r\n    int output_b_stride = gradoutput.stride(0);\r\n\tint output_c_stride = gradoutput.stride(1);\r\n\tint output_h_stride = gradoutput.stride(2);\r\n\tint output_w_stride = gradoutput.stride(3);\r\n\r\n//    printf(\"filter tensor shape: %d,%d,%d,%d\\n\", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride);\r\n\r\n\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n\tif(input1_w_stride !=1) return error;\r\n\tif(input2_w_stride !=1) return error;\r\n    if(input3_w_stride !=1) return error;\r\n    if(output_w_stride !=1) return error;\r\n\r\n    if(input1_b_stride != gradinput1.stride(0)) return error;\r\n\tif(input2_b_stride != gradinput2.stride(0)) return error;\r\n\tif(input1_c_stride != gradinput1.stride(1)) return error;\r\n\tif(input2_c_stride != gradinput2.stride(1)) return error;\r\n\tif(input3_c_stride != gradinput3.stride(1)) return error;\r\n\r\n//    printf(\"GPU backward: %d,%d,%d,%d\\n\", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride);\r\n\r\n\tint\tnElement = 0;//UNUSED  THCudaTensor_nElement(state, gradoutput);\r\n\r\n\terror  = SeparableConvLayer_gpu_backward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,batch,  
input2.size(1),\r\n\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\t\t\tinput3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,\r\n\t\t\toutput_b_stride,output_c_stride,output_h_stride,output_w_stride,\r\n\r\n\t\t\tinput1,\r\n\t\t\tinput2,\r\n\t\t\tinput3,\r\n\t\t\tgradoutput,\r\n\t\t\tgradinput1,\r\n\t\t\tgradinput2,\r\n\t\t\tgradinput3\r\n\t\t\t);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\r\n\treturn error;\r\n}\r\n\r\n\r\n\r\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\r\n  m.def(\"SeparableConvLayer_gpu_forward\", &SeparableConvLayer_gpu_forward, \"SeparableConv forward (CUDA)\");\r\n  m.def(\"SeparableConvLayer_gpu_backward\", &SeparableConvLayer_gpu_backward, \"SeparableConv backward (CUDA)\");\r\n}\r\n"
  },
  {
    "path": "my_package/SeparableConv/separableconv_cuda_kernel.cu",
    "content": "#include <stdio.h>\r\n\r\n#include \"separableconv_cuda_kernel.cuh\"\r\n\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/NativeFunctions.h>\r\n#include <ATen/Dispatch.h>\r\n#include <ATen/cuda/CUDAApplyUtils.cuh>\r\n\r\n\r\n#define min(a,b) ((a<b)?(a):(b))\r\n#define max(a,b) ((a>b)?(a):(b))\r\n\r\n#define DEBUG (0)\r\n#ifndef BLOCKDIMX\r\n#define BLOCKDIMX (32)\r\n#endif\r\n#ifndef BLOCKDIMY\r\n#define BLOCKDIMY (16)\r\n#endif\r\nusing at::Half;\r\n\r\n\r\n\r\n\r\n//forward path of our layer\r\ntemplate <typename scalar_t>\r\n__global__ void SeparableConvLayer_gpu_forward_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, const int filter_size,\r\n\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\t\tconst int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__  input1,    \t\tconst scalar_t* __restrict__  input2,    \tconst scalar_t* __restrict__  input3, \tscalar_t*  output\r\n\r\n\t\t)\r\n{\r\n\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\t//only use one dimensioon of the grid and block\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w - filter_size + 1;\r\n\tconst bool withinYbounds = h_i < h - filter_size + 1;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\r\n\r\n\t//    
__syncthreads();\r\n//\tconst float fillvalue =0.0f;\r\n\r\n\tif( withinXbounds && withinYbounds) {\r\n\r\n\t\tfor ( int c_i = 0 ; c_i < channel ; c_i ++){\r\n\r\n\t\t\tfloat out = 0.0f;\r\n\t\t\tfor (int intFilterY = 0; intFilterY < filter_size; intFilterY += 1) {\r\n\t\t\tfor (int intFilterX = 0; intFilterX < filter_size; intFilterX += 1) {\r\n\t\t\t\tfloat temp1 = input1[batch_i * input1_b_stride + c_i * input1_c_stride + (h_i + intFilterY )* input1_h_stride + (w_i + intFilterX)];\r\n\t\t\t\tfloat temp2 = input2[batch_i * input2_b_stride + intFilterY * input2_c_stride + h_i * input2_h_stride + w_i ];\r\n\t\t\t\tfloat temp3 = input3[batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i ];\r\n\t\t\t\tout += temp1* temp2 * temp3;\r\n\t\t\t}\r\n\t\t\t}\r\n\t\t\toutput[batch_i * output_b_stride + c_i* output_c_stride + h_i * output_h_stride + w_i ] = out;\r\n\t\t}\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n \r\n\r\ntemplate <typename scalar_t>\r\n__global__ void SeparableConvLayer_gpu_backward_kernelfunc(\r\n\t\tconst int nElement, \t   const int w, \t\tconst int h, \t\tconst int channel, const int filter_size,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\t\tconst int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__  input1,        \t\tconst scalar_t* __restrict__  input2,\t\tconst scalar_t* __restrict__  input3,\r\n\t\tconst scalar_t* __restrict__  gradoutput,    \t\tscalar_t*  gradinput1,  \t\tscalar_t*  gradinput2,  \t\tscalar_t* gradinput3\r\n\t\t)\r\n\t\t{\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from 
ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w - filter_size + 1;\r\n\tconst bool withinYbounds = h_i < h - filter_size + 1;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\r\n\tif(withinXbounds && withinYbounds){\r\n\r\n\t\tfor (int c_i = 0 ; c_i < channel ; c_i ++){\r\n\t\t\t\tfor (int   intFilterY = 0; intFilterY < filter_size; intFilterY += 1) {\r\n\t\t\t\tfor ( int  intFilterX = 0; intFilterX < filter_size; intFilterX += 1) {\r\n\t\t\t\t\tfloat temp1 = input1[batch_i * input1_b_stride + c_i * input1_c_stride + (h_i + intFilterY )* input1_h_stride + (w_i + intFilterX)];\r\n\t\t\t\t\tfloat temp2 = input2[batch_i * input2_b_stride + intFilterY * input2_c_stride + h_i * input2_h_stride + w_i ];\r\n\t\t\t\t\tfloat temp3 = input3[batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i ];\r\n\r\n\t\t\t\t\tfloat gradout = gradoutput[batch_i * output_b_stride + c_i* output_c_stride + h_i * output_h_stride + w_i ];\r\n\r\n\t\t\t\t\tatomicAdd(&gradinput1[batch_i * input1_b_stride + c_i * input1_c_stride + (h_i + intFilterY )* input1_h_stride + (w_i + intFilterX)],\r\n\t\t\t\t\t\tgradout * temp2 * temp3);\r\n\t\t\t\t\tatomicAdd(&gradinput2[batch_i * input2_b_stride + intFilterY * input2_c_stride  +  h_i * input2_h_stride + w_i ],\r\n\t\t\t\t\t\tgradout * temp1 * temp3);\r\n\t\t\t\t\tatomicAdd(&gradinput3 [batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i ] ,\r\n\t\t\t\t\t\tgradout * temp1 * temp2);\r\n\t\t\t\t}\r\n\t\t\t\t}\r\n\t\t}\r\n\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\n\r\n\r\nint SeparableConvLayer_gpu_forward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst 
int channel, \t\tconst int batch,const int filter_size,\r\n\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\t\tconst int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,\r\n\r\n\t\tat::Tensor&  input1,    \t\tat::Tensor&  input2,    \tat::Tensor&  input3, \tat::Tensor&  output\r\n\r\n\t\t)\r\n{\r\n\tint error = 1 ;\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\r\n\t//\t\tblockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w  - filter_size + 1 + BLOCKDIMX - 1)/ BLOCKDIMX, (h  - filter_size + 1 + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n\t//extract the data of CudaTensor and use kernel to calculate.\r\n\t\t\t\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"DepthFlowProjection_gpu_backward\", ([&] {\r\nSeparableConvLayer_gpu_forward_kernelfunc<<<grid,block,0, stream >>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel, filter_size,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\t\t\tinput3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,\r\n\t\t\toutput_b_stride,output_c_stride,output_h_stride,output_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),input2.data<scalar_t>(),input3.data<scalar_t>(), output.data<scalar_t>()\r\n\t\t\t);\r\n 
\t\t\t\t\t}));\r\n\r\n\t//\t\t\tTHCudaCheck(cudaGetLastError());\r\n\tcudaError_t err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateOutput: %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n}\r\n\r\n\r\nint SeparableConvLayer_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,    \t\tconst int h,    \t\tconst int channel,  \t\tconst int batch, const int filter_size,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\t\tconst int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,\r\n\r\n\t\tat::Tensor&  input1,        \t\tat::Tensor&  input2,\t\tat::Tensor&  input3,\r\n\r\n\t\tat::Tensor&  gradoutput,    \t\tat::Tensor&  gradinput1,  \t\tat::Tensor&  gradinput2,  \t\tat::Tensor&  gradinput3\r\n\t\t)\r\n{\r\n\r\n\tint error = 1 ;\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\r\n\t//blockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w - filter_size + 1 + BLOCKDIMX - 1)/ BLOCKDIMX, (h  - filter_size + 1+ BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n\r\n//    cudaMemset((void*)gradinput1, 0, input1_b_stride * batch * sizeof(float));\r\n//    cudaMemset((void*)gradinput2, 0, input2_b_stride * batch * sizeof(float));\r\n//    
cudaMemset((void*)gradinput3, 0, input3_b_stride * batch * sizeof(float));\r\n\r\n\t\t\t\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"DepthFlowProjection_gpu_backward\", ([&] {\r\nSeparableConvLayer_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel, filter_size,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\t\t\tinput3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,\r\n\t\t\toutput_b_stride,output_c_stride,output_h_stride,output_w_stride,\r\n\r\n\r\n\t\t\tinput1.data<scalar_t>(), \t\t\tinput2.data<scalar_t>(),         input3.data<scalar_t>(),  \t\t\tgradoutput.data<scalar_t>(),\r\n\t\t\tgradinput1.data<scalar_t>(), \t\t\tgradinput2.data<scalar_t>(),     gradinput3.data<scalar_t>()\r\n\t\t\t);\r\n \t\t\t\t\t}));\r\n\r\n\tcudaError_t err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateGradInput %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n}"
  },
  {
    "path": "my_package/SeparableConv/separableconv_cuda_kernel.cuh",
    "content": "#pragma once\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/Context.h>\r\n#include <cuda_runtime.h>\r\n\r\nint SeparableConvLayer_gpu_forward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, \t\tconst int batch, const int filter_size,\r\n\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\t\tconst int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,\r\n\r\n\t\tat::Tensor& input1,    \t\tat::Tensor& input2,    \tat::Tensor& input3, \tat::Tensor& output\r\n\r\n\t\t);\r\n\r\nint SeparableConvLayer_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,    \t\tconst int h,    \t\tconst int channel,  \t\tconst int batch, const int filter_size,\r\n\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\t\tconst int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,\r\n\r\n\t\tat::Tensor& input1,        \t\tat::Tensor& input2,\t\tat::Tensor& input3,\r\n\r\n\t\tat::Tensor& gradoutput,    \t\tat::Tensor& gradinput1,  \t\tat::Tensor& gradinput2,  \t\tat::Tensor& gradinput3\r\n\t\t);\r\n"
  },
  {
    "path": "my_package/SeparableConv/setup.py",
    "content": "#!/usr/bin/env python3\nimport os\nimport torch\n\nfrom setuptools import setup, find_packages\nfrom torch.utils.cpp_extension import BuildExtension, CUDAExtension\n\nfrom compiler_args import nvcc_args, cxx_args\n\nsetup(\n    name='separableconv_cuda',\n    ext_modules=[\n        CUDAExtension('separableconv_cuda', [\n            'separableconv_cuda.cc',\n            'separableconv_cuda_kernel.cu'\n        ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args})\n    ],\n    cmdclass={\n        'build_ext': BuildExtension\n    })\n"
  },
  {
    "path": "my_package/SeparableConvFlow/SeparableConvFlowLayer.py",
    "content": "# this is for wrapping the customized layer\r\nimport torch\r\nfrom torch.autograd import Function\r\nimport separableconvflow_cuda as my_lib\r\nimport warnings\r\n#Please check how the STN FUNCTION is written :\r\n#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/gridgen.py\r\n#https://github.com/fxia22/stn.pytorch/blob/master/script/functions/stn.py\r\n\r\nclass SeparableConvFlowLayer(Function):\r\n    def __init__(self,filtersize):\r\n        self.filtersize = filtersize\r\n        warnings.warn(\"\\nSeparable Conv Flow Layer is not precise enough for optical flow due to a divison operation\")\r\n        super(SeparableConvFlowLayer,self).__init__()\r\n\r\n    def forward(self, input1,input2,input3):\r\n        intBatches = input1.size(0)\r\n        intInputDepth = input1.size(1)\r\n        intInputHeight = input1.size(2)\r\n        intInputWidth = input1.size(3)\r\n        intFilterSize = min(input2.size(1), input3.size(1))\r\n        intOutputHeight = min(input2.size(2), input3.size(2))\r\n        intOutputWidth = min(input2.size(3), input3.size(3))\r\n\r\n        assert(intInputHeight - self.filtersize == intOutputHeight - 1)\r\n        assert(intInputWidth - self.filtersize == intOutputWidth - 1)\r\n        assert(intFilterSize == self.filtersize)\r\n\r\n        assert(input1.is_contiguous() == True)\r\n        assert(input2.is_contiguous() == True)\r\n        assert(input3.is_contiguous() == True)\r\n\r\n        # output = input1.new().resize_(intBatches, intInputDepth, intOutputHeight, intOutputWidth).zero_()\r\n        flow_ouput = torch.zeros(intBatches, 2,intOutputHeight, intOutputWidth) # as a byproduct of SepConv, but no\r\n\r\n        # assert(input1.is_contiguous())\r\n        # assert(input2.is_contiguous())\r\n        self.input1 = input1.contiguous() # need to use in the backward process, so we need to cache it\r\n        self.input2 = input2.contiguous() # TODO: Note that this is simply a shallow copy?\r\n       
 self.input3 = input3.contiguous()\r\n        if input1.is_cuda:\r\n            self.device = torch.cuda.current_device()\r\n        else:\r\n            self.device = -1\r\n\r\n        if input1.is_cuda :\r\n            # output = output.cuda()\r\n            flow_ouput = flow_ouput.cuda()\r\n            err = my_lib.SeparableConvFlowLayer_gpu_forward(input1, input2,input3,flow_ouput)\r\n\r\n        else:\r\n            # output = torch.cuda.FloatTensor(input1.data.size())\r\n            err = my_lib.SeparableConvFlowLayer_cpu_forward(input1, input2,input3,flow_ouput)\r\n        if err != 0:\r\n            print(err)\r\n        # the function returns the output to its caller\r\n        return flow_ouput\r\n\r\n    #TODO: if there are multiple outputs of this function, then the order should be well considered?\r\n    def backward(self, gradoutput):\r\n        # print(\"Backward of Interpolation Layer\")\r\n        # gradinput1 = input1.new().zero_()\r\n        # gradinput2 = input2.new().zero_()\r\n        gradinput1 = torch.zeros(self.input1.size()) # the input1 has zero gradient because flow backprop. nothing to gradinput1\r\n        gradinput2 = torch.zeros(self.input2.size())\r\n        gradinput3 = torch.zeros(self.input3.size())\r\n        if self.input1.is_cuda:\r\n            # print(\"CUDA backward\")\r\n            gradinput1 = gradinput1.cuda(self.device)\r\n            gradinput2 = gradinput2.cuda(self.device)\r\n            gradinput3 = gradinput3.cuda(self.device)\r\n\r\n            # the input1 image should not require any gradients\r\n            # print(\"Does input1 requires gradients? 
\" + str(self.input1.requires_grad))\r\n\r\n            # err = my_lib.SeparableConvFlowLayer_gpu_backward(self.input1,self.input2,self.input3, gradoutput,gradinput1,gradinput2,gradinput3)\r\n            err = my_lib.SeparableConvFlowLayer_gpu_backward(self.input1,self.input2,self.input3, gradoutput,gradinput1,gradinput2,gradinput3)\r\n            if err != 0 :\r\n                print(err)\r\n\r\n        else:\r\n            # print(\"CPU backward\")\r\n            # print(gradoutput)\r\n            # print(err)\r\n            # err = my_lib.SeparableConvFlowLayer_cpu_backward(self.input1, self.input2, self.input3, gradoutput, gradinput1, gradinput2, gradinput3)\r\n            err = my_lib.SeparableConvFlowLayer_cpu_backward(self.input1, self.input2, self.input3, gradoutput, gradinput1, gradinput2, gradinput3)\r\n\r\n            if err != 0 :\r\n                print(err)\r\n            # print(gradinput1)\r\n            # print(gradinput2)\r\n\r\n        # print(gradinput1)\r\n\r\n        return gradinput1, gradinput2,gradinput3"
  },
  {
    "path": "my_package/SeparableConvFlow/SeparableConvFlowModule.py",
    "content": "# my_package/SeparableConvFlow/SeparableConvFlowModule.py\r\nfrom torch.nn import Module\r\nfrom .SeparableConvFlowLayer import SeparableConvFlowLayer\r\nimport  torch\r\nclass SeparableConvFlowModule(Module):\r\n    def __init__(self,filtersize):\r\n        super(SeparableConvFlowModule, self).__init__()\r\n        self.f = SeparableConvFlowLayer(filtersize)\r\n\r\n    def forward(self, input1, input2, input3):\r\n        # temp2 = torch.div(input2, torch.sum(input2,dim=1,keepdim=True))\r\n        return self.f(input1, input2, input3)\r\n\r\n    # We do not need to write backward code for the module, since the wrapped SeparableConvFlowLayer (self.f) defines its own backward.\r\n\r\n"
  },
  {
    "path": "my_package/SeparableConvFlow/__init__.py",
    "content": "from  .SeparableConvFlowModule import *\n"
  },
  {
    "path": "my_package/SeparableConvFlow/separableconvflow_cuda.cc",
    "content": "#include <torch/torch.h>\r\n#include <ATen/ATen.h>\r\n#include <stdio.h>\r\n#include <iostream>\r\n#include <ATen/cuda/CUDAContext.h> //works for 1.0.0\r\n\r\n#include \"separableconvflow_cuda_kernel.cuh\"\r\n\r\nint SeparableConvFlowLayer_gpu_forward(\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  input3,\r\n\t\t//at::Tensor&  output,\r\n\t\tat::Tensor&  flow_output\r\n\r\n\t\t)\r\n\t\t{\r\n\tint error = 1 ;\r\n    //int point  =0 ;printf(\"debug point  %d\\n\", point++ );\r\n\r\n\tint channel = input1.size( 1);\r\n\tif(channel!=3) return error;\r\n\tint batch = input1.size(0);\r\n\tif(input2.size(0) != batch) return error;\r\n\tif(input2.size(1) != input2.size(1)) return error;\r\n    //printf(\"debug point  %d\\n\", point++ );\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\tif(input2.size(2) != h - input2.size(1) + 1) return error;// to add some checkpoint\r\n\tif(input2.size(3) != w - input2.size(1) + 1) return error;\r\n\t\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint input2_w_stride = input2.stride(3);\r\n\r\n    int input3_b_stride = input3.stride(0);\r\n\tint input3_c_stride = input3.stride(1);\r\n\tint input3_h_stride = input3.stride(2);\r\n\tint input3_w_stride = input3.stride(3);\r\n\r\n    //int output_b_stride = output.stride(0);\r\n\t//int output_c_stride = output.stride(1);\r\n\t//int output_h_stride = output.stride(2);\r\n\t//int output_w_stride = output.stride(3);\r\n\t\r\n    int flow_output_b_stride = flow_output.stride(0);\r\n\tint flow_output_c_stride = flow_output.stride(1);\r\n\tint flow_output_h_stride = flow_output.stride(2);\r\n\tint flow_output_w_stride = flow_output.stride(3);\t\r\n    
//printf(\"filter tensor shape: %d,%d,%d,%d\\n\", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride);\r\n\r\n\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n    if(input1_w_stride !=1) return error;\r\n\tif(input2_w_stride !=1) return error;\r\n    if(input3_w_stride !=1) return error;\r\n   // if(output_w_stride !=1) return error;\r\n\tif(flow_output_w_stride !=1) return error;\r\n\r\n\r\n\tif(input2_b_stride != input3_b_stride) return error;\r\n\tif(input2_c_stride != input3_c_stride) return error;\r\n    //printf(\"filter tensor shape: %d,%d,%d,%d\\n\", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride);\r\n\r\n\r\n\tint\tnElement = 0;//UNUSED  0;//UNUSED  THCudaTensor_nElement(state, flow_output);\r\n\r\n\r\n\terror = SeparableConvFlowLayer_gpu_forward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement,w,h,channel,batch,  input2.size(1),\r\n\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\t\t\tinput3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,\r\n\t\t//\toutput_b_stride,output_c_stride,output_h_stride,output_w_stride,\r\n\t\t\tflow_output_b_stride,flow_output_c_stride,flow_output_h_stride,flow_output_w_stride,\r\n\r\n\r\n\r\n\t\t\tinput1,\r\n\t\t\tinput2,\r\n\t\t\tinput3,\r\n\t\t\t//output ,\r\n\t\t\tflow_output \r\n\t\t\t\r\n\t\t\t);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n    //printf(\"filter tensor shape: %d,%d,%d,%d\\n\", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride);\r\n\r\n\treturn error;\r\n\r\n\t\t}\r\nint SeparableConvFlowLayer_gpu_backward(\r\n\t\tat::Tensor&  input1,\r\n\t\tat::Tensor&  input2,\r\n\t\tat::Tensor&  input3,\r\n\t\tat::Tensor&  gradflow_output,\r\n\t\tat::Tensor&  gradinput1,\r\n\t\tat::Tensor&  gradinput2,\r\n\t\tat::Tensor&  
gradinput3\r\n\t\t)\r\n\t\t{\r\n\r\n\r\n    int error = 1 ;\r\n\tint channel = input1.size( 1);\r\n\tif(channel!=3) return error;\r\n\tint batch = input1.size(0);\r\n\tif(input2.size( 0) != batch) return error;\r\n\tif(input2.size(1) != input2.size(1)) return error;\r\n\r\n\tint h = input1.size(2);\r\n\tint w = input1.size(3);\r\n\tif(input2.size(2) != h - input2.size(1) + 1) return error;// to add some checkpoint\r\n\tif(input2.size(3) != w - input2.size(1) + 1) return error;\r\n\r\n\r\n\tint input1_b_stride = input1.stride(0);\r\n\tint input1_c_stride = input1.stride(1);\r\n\tint input1_h_stride = input1.stride(2);\r\n\tint input1_w_stride = input1.stride(3);\r\n\r\n\tint input2_b_stride = input2.stride(0);\r\n\tint input2_c_stride = input2.stride(1);\r\n\tint input2_h_stride = input2.stride(2);\r\n\tint input2_w_stride = input2.stride(3);\r\n\r\n    int input3_b_stride = input3.stride(0);\r\n\tint input3_c_stride = input3.stride(1);\r\n\tint input3_h_stride = input3.stride(2);\r\n\tint input3_w_stride = input3.stride(3);\r\n\r\n    //int output_b_stride = gradoutput.stride(0);\r\n\t//int output_c_stride = gradoutput.stride(1);\r\n\t//int output_h_stride = gradoutput.stride(2);\r\n\t//int output_w_stride = gradoutput.stride(3);\r\n\t\r\n    int flow_output_b_stride = gradflow_output.stride(0);\r\n\tint flow_output_c_stride = gradflow_output.stride(1);\r\n\tint flow_output_h_stride = gradflow_output.stride(2);\r\n\tint flow_output_w_stride = gradflow_output.stride(3);\t\t\r\n\r\n//    printf(\"filter tensor shape: %d,%d,%d,%d\\n\", input3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride);\r\n\r\n\r\n\t//TODO: do we need to assert the w_stride to be 1\r\n\tif(input1_w_stride !=1) return error;\r\n\tif(input2_w_stride !=1) return error;\r\n    if(input3_w_stride !=1) return error;\r\n  //  if(output_w_stride !=1) return error;\r\n\tif(flow_output_w_stride !=1) return error;\r\n\r\n    if(input1_b_stride != gradinput1.stride(0)) return 
error;\r\n\tif(input2_b_stride != gradinput2.stride(0)) return error;\r\n\tif(input1_c_stride != gradinput1.stride(1)) return error;\r\n\tif(input2_c_stride != gradinput2.stride(1)) return error;\r\n\tif(input3_c_stride != gradinput3.stride(1)) return error;\r\n\r\n//    printf(\"GPU backward: %d,%d,%d,%d\\n\", input1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride);\r\n\r\n\tint\tnElement = 0;//UNUSED  0;//UNUSED  THCudaTensor_nElement(state, gradflow_output);\r\n\r\n\terror  = SeparableConvFlowLayer_gpu_backward_kernel(\r\n//\t\t\tat::globalContext().getCurrentCUDAStream(), //works for 0.4.1\r\n           at::cuda::getCurrentCUDAStream(), //works for 1.0.0\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel,batch,  input2.size(1),\r\n\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\t\t\tinput3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,\r\n\t\t//\toutput_b_stride,output_c_stride,output_h_stride,output_w_stride,\r\n\t\t\tflow_output_b_stride,flow_output_c_stride,flow_output_h_stride,flow_output_w_stride,\r\n\r\n\t\t\tinput1,\r\n\t\t\tinput2,\r\n\t\t\tinput3,\r\n\t\t\tgradflow_output,\r\n\t\t\tgradinput1,\r\n\t\t\tgradinput2,\r\n\t\t\tgradinput3\r\n\t\t\t);\r\n\t  if (error) {AT_ERROR(\"CUDA call failed\");}\r\n\r\n\treturn error;\r\n}\r\n\r\n\r\n\r\n\r\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\r\n  m.def(\"SeparableConvFlowLayer_gpu_forward\", &SeparableConvFlowLayer_gpu_forward, \"SeparableConvFlow forward (CUDA)\");\r\n  m.def(\"SeparableConvFlowLayer_gpu_backward\", &SeparableConvFlowLayer_gpu_backward, \"SeparableConvFlow backward (CUDA)\");\r\n}\r\n"
  },
  {
    "path": "my_package/SeparableConvFlow/separableconvflow_cuda_kernel.cu",
    "content": "#include <stdio.h>\r\n\r\n#include \"separableconvflow_cuda_kernel.cuh\"\r\n\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/NativeFunctions.h>\r\n#include <ATen/Dispatch.h>\r\n#include <ATen/cuda/CUDAApplyUtils.cuh>\r\n\r\n\r\n#define min(a,b) ((a<b)?(a):(b))\r\n#define max(a,b) ((a>b)?(a):(b))\r\n\r\n#define DEBUG (0)\r\n#ifndef BLOCKDIMX\r\n#define BLOCKDIMX (32)\r\n#endif\r\n#ifndef BLOCKDIMY\r\n#define BLOCKDIMY (16)\r\n#endif\r\nusing at::Half;\r\n\r\n\r\n\r\n\r\n//forward path of our layer\r\ntemplate <typename scalar_t>\r\n__global__ void SeparableConvFlowLayer_gpu_forward_kernelfunc(\r\n\t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, const int filter_size,\r\n\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\t\t//const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,\r\n\t\tconst int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__   input1,    \t\tconst scalar_t* __restrict__   input2,    \tconst scalar_t* __restrict__   input3, \t scalar_t*  flow_output\r\n\r\n\t\t)\r\n{\r\n\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\t//only use one dimensioon of the grid and block\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w - 
filter_size + 1;\r\n\tconst bool withinYbounds = h_i < h - filter_size + 1;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\r\n\r\n\t//    __syncthreads();\r\n//\tconst float fillvalue =0.0f;\r\n\r\n\tif( withinXbounds && withinYbounds) {\r\n \r\n\t\tfloat flow_y = 0.0f;\r\n\t\tfloat sum_weights = 0.0f;\r\n\t\tfor (  int intFilterY = 0; intFilterY < filter_size; intFilterY += 1) {\r\n\t\t\tfloat temp2 = input2[batch_i * input2_b_stride + intFilterY * input2_c_stride + h_i * input2_h_stride + w_i ];\r\n\t\t\tflow_y += (float)(intFilterY) * temp2 ;\r\n\t\t\tsum_weights += \t\t\ttemp2;\r\n\t\t}\r\n\t\t//sum_weights = fabs(sum_weights);\r\n\t\tflow_y = flow_y / sum_weights - ((float)(filter_size)-1.0)/2.0;\r\n\t\tflow_output[batch_i * flow_output_b_stride + 1 * flow_output_c_stride+ h_i* flow_output_h_stride + w_i] = \r\n\t\t\t\t\tfabs(sum_weights) > 0.0f ?  flow_y : -2000;\r\n\r\n\t\tfloat flow_x = 0.0f;\r\n\t\tfloat sum_weights_x = 0.0f;\r\n\t\tfor (   int intFilterX = 0; intFilterX < filter_size; intFilterX += 1) {\r\n\t\t\tfloat temp3 = input3[batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i ];\r\n\t\t\tflow_x += (float)(intFilterX)  * temp3;\r\n\t\t\tsum_weights_x += \t\t temp3;\r\n\t\t}\r\n\t\t//sum_weights_x = fabs(sum_weights_x);\r\n\t\tflow_x = flow_x / sum_weights_x - ((float)(filter_size)-1.0)/2.0;\r\n\t\t// what if the sum_weight is less than zeros.\r\n\t\tflow_output[batch_i * flow_output_b_stride + 0 * flow_output_c_stride + h_i* flow_output_h_stride + w_i] =\r\n\t\t\t\t\tfabs(sum_weights_x) >0.0f ? 
flow_x : -2000;\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\n\r\ntemplate <typename scalar_t>\r\n__global__ void SeparableConvFlowLayer_gpu_backward_kernelfunc(\r\n\t\tconst int nElement, \t   const int w, \t\tconst int h, \t\tconst int channel, const int filter_size,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\t\t//const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,\r\n\t\tconst int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,\r\n\r\n\t\tconst scalar_t* __restrict__      input1,        \t\tconst scalar_t* __restrict__    input2,\t\tconst scalar_t* __restrict__      input3,\r\n\t\tconst scalar_t* __restrict__      gradflow_output,    \t\tscalar_t*  gradinput1,  \t\tscalar_t*  gradinput2,  \t\tscalar_t*  gradinput3\r\n\t\t)\r\n\t\t{\r\n\t//blockIdx.z : batch index from 0~B-1\r\n\t//blockIdx.y : height patch index from ceil(h/16)\r\n\t//blockIdx.x : width patch index from ceil(w/32)\r\n\r\n\t//threadidx.x: width index 0~31\r\n\t//threadIdx.y: height index 0~15\r\n\t//threadIdx.z: Not used\r\n\r\n\tconst int w_i = blockIdx.x * blockDim.x + threadIdx.x;\r\n\tconst int h_i = blockIdx.y * blockDim.y + threadIdx.y;\r\n\tconst bool withinXbounds = w_i < w - filter_size + 1;\r\n\tconst bool withinYbounds = h_i < h - filter_size + 1;\r\n\r\n\tconst int batch_i = blockIdx.z;\r\n\r\n\tif(withinXbounds && withinYbounds){\r\n\t\tfloat flow_y = 0.0f;\r\n\t\tfloat sum_weights = 0.0f;\r\n\t\t\r\n\t\tfor ( int  intFilterY = 0; intFilterY < filter_size; intFilterY += 1) {\r\n\t\t\tfloat temp2 = input2[batch_i * input2_b_stride + intFilterY * input2_c_stride + h_i * 
input2_h_stride + w_i ];\r\n\t\t\tflow_y += (float)(intFilterY) * temp2 ;\r\n\t\t\tsum_weights += \t\t\ttemp2;\r\n\t\t}\r\n\t\t//flow_y = flow_y / sum_weights - ((float)(filter_size)-1.0)/2.0;\r\n\t\t//flow_output_data[batch_i * flow_output_b_stride + 1 * flow_output_c_stride+ h_i* flow_output_h_stride + w_i] = \r\n\t\t//\t\tsum_weights >0.0f ?  flow_y : -2000;\r\n\t\t//float sign = sum_weights >0.0f ? 1.0f : -1.0f;\r\n\t\t//sum_weights = fabs(sum_weights);\r\n\t\tif(fabs(sum_weights) >0.0f ){\r\n\t\t\tfloat gradflow_y = gradflow_output[batch_i * flow_output_b_stride + 1* flow_output_c_stride + \r\n\t\t\t\t\t\t\t\th_i * flow_output_h_stride + w_i ] ;\t\t\t\t\t\r\n\t\t\tfloat offset = flow_y / ( sum_weights * sum_weights);\r\n\t\t\tfor (int  intFilterY = 0; intFilterY < filter_size; intFilterY += 1) {\r\n\t\t\t\tgradinput2[batch_i * input2_b_stride + intFilterY * input2_c_stride  +  h_i * input2_h_stride + w_i ] =\r\n\t\t\t\t\t\t\tgradflow_y *  ((float)(intFilterY) / sum_weights -  offset);\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\t\r\n\t\t\r\n\t\tfloat flow_x = 0.0f;\r\n\t\tfloat sum_weights_x = 0.0f;\r\n\t\tfor ( int  intFilterX = 0; intFilterX < filter_size; intFilterX += 1) {\r\n\t\t\tfloat temp3 = input3[batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i ];\r\n\t\t\tflow_x += (float)(intFilterX)  * temp3;\r\n\t\t\tsum_weights_x += \t\t temp3;\r\n\t\t}\r\n\t\t//flow_x = flow_x / sum_weights_x - ((float)(filter_size)-1.0)/2.0;\r\n\t\t//flow_output_data[batch_i * flow_output_b_stride + 0 * flow_output_c_stride + h_i* flow_output_h_stride + w_i] =\r\n\t\t//\t\t\tsum_weights_x >0 ? flow_x : -2000;\r\n\t\t//float sign_x = sum_weights_x >0.0f ? 
1.0f : -1.0f;\r\n\t\t//sum_weights_x = fabs(sum_weights_x);\t\r\n\t\tif(fabs(sum_weights_x) > 0.0f ){\r\n\t\t\t float gradflow_x = gradflow_output[batch_i * flow_output_b_stride + 0 * flow_output_c_stride + \r\n\t\t\t\t\t\t\t\t\th_i * flow_output_h_stride + w_i];\r\n\t\t\tfloat offset  = flow_x / (sum_weights_x * sum_weights_x);\r\n\t\t\tfor ( int intFilterX = 0; intFilterX < filter_size; intFilterX += 1) {\r\n\t\t\t\tgradinput3[batch_i * input3_b_stride + intFilterX * input3_c_stride + h_i * input3_h_stride + w_i ] +=\r\n\t\t\t\t\t\tgradflow_x * ((float)(intFilterX) /sum_weights_x - offset);\r\n\t\t\t}\r\n\t\t}\r\n\t}\r\n\treturn ;\r\n\r\n}\r\n\r\n\r\nint SeparableConvFlowLayer_gpu_forward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, \t\tconst int batch,const int filter_size,\r\n\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\t\t//const int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,\r\n\t\tconst int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,\r\n\r\n\t\tat::Tensor&  input1,    \t\tat::Tensor&  input2,    \tat::Tensor&  input3,   at::Tensor&  flow_output\r\n\r\n\t\t)\r\n{\r\n\tint error = 1 ;\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\r\n\t//\t\tblockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w  - filter_size + 1 + BLOCKDIMX - 1)/ BLOCKDIMX, (h  - filter_size + 1 + BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    
if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n\t//extract the data of CudaTensor and use kernel to calculate.\r\n\t\t\t\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"DepthFlowProjection_gpu_backward\", ([&] {\r\nSeparableConvFlowLayer_gpu_forward_kernelfunc<<<grid,block,0, stream >>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel, filter_size,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\t\t\tinput3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,\r\n\t\t\t//output_b_stride,output_c_stride,output_h_stride,output_w_stride,\r\n\t\t\tflow_output_b_stride,flow_output_c_stride,flow_output_h_stride,flow_output_w_stride,\r\n\r\n\t\t\tinput1.data<scalar_t>(),input2.data<scalar_t>(),input3.data<scalar_t>(), flow_output.data<scalar_t>()\r\n\t\t\t);\r\n \t\t\t\t\t}));\r\n\r\n\t//\t\t\tTHCudaCheck(cudaGetLastError());\r\n\tcudaError_t err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in SeparableConvFlowLayer_gpu_forward_kernel: %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n}\r\n\r\n\r\nint SeparableConvFlowLayer_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,    \t\tconst int h,    \t\tconst int channel,  \t\tconst int batch, const int filter_size,\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\t\t//const int output_b_stride, const int output_c_stride, const int 
output_h_stride, const int output_w_stride,\r\n\t\tconst int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,\r\n\r\n\t\tat::Tensor&  input1,        \t\tat::Tensor&  input2,\t\tat::Tensor&  input3,\r\n\r\n\t\tat::Tensor&  gradflow_output,    \t\tat::Tensor&  gradinput1,  \t\tat::Tensor&  gradinput2,  \t\tat::Tensor&  gradinput3\r\n\t\t)\r\n{\r\n\r\n\tint error = 1 ;\r\n\r\n\tdim3 grid;\r\n\tdim3 block;\r\n\r\n\r\n\t//blockthread = 128;\r\n\t//the threadIdx.x is sheduled first, then threadIdx.y, threadIdx.z\r\n\t//the three channels are processsed in one kernel\r\n\tblock  = dim3(BLOCKDIMX,BLOCKDIMY,1);\r\n\tgrid = dim3( (w - filter_size + 1 + BLOCKDIMX - 1)/ BLOCKDIMX, (h  - filter_size + 1+ BLOCKDIMY - 1) / BLOCKDIMY, batch);\r\n    if(BLOCKDIMX != 32 || BLOCKDIMY != 16||DEBUG)\r\n        printf(\"BLOCKDIMX revised to %d, BLOCKDIMY revised to %d \\n\", BLOCKDIMX,BLOCKDIMY);\r\n\r\n//    cudaMemset((void*)gradinput1, 0, input1_b_stride * batch * sizeof(float));\r\n//    cudaMemset((void*)gradinput2, 0, input2_b_stride * batch * sizeof(float));\r\n//    cudaMemset((void*)gradinput3, 0, input3_b_stride * batch * sizeof(float));\r\n\t\t\tAT_DISPATCH_FLOATING_TYPES(input1.type(), \"DepthFlowProjection_gpu_backward\", ([&] {\r\n\r\n\tSeparableConvFlowLayer_gpu_backward_kernelfunc <<<grid,block,0, stream>>>(\r\n\t\t\tnElement, //to let the nummous\r\n\t\t\tw,h,channel, filter_size,\r\n\t\t\tinput1_b_stride,input1_c_stride,input1_h_stride,input1_w_stride,\r\n\t\t\tinput2_b_stride,input2_c_stride,input2_h_stride,input2_w_stride,\r\n\t\t\tinput3_b_stride,input3_c_stride,input3_h_stride,input3_w_stride,\r\n\t\t\t//output_b_stride,output_c_stride,output_h_stride,output_w_stride,\r\n\t\t\tflow_output_b_stride,flow_output_c_stride,flow_output_h_stride,flow_output_w_stride,\r\n\r\n\r\n\t\t\tinput1.data<scalar_t>(), \t\t\tinput2.data<scalar_t>(),         input3.data<scalar_t>(),  
\t\t\tgradflow_output.data<scalar_t>(),\r\n\t\t\tgradinput1.data<scalar_t>(), \t\t\tgradinput2.data<scalar_t>(),     gradinput3.data<scalar_t>()\r\n\t\t\t);\r\n \t\t\t\t\t}));\r\n\r\n\tcudaError_t err = cudaGetLastError();\r\n\r\n\tif (err != cudaSuccess) {\r\n\t\tprintf(\"gpuerror in BilinearSampler.updateGradInput %s\\n\", cudaGetErrorString(err));\r\n\t\t//THError(\"aborting\");\r\n\t\treturn error;\r\n\t}\r\n\r\n\terror = 0;\r\n\treturn error;\r\n\r\n}\r\n\r\n\r\n\r\n\r\n \r\n"
  },
  {
    "path": "my_package/SeparableConvFlow/separableconvflow_cuda_kernel.cuh",
    "content": "#pragma once\r\n\r\n#include <ATen/ATen.h>\r\n#include <ATen/Context.h>\r\n#include <cuda_runtime.h>\r\n\r\nint SeparableConvFlowLayer_gpu_forward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w, \t\tconst int h, \t\tconst int channel, \t\tconst int batch, const int filter_size,\r\n\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\t//\tconst int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,\r\n\t\tconst int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,\r\n\r\n\t\tat::Tensor& input1,    \t\tat::Tensor& input2,    \tat::Tensor& input3, \t at::Tensor& flow_output\r\n\r\n\t\t);\r\n\r\nint SeparableConvFlowLayer_gpu_backward_kernel(\r\n\t\tcudaStream_t stream,\r\n\t\tconst int nElement,\r\n\t\tconst int w,    \t\tconst int h,    \t\tconst int channel,  \t\tconst int batch, const int filter_size,\r\n\r\n\t\tconst int input1_b_stride, const int input1_c_stride, const int input1_h_stride, const int input1_w_stride,\r\n\t\tconst int input2_b_stride, const int input2_c_stride, const int input2_h_stride, const int input2_w_stride,\r\n\t\tconst int input3_b_stride, const int input3_c_stride, const int input3_h_stride, const int input3_w_stride,\r\n\t//\tconst int output_b_stride, const int output_c_stride, const int output_h_stride, const int output_w_stride,\r\n\t\tconst int flow_output_b_stride, const int flow_output_c_stride, const int flow_output_h_stride, const int flow_output_w_stride,\r\n\r\n\t\tat::Tensor& input1,        \t\tat::Tensor& input2,\t\tat::Tensor& input3,\r\n\r\n\t\tat::Tensor& 
gradflow_output,    \t\tat::Tensor& gradinput1,  \t\tat::Tensor& gradinput2,  \t\tat::Tensor& gradinput3\r\n\t\t);\r\n\t\t\r\n\r\n"
  },
  {
    "path": "my_package/SeparableConvFlow/setup.py",
    "content": "#!/usr/bin/env python3\nimport os\nimport torch\n\nfrom setuptools import setup, find_packages\nfrom torch.utils.cpp_extension import BuildExtension, CUDAExtension\n\nfrom compiler_args import nvcc_args, cxx_args\n\nsetup(\n    name='separableconvflow_cuda',\n    ext_modules=[\n        CUDAExtension('separableconvflow_cuda', [\n            'separableconvflow_cuda.cc',\n            'separableconvflow_cuda_kernel.cu'\n        ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args})\n    ],\n    cmdclass={\n        'build_ext': BuildExtension\n    })\n"
  },
  {
    "path": "my_package/build.sh",
    "content": "#!/usr/bin/env bash\n\necho \"Need pytorch>=1.0.0\"\nsource activate pytorch1.0.0\n\nexport PYTHONPATH=$PYTHONPATH:$(pwd)\n\ncd MinDepthFlowProjection\nrm -rf build *.egg-info dist\npython setup.py install\ncd ..\n\ncd FlowProjection\nrm -rf build *.egg-info dist\npython setup.py install\ncd ..\n\ncd SeparableConv\nrm -rf build *.egg-info dist\npython setup.py install\ncd ..\n\ncd InterpolationCh\nrm -rf build *.egg-info dist\npython setup.py install\ncd ..\n\ncd DepthFlowProjection\nrm -rf build *.egg-info dist\npython setup.py install\ncd ..\n\ncd Interpolation\nrm -rf build *.egg-info dist\npython setup.py install\ncd ..\n\ncd SeparableConvFlow\nrm -rf build *.egg-info dist\npython setup.py install\ncd ..\n\ncd FilterInterpolation\nrm -rf build *.egg-info dist\npython setup.py install\ncd ..\n\n"
  },
  {
    "path": "my_package/clean.sh",
    "content": "#!/usr/bin/env bash\n\necho \"Need pytorch>=1.0.0\"\nsource activate pytorch1.0.0\n\ncd MinDepthFlowProjection\nrm -rf build *.egg-info dist\n#python setup.py install\ncd ..\n\ncd FlowProjection\nrm -rf build *.egg-info dist\n#python setup.py install\ncd ..\n\ncd SeparableConv\nrm -rf build *.egg-info dist\n#python setup.py install\ncd ..\n\ncd InterpolationCh\nrm -rf build *.egg-info dist\n#python setup.py install\ncd ..\n\ncd DepthFlowProjection\nrm -rf build *.egg-info dist\n#python setup.py install\ncd ..\n\ncd Interpolation\nrm -rf build *.egg-info dist\n#python setup.py install\ncd ..\n\ncd SeparableConvFlow\nrm -rf build *.egg-info dist\n#python setup.py install\ncd ..\n\ncd FilterInterpolation\nrm -rf build *.egg-info dist\n#python setup.py install\ncd ..\n\n"
  },
  {
    "path": "my_package/compiler_args.py",
    "content": "# References: https://developer.nvidia.com/cuda-gpus\r\nnvcc_args = [\r\n    # Tesla: K80, K80\r\n    # Quadro: (None)\r\n    # NVIDIA NVS: (None)\r\n    # Jetson: (None)\r\n    '-gencode', 'arch=compute_37,code=sm_37',\r\n\r\n    # Tesla: (None)\r\n    # Quadro: K1200, K620, M1200, M520, M5000M, M4000M, M3000M, M2000M, M1000M, K620M, M600M, M500M\r\n    # NVIDIA NVS: 810\r\n    # GeForce / Titan: GTX 750 Ti, GTX 750, GTX 960M, GTX 950M, 940M, 930M, GTX 860M, GTX 850M, 840M, 830M\r\n    # Jetson: (None)\r\n    '-gencode', 'arch=compute_50,code=sm_50',\r\n\r\n    # Tesla: M60, M40\r\n    # Quadro: M6000 24GB, M6000, M5000, M4000, M2000, M5500M, M2200, M620\r\n    # NVIDIA NVS: (None)\r\n    # GeForce / Titan: GTX TITAN X, GTX 980 Ti, GTX 980, GTX 970, GTX 960, GTX 950, GTX 980, GTX 980M, GTX 970M, GTX 965M, 910M\r\n    # Jetson: (None)\r\n    '-gencode', 'arch=compute_52,code=sm_52',\r\n\r\n    # Tesla: P100\r\n    # Quadro: GP100\r\n    # NVIDIA: NVS: (None)\r\n    # GeForce / Titan: (None)\r\n    # Jetson: (None)\r\n    '-gencode', 'arch=compute_60,code=sm_60',\r\n\r\n    # Tesla: P40, P4\r\n    # Quadro: P6000, P5000, P4000, P2200, P2000, P1000, P620, P600, P400, P620, P520, P5200, P4200, P3200, P5000, P4000, P3000, P2000, P1000, P600, P500\r\n    # NVIDIA NVS: (None)\r\n    # GeForce / Titan: TITAN Xp, TITAN X, GTX 1080 Ti, GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1080, GTX 1070, GTX 1060\r\n    # Jetson: (None)\r\n    '-gencode', 'arch=compute_61,code=sm_61',\r\n\r\n    # Tesla: T4\r\n    # Quadro: RTX 8000, RTX 6000, RTX 5000, RTX 4000, RTX 5000, RTX 4000, RTX 3000, T2000, T1000\r\n    # NVIDIA NVS: (None)\r\n    # GeForce / Titan: TITAN RTX, RTX 2080 Ti, RTX 2080, RTX 2070, RTX 2060, RTX 2080, RTX 2070, RTX 2060\r\n    # Jetson: (None)\r\n    '-gencode', 'arch=compute_75,code=sm_75',\r\n\r\n    # '-gencode', 'arch=compute_70,code=sm_70',\r\n    # '-gencode', 'arch=compute_70,code=compute_70'\r\n\r\n    '-w' # Ignore compiler 
warnings.\r\n]\r\n\r\ncxx_args = ['-std=c++11', '-w']"
  },
  {
    "path": "my_package/test_module.py",
    "content": "# main.py\r\nimport torch\r\nimport torch.nn as nn\r\nfrom torch.autograd import Variable\r\nfrom torch.autograd import gradcheck\r\n\r\n#from modules.InterpolationModule import InterpolationModule\r\n#from modules.FilterInterpolationModule import FilterInterpolationModule\r\n#from modules.FlowProjectionModule import FlowProjectionModule\r\nfrom my_package.DepthFlowProjection import DepthFlowProjectionModule\r\n\r\n#from modules.FilterInterpolationModule import AdaptiveWeightInterpolationModule\r\n#from modules.SeparableConvModule import SeparableConvModule\r\nimport time\r\nimport numpy\r\n#from modules.InterpolationChModule import InterpolationChModule\r\n#from modules.WeigtedFlowProjectionModule import WeightedFlowProjectionModule\r\n#from modules.SeparableConvFlowModule import SeparableConvFlowModule\r\n\r\ndef test_SeparableConvFlowModule(input1, input2, input3,filtersize):\r\n    FilterInterpolate = SeparableConvFlowModule(filtersize)\r\n\r\n    t1 = time.time()\r\n\r\n    output = FilterInterpolate(input1, input2, input3)\r\n    t2 = time.time()\r\n\r\n    output.backward(output.data)\r\n    t3 = time.time()\r\n\r\n    print(\"CPU Forward and backward time is : \" + str(t2 - t1) + \"s\\t\" + str(t3 - t2) + \"s\\t\")\r\n\r\n    #\r\n    # print(output)\r\n    # print(input1.grad.size())\r\n    # print(input1.grad)\r\n    # print(output[3,0,...])\r\n    temp = input1.grad\r\n\r\n    # input1 = input1.cuda()\r\n    # input2 = input2.cuda()\r\n    # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True)\r\n    # input2_cuda = Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True)\r\n    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad=True)\r\n    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad=True)\r\n    input3_cuda = Variable(input3.data.type(torch.cuda.FloatTensor), 
requires_grad=True)\r\n    t1 = time.time()\r\n    FilterInterpolate.zero_grad()  # to clean up the gradient in the last backward\r\n\r\n    output_cuda = FilterInterpolate(input1_cuda, input2_cuda, input3_cuda)\r\n    t2 = time.time()\r\n    output_cuda.backward(output_cuda.data)\r\n    t3 = time.time()\r\n    print(\"GPU Forward and backward time is : \" + str(t2 - t1) + \"s\\t\" + str(t3 - t2) + \"s\\t\")\r\n    # print(output_cuda)\r\n    # print(input1_cuda.grad.size())\r\n    # print(input1_cuda.grad)\r\n\r\n    # print(output_cuda[3,0,...])\r\n    # print(output[3,0,...]- output_cuda[3,0,...].cpu())\r\n\r\n    # print(output_cuda - output.cuda())\r\n    # print(input1_cuda.grad - input1.grad.cuda())\r\n\r\n    print(\"Check the forward path between CPU and GPU...\", end='\\t')\r\n    x = (output_cuda - output.cuda()) *2 / (torch.abs(output_cuda) + torch.abs(output).cuda())\r\n    x = torch.max(torch.abs(x))\r\n    # print(x)\r\n\r\n    if (x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(output_cuda - output.cuda()))\r\n    else:\r\n        print(\"output pass\", end='\\n')\r\n\r\n    # x = (flow_cuda - flow.cuda() ) * 2 / (torch.abs(flow_cuda) + torch.abs(flow).cuda() )\r\n    # x = torch.max(torch.abs(x))\r\n    # # print(x)\r\n    #\r\n    # if (x.cpu().data.numpy()[0] > 1e-6):\r\n    #     print(x)\r\n    # else:\r\n    #     print(\"flow pass\", end='\\n')\r\n    #\r\n    print(\"Check the backward path between CPU and GPU...\", end='\\t')\r\n    # x = (input1_cuda.grad - input1.grad.cuda()) * 2 /(torch.abs(input1_cuda.grad) + torch.abs(input1.grad).cuda())\r\n    # # y = x.cpu().data.numpy()\r\n    # x = torch.max(torch.abs(x))\r\n    # # print(x)\r\n    #\r\n    # if (x.cpu().data.numpy()[0] > 1e-6):\r\n    #     print(x)\r\n    #     print(torch.mean(input1_cuda.grad - input1.grad.cuda()))\r\n    # else:\r\n    #     print(\"pass\", end='\\t')\r\n\r\n    x = (input2_cuda.grad - input2.grad.cuda()) * 2 
/(torch.abs(input2_cuda.grad) + torch.abs(input2.grad).cuda())\r\n    y = x.cpu().data.numpy()\r\n    x = torch.max(torch.abs(x))\r\n    if (x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(input2_cuda.grad - input2.grad.cuda()))\r\n\r\n    else:\r\n        print(\"pass\", end='\\t')\r\n    x = (input3_cuda.grad - input3.grad.cuda()) * 2 / (torch.abs(input3_cuda.grad) + torch.abs(input3.grad).cuda())\r\n    y = x.cpu().data.numpy()\r\n    x = torch.max(torch.abs(x))\r\n    if (x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(input3_cuda.grad - input3.grad.cuda()))\r\n\r\n    else:\r\n        print(\"pass\", end='\\n')\r\n\r\n    # print(x[0,0,...])\r\n    # print(x[0,1,...])\r\n    # print(x[0,2,...])\r\n    #\r\n    # print(torch.max(x))\r\n    # print(x[11,2,...])\r\n    return t2 - t1, t3 - t2\r\n\r\ndef test_SeparableConvModule(input1, input2, input3,filtersize):\r\n    FilterInterpolate = SeparableConvModule(filtersize)\r\n\r\n    t1 = time.time()\r\n\r\n    output = FilterInterpolate(input1, input2, input3)\r\n    t2 = time.time()\r\n\r\n    output.backward(output.data)\r\n    t3 = time.time()\r\n\r\n    print(\"CPU Forward and backward time is : \" + str(t2 - t1) + \"s\\t\" + str(t3 - t2) + \"s\\t\")\r\n\r\n    #\r\n    # print(output)\r\n    # print(input1.grad.size())\r\n    # print(input1.grad)\r\n    # print(output[3,0,...])\r\n    temp = input1.grad\r\n\r\n    # input1 = input1.cuda()\r\n    # input2 = input2.cuda()\r\n    # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True)\r\n    # input2_cuda = Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True)\r\n    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad=True)\r\n    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad=True)\r\n    input3_cuda = 
Variable(input3.data.type(torch.cuda.FloatTensor), requires_grad=True)\r\n    t1 = time.time()\r\n    FilterInterpolate.zero_grad()  # to clean up the gradient in the last backward\r\n\r\n    output_cuda = FilterInterpolate(input1_cuda, input2_cuda, input3_cuda)\r\n    t2 = time.time()\r\n    output_cuda.backward(output_cuda.data)\r\n    t3 = time.time()\r\n    print(\"GPU Forward and backward time is : \" + str(t2 - t1) + \"s\\t\" + str(t3 - t2) + \"s\\t\")\r\n    # print(output_cuda)\r\n    # print(input1_cuda.grad.size())\r\n    # print(input1_cuda.grad)\r\n\r\n    # print(output_cuda[3,0,...])\r\n    # print(output[3,0,...]- output_cuda[3,0,...].cpu())\r\n\r\n    # print(output_cuda - output.cuda())\r\n    # print(input1_cuda.grad - input1.grad.cuda())\r\n\r\n    print(\"Check the forward path between CPU and GPU...\", end='\\t')\r\n    x = (output_cuda - output.cuda()) *2 / (torch.abs(output_cuda) + torch.abs(output).cuda())\r\n    x = torch.max(torch.abs(x))\r\n    # print(x)\r\n\r\n    if (x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n    else:\r\n        print(\"pass\", end='\\n')\r\n\r\n    print(\"Check the backward path between CPU and GPU...\", end='\\t')\r\n    x = (input1_cuda.grad - input1.grad.cuda()) * 2 /(torch.abs(input1_cuda.grad) + torch.abs(input1.grad).cuda())\r\n    y = x.cpu().data.numpy()\r\n    x = torch.max(torch.abs(x))\r\n    # print(x)\r\n\r\n    if (x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(input1_cuda.grad - input1.grad.cuda()))\r\n    else:\r\n        print(\"pass\", end='\\t')\r\n    x = (input2_cuda.grad - input2.grad.cuda()) * 2 /(torch.abs(input2_cuda.grad) + torch.abs(input2.grad).cuda())\r\n    y = x.cpu().data.numpy()\r\n    x = torch.max(torch.abs(x))\r\n    if (x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(input2_cuda.grad - input2.grad.cuda()))\r\n\r\n    else:\r\n        print(\"pass\", end='\\t')\r\n    x = (input3_cuda.grad - 
input3.grad.cuda()) * 2 / (torch.abs(input3_cuda.grad) + torch.abs(input3.grad).cuda())\r\n    y = x.cpu().data.numpy()\r\n    x = torch.max(torch.abs(x))\r\n    if (x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(input3_cuda.grad - input3.grad.cuda()))\r\n\r\n    else:\r\n        print(\"pass\", end='\\n')\r\n\r\n    # print(x[0,0,...])\r\n    # print(x[0,1,...])\r\n    # print(x[0,2,...])\r\n    #\r\n    # print(torch.max(x))\r\n    # print(x[11,2,...])\r\n    return t2 - t1, t3 - t2\r\n\r\n\r\ndef test_FilterInterpolation(input1,input2,input3):\r\n    FilterInterpolate = FilterInterpolationModule()\r\n\r\n    t1 = time.time()\r\n\r\n    output = FilterInterpolate(input1, input2, input3)\r\n    t2 = time.time()\r\n\r\n    output.backward(output.data)\r\n    t3 = time.time()\r\n\r\n    print(\"CPU Forward and backward time is : \" + str(t2 - t1) + \"s\\t\" + str(t3 - t2) + \"s\\t\")\r\n\r\n    #\r\n    # print(output)\r\n    # print(input1.grad.size())\r\n    # print(input1.grad)\r\n    # print(output[3,0,...])\r\n    temp = input1.grad\r\n\r\n    # input1 = input1.cuda()\r\n    # input2 = input2.cuda()\r\n    # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True)\r\n    # input2_cuda = Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True)\r\n    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad=True)\r\n    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad=True)\r\n    input3_cuda = Variable(input3.data.type(torch.cuda.FloatTensor), requires_grad = True)\r\n    t1 = time.time()\r\n    FilterInterpolate.zero_grad()# to clean up the gradient in the last backward\r\n\r\n    output_cuda = FilterInterpolate(input1_cuda, input2_cuda ,input3_cuda)\r\n    t2 = time.time()\r\n    output_cuda.backward(output_cuda.data)\r\n    t3 = time.time()\r\n    print(\"GPU Forward and backward 
time is : \" + str(t2 - t1) + \"s\\t\" + str(t3 - t2) + \"s\\t\")\r\n    # print(output_cuda)\r\n    # print(input1_cuda.grad.size())\r\n    # print(input1_cuda.grad)\r\n\r\n    # print(output_cuda[3,0,...])\r\n    # print(output[3,0,...]- output_cuda[3,0,...].cpu())\r\n\r\n    # print(output_cuda - output.cuda())\r\n    # print(input1_cuda.grad - input1.grad.cuda())\r\n\r\n\r\n    print(\"Check the forward path between CPU and GPU...\", end='\\t')\r\n    x = output_cuda - output.cuda()\r\n    x = torch.max(torch.abs(x))\r\n    # print(x)\r\n\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n    else:\r\n        print(\"pass\", end='\\n')\r\n\r\n    print(\"Check the backward path between CPU and GPU...\", end='\\t')\r\n    x = input1_cuda.grad - input1.grad.cuda()\r\n    y = x.cpu().data.numpy()\r\n    x = torch.max(torch.abs(x))\r\n    # print(x)\r\n\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(input1_cuda.grad - input1.grad.cuda()))\r\n    else:\r\n        print(\"pass\", end='\\t')\r\n    x = input2_cuda.grad - input2.grad.cuda()\r\n    y = x.cpu().data.numpy()\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(input2_cuda.grad - input2.grad.cuda()))\r\n\r\n    else:\r\n        print(\"pass\", end='\\t')\r\n    x = input3_cuda.grad - input3.grad.cuda()\r\n    y = x.cpu().data.numpy()\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(input3_cuda.grad - input3.grad.cuda()))\r\n\r\n    else:\r\n        print(\"pass\", end='\\n')\r\n\r\n    # print(x[0,0,...])\r\n    # print(x[0,1,...])\r\n    # print(x[0,2,...])\r\n    #\r\n    # print(torch.max(x))\r\n    # print(x[11,2,...])\r\n    return t2-t1,t3-t2\r\n\r\n\r\ndef test_InterpolationModule(input1,input2):\r\n    # input1 = Variable(torch.zeros(12,3,64,64).type(torch.FloatTensor))\r\n    # input2 = 
Variable(torch.rand(12,2,64,64).type(torch.FloatTensor))\r\n    # input1 = Variable(torch.arange(0.0, 12*3*64*256).view(12,3,64,256), requires_grad=True)\r\n    # input2 = Variable(torch.rand(12,2,64,256)*20, requires_grad= True)\r\n    # input2 = Variable(torch.zeros(12,2,64,64))\r\n    # input2 = Variable(torch.ones(12,2,64,64) * (-2.1))\r\n    # input2 = Variable(torch.cat((torch.ones(12,1,64,64) *0.251, torch.zeros(12,1,64,64)),dim=1))\r\n    # input1.data.uniform_()\r\n    # input2.data.uniform_(-5,5)\r\n\r\n    Interpolate = InterpolationModule()\r\n\r\n    t1 = time.time()\r\n\r\n    output = Interpolate(input1,input2)\r\n    t2 = time.time()\r\n\r\n    output.backward(output.data)\r\n    t3 = time.time()\r\n\r\n\r\n    print(\"CPU Forward and backward time is : \" + str(t2-t1) +\"s\\t\" + str(t3-t2) +\"s\\t\")\r\n\r\n    #\r\n    # print(output)\r\n    # print(input1.grad.size())\r\n    # print(input1.grad)\r\n    # print(output[3,0,...])\r\n    temp = input1.grad\r\n\r\n    # input1 = input1.cuda()\r\n    # input2 = input2.cuda()\r\n    # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True)\r\n    # input2_cuda = Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True)\r\n    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True)\r\n    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True)\r\n    t1 = time.time()\r\n    output_cuda = Interpolate(input1_cuda,input2_cuda)\r\n    t2 = time.time()\r\n    output_cuda.backward(output_cuda.data)\r\n    t3 = time.time()\r\n    print(\"GPU Forward and backward time is : \" + str(t2-t1) +\"s\\t\" + str(t3-t2) +\"s\\t\")\r\n    # print(output_cuda)\r\n    # print(input1_cuda.grad.size())\r\n    # print(input1_cuda.grad)\r\n\r\n    # print(output_cuda[3,0,...])\r\n    # print(output[3,0,...]- output_cuda[3,0,...].cpu())\r\n\r\n    # print(output_cuda - 
output.cuda())\r\n    # print(input1_cuda.grad - input1.grad.cuda())\r\n\r\n\r\n    print(\"Check the forward path between CPU and GPU...\",end='\\t')\r\n    x = output_cuda - output.cuda()\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n    else:\r\n        print(\"pass\",end='\\n')\r\n    print(\"Check the backward path between CPU and GPU...\",end='\\t')\r\n    x = input1_cuda.grad - input1.grad.cuda()\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n    else:\r\n        print(\"pass\",end='\\t')\r\n    x = input2_cuda.grad - input2.grad.cuda()\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n    else:\r\n        print(\"pass\",end='\\n')\r\n\r\n\r\n    # print(x[0,0,...])\r\n    # print(x[0,1,...])\r\n    # print(x[0,2,...])\r\n    #\r\n    # print(torch.max(x))\r\n    # print(x[11,2,...])\r\n    return t2-t1,t3-t2\r\n\r\ndef test_InterpolationChModule(input1,input2):\r\n    # input1 = Variable(torch.zeros(12,3,64,64).type(torch.FloatTensor))\r\n    # input2 = Variable(torch.rand(12,2,64,64).type(torch.FloatTensor))\r\n    # input1 = Variable(torch.arange(0.0, 12*3*64*256).view(12,3,64,256), requires_grad=True)\r\n    # input2 = Variable(torch.rand(12,2,64,256)*20, requires_grad= True)\r\n    # input2 = Variable(torch.zeros(12,2,64,64))\r\n    # input2 = Variable(torch.ones(12,2,64,64) * (-2.1))\r\n    # input2 = Variable(torch.cat((torch.ones(12,1,64,64) *0.251, torch.zeros(12,1,64,64)),dim=1))\r\n    # input1.data.uniform_()\r\n    # input2.data.uniform_(-5,5)\r\n\r\n    Interpolate = InterpolationChModule(input1.size(1))\r\n\r\n    t1 = time.time()\r\n\r\n    output = Interpolate(input1,input2)\r\n    t2 = time.time()\r\n\r\n    output.backward(output.data)\r\n    t3 = time.time()\r\n\r\n\r\n    print(\"CPU Forward and backward time is : \" + str(t2-t1) +\"s\\t\" + str(t3-t2) +\"s\\t\")\r\n\r\n    #\r\n  
  # print(output)\r\n    # print(input1.grad.size())\r\n    # print(input1.grad)\r\n    # print(output[3,0,...])\r\n    temp = input1.grad\r\n\r\n    # input1 = input1.cuda()\r\n    # input2 = input2.cuda()\r\n    # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True)\r\n    # input2_cuda = Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True)\r\n    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True)\r\n    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True)\r\n    t1 = time.time()\r\n    output_cuda = Interpolate(input1_cuda,input2_cuda)\r\n    t2 = time.time()\r\n    output_cuda.backward(output_cuda.data)\r\n    t3 = time.time()\r\n    print(\"GPU Forward and backward time is : \" + str(t2-t1) +\"s\\t\" + str(t3-t2) +\"s\\t\")\r\n    # print(output_cuda)\r\n    # print(input1_cuda.grad.size())\r\n    # print(input1_cuda.grad)\r\n\r\n    # print(output_cuda[3,0,...])\r\n    # print(output[3,0,...]- output_cuda[3,0,...].cpu())\r\n\r\n    # print(output_cuda - output.cuda())\r\n    # print(input1_cuda.grad - input1.grad.cuda())\r\n\r\n\r\n    print(\"Check the forward path between CPU and GPU...\",end='\\t')\r\n    x = output_cuda - output.cuda()\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n    else:\r\n        print(\"pass\",end='\\n')\r\n    print(\"Check the backward path between CPU and GPU...\",end='\\t')\r\n    x = input1_cuda.grad - input1.grad.cuda()\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n    else:\r\n        print(\"pass\",end='\\t')\r\n    x = input2_cuda.grad - input2.grad.cuda()\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n    else:\r\n        print(\"pass\",end='\\n')\r\n\r\n\r\n    # print(x[0,0,...])\r\n    # 
print(x[0,1,...])\r\n    # print(x[0,2,...])\r\n    #\r\n    # print(torch.max(x))\r\n    # print(x[11,2,...])\r\n    return t2-t1,t3-t2\r\n\r\ndef test_FlowProjectionModule(input1):\r\n    # input1 = Variable(torch.zeros(12,3,64,64).type(torch.FloatTensor))\r\n    # input2 = Variable(torch.rand(12,2,64,64).type(torch.FloatTensor))\r\n    # input1 = Variable(torch.arange(0.0, 12*3*64*256).view(12,3,64,256), requires_grad=True)\r\n    # input2 = Variable(torch.rand(12,2,64,256)*20, requires_grad= True)\r\n    # input2 = Variable(torch.zeros(12,2,64,64))\r\n    # input2 = Variable(torch.ones(12,2,64,64) * (-2.1))\r\n    # input2 = Variable(torch.cat((torch.ones(12,1,64,64) *0.251, torch.zeros(12,1,64,64)),dim=1))\r\n    # input1.data.uniform_()\r\n    # input2.data.uniform_(-5,5)\r\n\r\n    Project = FlowProjectionModule()\r\n\r\n    t1 = time.time()\r\n\r\n    output = Project(input1)\r\n    t2 = time.time()\r\n\r\n    output.backward(output.data)\r\n    t3 = time.time()\r\n\r\n\r\n    print(\"CPU Forward and backward time is : \" + str(t2-t1) +\"s\\t\" + str(t3-t2) +\"s\\t\")\r\n\r\n    #\r\n    # print(output)\r\n    # print(input1.grad.size())\r\n    # print(input1.grad)\r\n    # print(output[3,0,...])\r\n    temp = input1.grad\r\n\r\n    # input1 = input1.cuda()\r\n    # input2 = input2.cuda()\r\n    # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True)\r\n    # input2_cuda = Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True)\r\n    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True)\r\n    # input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True)\r\n    Project = FlowProjectionModule() # regnenerate\r\n    t1 = time.time()\r\n    output_cuda = Project(input1_cuda)\r\n    t2 = time.time()\r\n    output_cuda.backward(output_cuda.data)\r\n    t3 = time.time()\r\n    print(\"GPU Forward and 
backward time is : \" + str(t2-t1) +\"s\\t\" + str(t3-t2) +\"s\\t\")\r\n    # print(output_cuda)\r\n    # print(input1_cuda.grad.size())\r\n    # print(input1_cuda.grad)\r\n\r\n    # print(output_cuda[3,0,...])\r\n    # print(output[3,0,...]- output_cuda[3,0,...].cpu())\r\n\r\n    # print(output_cuda - output.cuda())\r\n    # print(input1_cuda.grad - input1.grad.cuda())\r\n\r\n\r\n    print(\"Check the forward path between CPU and GPU...\",end='\\t')\r\n    x = output_cuda - output.cuda()\r\n    # print(output_cuda[0, 0, :10, :10])\r\n    # print(output[0, 0, :10, :10])\r\n    # print(x[0, 0, :10, :10])\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n    else:\r\n        print(\"pass\",end='\\n')\r\n    print(\"Check the backward path between CPU and GPU...\",end='\\t')\r\n    x = input1_cuda.grad - input1.grad.cuda()\r\n    # print(input1_cuda[0,0,:10,:10])\r\n    # print(input1[0,0,:10,:10])\r\n    # print(x[0,0,:10,:10])\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(torch.abs(input1_cuda.grad - input1.grad.cuda())))\r\n        print(torch.mean((input1_cuda.grad - input1.grad.cuda())))\r\n    else:\r\n        print(\"pass\",end='\\t')\r\n    # x = input2_cuda.grad - input2.grad.cuda()\r\n    # x = torch.max(torch.abs(x))\r\n    # if(x.cpu().data.numpy()[0] > 1e-6):\r\n    #     print(x)\r\n    # else:\r\n    #     print(\"pass\",end='\\n')\r\n\r\n\r\n    # print(x[0,0,...])\r\n    # print(x[0,1,...])\r\n    # print(x[0,2,...])\r\n    #\r\n    # print(torch.max(x))\r\n    # print(x[11,2,...])\r\n\r\n    print(\"\\n\\n\")\r\n    return t2-t1,t3-t2\r\n\r\ndef test_DepthFlowProjectionModule(input1,input2):\r\n    # input1 = Variable(torch.zeros(12,3,64,64).type(torch.FloatTensor))\r\n    # input2 = Variable(torch.rand(12,2,64,64).type(torch.FloatTensor))\r\n    # input1 = Variable(torch.arange(0.0, 12*3*64*256).view(12,3,64,256), 
requires_grad=True)\r\n    # input2 = Variable(torch.rand(12,2,64,256)*20, requires_grad= True)\r\n    # input2 = Variable(torch.zeros(12,2,64,64))\r\n    # input2 = Variable(torch.ones(12,2,64,64) * (-2.1))\r\n    # input2 = Variable(torch.cat((torch.ones(12,1,64,64) *0.251, torch.zeros(12,1,64,64)),dim=1))\r\n    # input1.data.uniform_()\r\n    # input2.data.uniform_(-5,5)\r\n\r\n    # Project = DepthFlowProjectionModule()\r\n\r\n    # t1 = time.time()\r\n\r\n    # output = Project(input1,input2)\r\n    # t2 = time.time()\r\n\r\n    # output.backward(output.data)\r\n    # t3 = time.time()\r\n\r\n\r\n    # print(\"CPU Forward and backward time is : \" + str(t2-t1) +\"s\\t\" + str(t3-t2) +\"s\\t\")\r\n\r\n    #\r\n    # print(output)\r\n    # print(input1.grad.size())\r\n    # print(input1.grad)\r\n    # print(output[3,0,...])\r\n    # temp = input1.grad\r\n\r\n    # input1 = input1.cuda()\r\n    # input2 = input2.cuda()\r\n    # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True)\r\n    # input2_cuda = Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True)\r\n    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True)\r\n    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True)\r\n    Project = DepthFlowProjectionModule(input1_cuda.requires_grad) # regnenerate\r\n    t1 = time.time()\r\n    output_cuda = Project(input1_cuda,input2_cuda)\r\n    t2 = time.time()\r\n    output_cuda.backward(output_cuda.data)\r\n    t3 = time.time()\r\n    print(\"GPU Forward and backward time is : \" + str(t2-t1) +\"s\\t\" + str(t3-t2) +\"s\\t\")\r\n    # print(output_cuda)\r\n    # print(input1_cuda.grad.size())\r\n    # print(input1_cuda.grad)\r\n\r\n    # print(output_cuda[3,0,...])\r\n    # print(output[3,0,...]- output_cuda[3,0,...].cpu())\r\n\r\n    # print(output_cuda - output.cuda())\r\n    # 
print(input1_cuda.grad - input1.grad.cuda())\r\n\r\n\r\n    print(\"Check the forward path between CPU and GPU...\",end='\\t')\r\n    x = output_cuda - output.cuda()\r\n    # print(output_cuda[0, 0, :10, :10])\r\n    # print(output[0, 0, :10, :10])\r\n    # print(x[0, 0, :10, :10])\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n    else:\r\n        print(\"pass\",end='\\n')\r\n    print(\"Check the backward path between CPU and GPU...\",end='\\t')\r\n    x = input1_cuda.grad - input1.grad.cuda()\r\n    # print(input1_cuda[0,0,:10,:10])\r\n    # print(input1[0,0,:10,:10])\r\n    # print(x[0,0,:10,:10])\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(torch.abs(input1_cuda.grad - input1.grad.cuda())))\r\n        print(torch.mean((input1_cuda.grad - input1.grad.cuda())))\r\n    else:\r\n        print(\"pass\",end='\\t')\r\n    x = input2_cuda.grad - input2.grad.cuda()\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n    else:\r\n        print(\"pass\",end='\\n')\r\n\r\n\r\n    # print(x[0,0,...])\r\n    # print(x[0,1,...])\r\n    # print(x[0,2,...])\r\n    #\r\n    # print(torch.max(x))\r\n    # print(x[11,2,...])\r\n\r\n    print(\"\\n\\n\")\r\n    return t2-t1,t3-t2\r\n\r\ndef test_WeightedFlowProjectionModule(input1 , input2, input3):\r\n    # input1 = Variable(torch.zeros(12,3,64,64).type(torch.FloatTensor))\r\n    # input2 = Variable(torch.rand(12,2,64,64).type(torch.FloatTensor))\r\n    # input1 = Variable(torch.arange(0.0, 12*3*64*256).view(12,3,64,256), requires_grad=True)\r\n    # input2 = Variable(torch.rand(12,2,64,256)*20, requires_grad= True)\r\n    # input2 = Variable(torch.zeros(12,2,64,64))\r\n    # input2 = Variable(torch.ones(12,2,64,64) * (-2.1))\r\n    # input2 = Variable(torch.cat((torch.ones(12,1,64,64) *0.251, torch.zeros(12,1,64,64)),dim=1))\r\n    # 
input1.data.uniform_()\r\n    # input2.data.uniform_(-5,5)\r\n\r\n    # Project = FlowProjectionModule()\r\n    Project = WeightedFlowProjectionModule(threshold=20.0/255.0,requires_grad=True)\r\n\r\n    t1 = time.time()\r\n\r\n    output = Project(input1,input2,input3)\r\n    t2 = time.time()\r\n\r\n    output.backward(output.data)\r\n    t3 = time.time()\r\n\r\n\r\n    print(\"CPU Forward and backward time is : \" + str(t2-t1) +\"s\\t\" + str(t3-t2) +\"s\\t\")\r\n\r\n    #\r\n    # print(output)\r\n    # print(input1.grad.size())\r\n    # print(input1.grad)\r\n    # print(output[3,0,...])\r\n    temp = input1.grad\r\n\r\n    # input1 = input1.cuda()\r\n    # input2 = input2.cuda()\r\n    # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True)\r\n    # input2_cuda = Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True)\r\n    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad = True)\r\n    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad = True)\r\n    input3_cuda = Variable(input3.data.type(torch.cuda.FloatTensor), requires_grad = True)\r\n    Project = WeightedFlowProjectionModule(threshold=20.0/255.0, requires_grad=True) # regnenerate\r\n    t1 = time.time()\r\n    output_cuda = Project(input1_cuda,input2_cuda,input3_cuda)\r\n    t2 = time.time()\r\n    output_cuda.backward(output_cuda.data)\r\n    t3 = time.time()\r\n    print(\"GPU Forward and backward time is : \" + str(t2-t1) +\"s\\t\" + str(t3-t2) +\"s\\t\")\r\n    # print(output_cuda)\r\n    # print(input1_cuda.grad.size())\r\n    # print(input1_cuda.grad)\r\n\r\n    # print(output_cuda[3,0,...])\r\n    # print(output[3,0,...]- output_cuda[3,0,...].cpu())\r\n\r\n    # print(output_cuda - output.cuda())\r\n    # print(input1_cuda.grad - input1.grad.cuda())\r\n\r\n\r\n    print(\"Check the forward path between CPU and GPU...\",end='\\t')\r\n    x = 
output_cuda - output.cuda()\r\n    # print(output_cuda[0, 0, :10, :10])\r\n    # print(output[0, 0, :10, :10])\r\n    # print(x[0, 0, :10, :10])\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n    else:\r\n        print(\"pass\",end='\\n')\r\n    print(\"Check the backward path between CPU and GPU...\",end='\\t')\r\n    x = input1_cuda.grad - input1.grad.cuda()\r\n    # print(input1_cuda[0,0,:10,:10])\r\n    # print(input1[0,0,:10,:10])\r\n    # print(x[0,0,:10,:10])\r\n    x = torch.max(torch.abs(x))\r\n    if(x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(torch.abs(input1_cuda.grad - input1.grad.cuda())))\r\n        print(torch.mean((input1_cuda.grad - input1.grad.cuda())))\r\n    else:\r\n        print(\"pass\",end='\\t')\r\n    # x = input2_cuda.grad - input2.grad.cuda()\r\n    # x = torch.max(torch.abs(x))\r\n    # if(x.cpu().data.numpy()[0] > 1e-6):\r\n    #     print(x)\r\n    # else:\r\n    #     print(\"pass\",end='\\n')\r\n\r\n\r\n    # print(x[0,0,...])\r\n    # print(x[0,1,...])\r\n    # print(x[0,2,...])\r\n    #\r\n    # print(torch.max(x))\r\n    # print(x[11,2,...])\r\n\r\n    print(\"\\n\\n\")\r\n    return t2-t1,t3-t2\r\n\r\ndef test_AdaptiveWeightInterpolationModule(input1, input2, input3, input4):\r\n    training = True\r\n    Interpolate = AdaptiveWeightInterpolationModule(training=training)\r\n#gradcheck(Interpolate,)\r\n    t1 = time.time()\r\n\r\n    output = Interpolate(input1, input2, input3, input4)\r\n    t2 = time.time()\r\n\r\n    if training:\r\n        #output.backward(output.data)\r\n        grad = output.data\r\n        # grad = grad.zero_()\r\n        output.backward(grad)\r\n        print(        input3.grad)\r\n    t3 = time.time()\r\n\r\n    print(\"CPU Forward and backward time is : \" + str(t2 - t1) + \"s\\t\" + str(t3 - t2) + \"s\\t\")\r\n\r\n    #\r\n    # print(output)\r\n    # print(input1.grad.size())\r\n    # print(input1.grad)\r\n    
# print(output[3,0,...])\r\n    temp = input1.grad\r\n\r\n    # input1 = input1.cuda()\r\n    # input2 = input2.cuda()\r\n    # input1_cuda = Variable(torch.arange(0.0, 12*3*64*64).view(12,3,64,64).type(torch.cuda.FloatTensor), requires_grad=True)\r\n    # input2_cuda = Variable((torch.rand(12,2,64,64)*20).type(torch.cuda.FloatTensor), requires_grad= True)\r\n    input1_cuda = Variable(input1.data.type(torch.cuda.FloatTensor), requires_grad=True)\r\n    input2_cuda = Variable(input2.data.type(torch.cuda.FloatTensor), requires_grad=True)\r\n    input3_cuda = Variable(input3.data.type(torch.cuda.FloatTensor), requires_grad=True)\r\n    input4_cuda = Variable(input4.data.type(torch.cuda.FloatTensor), requires_grad=True )\r\n    t1 = time.time()\r\n    Interpolate.zero_grad()  # to clean up the gradient in the last backward\r\n\r\n    output_cuda = Interpolate(input1_cuda, input2_cuda, input3_cuda,input4_cuda)\r\n    t2 = time.time()\r\n    if training :\r\n#        output_cuda.backward(output_cuda.data)\r\n        grad = output_cuda.data\r\n#         grad = grad.zero_()\r\n        output_cuda.backward(grad)\r\n    t3 = time.time()\r\n    print(\"GPU Forward and backward time is : \" + str(t2 - t1) + \"s\\t\" + str(t3 - t2) + \"s\\t\")\r\n    #    return\r\n    # print(output_cuda)\r\n    # print(input1_cuda.grad.size())\r\n    # print(input1_cuda.grad)\r\n\r\n    # print(output_cuda[3,0,...])\r\n    # print(output[3,0,...]- output_cuda[3,0,...].cpu())\r\n\r\n    # print(output_cuda - output.cuda())\r\n    # print(input1_cuda.grad - input1.grad.cuda())\r\n\r\n    print(\"Check the forward path between CPU and GPU...\", end='\\n')\r\n    x = output_cuda - output.cuda()\r\n    #print(x)\r\n    #print(x>1e-6)\r\n    print(\"==>total number of difference\")\r\n    print(torch.sum(torch.abs(x) > 1e-6))\r\n\r\n    x = torch.max(torch.abs(x))\r\n    print(\"==>max difference value is \")\r\n    print(x)\r\n    print(torch.sum(output_cuda > 1) )\r\n    
print(torch.sum(output.cuda() > 1))\r\n\r\n    if (x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n\r\n    else:\r\n        print(\"pass\", end='\\n')\r\n\r\n    if not training:\r\n        return t2 - t1, t3 - t2\r\n\r\n    print(\"Check the backward path between CPU and GPU...\", end='\\t')\r\n    y = input1_cuda.grad - input1.grad.cuda()\r\n    x = y.cpu().data.numpy()\r\n    #print(x>1e-6)\r\n    x = torch.max(torch.abs(y))\r\n    print(x)\r\n\r\n\r\n    if (x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(input1_cuda.grad - input1.grad.cuda()))\r\n    else:\r\n        print(\"pass\", end='\\t')\r\n    x = input2_cuda.grad - input2.grad.cuda()\r\n    y = x.cpu().data.numpy()\r\n    x = torch.max(torch.abs(x))\r\n    if (x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(input2_cuda.grad - input2.grad.cuda()))\r\n\r\n    else:\r\n        print(\"pass\", end='\\t')\r\n    x = input3_cuda.grad - input3.grad.cuda()\r\n    y = x.cpu().data.numpy()\r\n    x = torch.max(torch.abs(x))\r\n    if (x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(input3_cuda.grad - input3.grad.cuda()))\r\n\r\n    else:\r\n        print(\"pass\", end='\\n')\r\n\r\n    x = input4_cuda.grad - input4.grad.cuda()\r\n    y = x.cpu().data.numpy()\r\n    x = torch.max(torch.abs(x))\r\n    if (x.cpu().data.numpy()[0] > 1e-6):\r\n        print(x)\r\n        print(torch.mean(input4_cuda.grad - input4.grad.cuda()))\r\n\r\n    else:\r\n        print(\"pass\", end='\\n')\r\n\r\n    return t2 - t1, t3 - t2\r\n#\r\n#\r\n# # input1 = Variable(torch.zeros(12,3,64,64).type(torch.FloatTensor))\r\n# # input2 = Variable(torch.rand(12,2,64,64).type(torch.FloatTensor))\r\n# # B,H,W = 1,16,16\r\n# # B,C,H,W = 2,64,32,32\r\n# # filtersize = 4\r\n# # input1 = Variable(torch.arange(0.0, B * C * H * W).view(B, C ,H,W), requires_grad=True)\r\n# # input2 = Variable(torch.rand(B, 2, H, W), requires_grad=True)\r\n# # 
input3 = Variable(torch.rand(B, filtersize**2, H, W), requires_grad=True)\r\n# #input2 = Variable(torch.arange(1, 1+ B * 3 * H * W).view(B , 3, H, W), requires_grad=True)\r\n# # input3 = Variable(torch.rand(B, 2, H, W), requires_grad=True)\r\n# # input4 = Variable(torch.rand(B, 2, H,W), requires_grad =True)\r\n# B,C,H,W = 1,3,128,128\r\n# filtersize = 51\r\n# input1 = Variable(torch.arange(0.0, B * C * H * W).view(B, C ,H,W), requires_grad=True)\r\n# input2 = Variable(torch.zeros(B,filtersize,H-filtersize+1,W-filtersize+1),requires_grad = True)\r\n# input3 = Variable(torch.ones(B,filtersize,H-filtersize+1,W-filtersize+1),requires_grad = True)\r\n#\r\n# # input1 = Variable(torch.arange(0.0, B * 3 * H * W).view(B, 3,H,W), requires_grad=True)\r\n# # input2 = Variable(torch.arange(1, 1+ B * 3 * H * W).view(B , 3, H, W), requires_grad=True)\r\n# # input3 = Variable(torch.rand(B, 2, H, W), requires_grad=True)\r\n# # input4 = Variable(torch.rand(B, 2, H,W), requires_grad =True)\r\n# # input2 = Variable(torch.zeros(12,2,64,64),requires_grad = True)\r\n# # input3 = Variable(torch.ones(12,16,64,64),requires_grad = True)\r\n# # input2 = Variable(torch.ones(12,///2,64,64) * (-2.1))\r\n# # input2 = Variable(torch.cat((torch.ones(12,1,64,64) *0.251, torch.zeros(12,1,64,64)),dim=1))\r\n# input1.data.uniform_(0, 1)\r\n# input2.data.uniform_(0, 1)\r\n# input3.data.uniform_(0, 1) # not have to be normalized to 1.0\r\n# # input4.data.uniform_(-1,1)\r\n# #\r\n# #\r\n# # ftimes = []\r\n# # btimes = []\r\n# # for i in range(10):\r\n# #     input1.data.uniform_(0, 1)\r\n# #     input2.data.uniform_(-1, 1)\r\n# #     input3.data.uniform_(0,1)\r\n# #     input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule\r\n# #     input2 = Variable(input2.clone().data, requires_grad = True)\r\n# #     input3 = Variable(input3.clone().data, requires_grad = True)\r\n# #     ftime, btime = test_FilterInterpolation(input1,input2,input3)\r\n# #     
ftimes.append(ftime)\r\n# #     btimes.append(btime)\r\n# #\r\n# # print(\"GPU Forward and backward time is : \" + str(numpy.array(ftimes).mean()) +\"s\\t\" + str(numpy.array(btimes).mean()) +\"s\\t\\n\\n\\n\\n\")\r\n# # # nn.LogSoftmax\r\n# # exit(0)\r\n# # ftimes = []\r\n# # btimes = []\r\n# # for i in range(10):\r\n# #     input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule\r\n# #     input2 = Variable(input2.clone().data, requires_grad = True)\r\n# #     ftime, btime = test_InterpolationModule(input1,input2)\r\n# #     ftimes.append(ftime)\r\n# #     btimes.append(btime)\r\n# #\r\n# # print(\"GPU Forward and backward time is : \" + str(numpy.array(ftimes).mean()) +\"s\\t\" + str(numpy.array(btimes).mean()) +\"s\\t\\n\\n\\n\\n\")\r\n# #\r\n# # ftimes = []\r\n# # btimes = []\r\n# # for i in range(10):\r\n# #     input1.data.uniform_(0, 1)\r\n# #     input2.data.uniform_(-16, 17)\r\n# #     input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule\r\n# #     input2 = Variable(input2.clone().data, requires_grad = True)\r\n# #     ftime, btime = test_InterpolationChModule(input1,input2)\r\n# #     ftimes.append(ftime)\r\n# #     btimes.append(btime)\r\n# #\r\n# # print(\"GPU Forward and backward time is : \" + str(numpy.array(ftimes).mean()) +\"s\\t\" + str(numpy.array(btimes).mean()) +\"s\\t\\n\\n\\n\\n\")\r\n# # # nn.LogSoftmax\r\n# # exit(0)\r\n# #\r\n# ftimes = []\r\n# btimes = []\r\n# for i in range(3):\r\n#     input1.data.uniform_(0.0, 1)\r\n#     input2.data.uniform_(1.0/filtersize, 1.1/filtersize)\r\n#     input3.data.uniform_(1.0/filtersize, 1.1/filtersize)  # not have to be normalized to 1.0\r\n#\r\n#     input1 = Variable(input1.clone().data, requires_grad=True)  # to delete the graph in InterpolationModule\r\n#     input2 = Variable(input2.clone().data, requires_grad=True)\r\n#     input3 = Variable(input3.clone().data, requires_grad=True)\r\n#     # 
ftime, btime = test_SeparableConvModule(input1, input2, input3,filtersize)\r\n#     ftime, btime = test_SeparableConvFlowModule(input1, input2, input3,filtersize)\r\n#     ftimes.append(ftime)\r\n#     btimes.append(btime)\r\n# print(\"GPU Forward and backward time is : \" + str(numpy.array(ftimes).mean()) + \"s\\t\" + str(\r\n#     numpy.array(btimes).mean()) + \"s\\t\")\r\n# exit(0)\r\n#\r\n# #\r\n# # for i in range(10):\r\n# #     input1.data.uniform_(0.14, 0.405)\r\n# #     input2.data.uniform_(0.14, 0.405)\r\n# #     input3.data.uniform_(0.2, 0.501)  # not have to be normalized to 1.0\r\n# #     input4.data.uniform_(0.2, 0.501)\r\n# #\r\n# #     input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule\r\n# #     input2 = Variable(input2.clone().data, requires_grad = True)\r\n# #     input3 = Variable(input3.clone().data, requires_grad = True)\r\n# #     input4 = Variable(input4.clone().data, requires_grad = True)\r\n# #     ftime,btime = test_AdaptiveWeightInterpolationModule(input1,input2,input3,input4)\r\n# #     ftimes.append(ftime)\r\n# #     btimes.append(btime)\r\n# # print(\"GPU Forward and backward time is : \" + str(numpy.array(ftimes).mean()) +\"s\\t\" + str(numpy.array(btimes).mean()) +\"s\\t\")\r\n#\r\n#\r\n# input1 = Variable(torch.arange(0.0, 12 * 2 * 64 * 64).view(12, 2, 64, 64), requires_grad=True)\r\n# input1.data.uniform_(-1.0,1.0)\r\n# # input1 = Variable( - 0.5 * torch.ones(12,2,64,64).type(torch.FloatTensor), requires_grad = True)\r\n#\r\n#\r\n#\r\n\r\nB,C,H,W = 1,2,512,704\r\ninput1 = Variable(torch.arange(0.0, B*C * H * W).view(B, C, H, W), requires_grad=True)\r\ninput3 = Variable(torch.arange(0.0, B* 3 * H * W).view(B,3, H,W), requires_grad = True)\r\n# input2 = Variable(torch.arange(0.0, B * 3 * H * W).view(B, 3 ,H,W), requires_grad=True)\r\ninput2 = Variable(torch.arange(0.0, B * 1 * H * W).view(B, 1 ,H,W), requires_grad=True)\r\n\r\n\r\n\r\nftimes = []\r\nbtimes = []\r\nfor i in 
range(10):\r\n    input1.data.uniform_(-1.0, 1.0)\r\n    input2.data.uniform_(0.1, 1.0) # must be larger than zero\r\n    # input3.data.uniform_(0.0, 1.0)\r\n    input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule\r\n    input2 = Variable(input2.clone().data, requires_grad = True)\r\n    # ftime, btime = test_FlowProjectionModule(input1)\r\n    ftime,btime  =test_DepthFlowProjectionModule(input1,input2)\r\n    ftimes.append(ftime)\r\n    btimes.append(btime)\r\n\r\nprint(\"GPU Forward and backward time is : \" + str(numpy.array(ftimes).mean()) +\"s\\t\" + str(numpy.array(btimes).mean()) +\"s\\t\\n\\n\\n\\n\")\r\n\r\n\r\nexit(0)\r\n\r\n\r\n\r\nftimes = []\r\nbtimes = []\r\nfor i in range(10):\r\n    input1 = Variable(input1.clone().data, requires_grad = True) # to delete the graph in InterpolationModule\r\n\r\n    input2 = Variable(input2.clone().data, requires_grad = True)\r\n    input3 = Variable(input3.clone().data, requires_grad = True)\r\n    ftime, btime = test_WeightedFlowProjectionModule(input1,input2,input3)\r\n    ftimes.append(ftime)\r\n    btimes.append(btime)\r\n\r\nprint(\"GPU Forward and backward time is : \" + str(numpy.array(ftimes).mean()) +\"s\\t\" + str(numpy.array(btimes).mean()) +\"s\\t\\n\\n\\n\\n\")\r\n"
  },
  {
    "path": "networks/DAIN.py",
    "content": "# -*- coding: utf-8 -*-\nimport torch\nimport torch.nn as nn\nfrom my_package.FilterInterpolation import  FilterInterpolationModule\nfrom my_package.FlowProjection import  FlowProjectionModule #,FlowFillholeModule\nfrom my_package.DepthFlowProjection import DepthFlowProjectionModule\n\nfrom Stack import Stack\n\nimport PWCNet\nimport S2D_models\nimport Resblock\nimport MegaDepth\nimport time\n\nclass DAIN(torch.nn.Module):\n    def __init__(self,\n                 channel = 3,\n                 filter_size = 4,\n                 timestep=0.5,\n                 training=True):\n\n        # base class initialization\n        super(DAIN, self).__init__()\n        \n        self.filter_size = filter_size\n        self.training = training\n        self.timestep = timestep\n        assert (timestep == 0.5) # TODO: or else the WeigtedFlowProjection should also be revised... Really Tedious work.\n        self.numFrames =int(1.0/timestep) - 1\n\n        i=0\n        self.initScaleNets_filter,self.initScaleNets_filter1,self.initScaleNets_filter2 = \\\n            self.get_MonoNet5(channel if i == 0 else channel + filter_size * filter_size, filter_size * filter_size, \"filter\")\n\n        self.ctxNet = S2D_models.__dict__['S2DF_3dense']()\n        self.ctx_ch = 3 * 64 + 3\n\n        self.rectifyNet = Resblock.__dict__['MultipleBasicBlock_4'](3 + 3 + 3 +2*1+ 2*2 +16*2+ 2 * self.ctx_ch,128)\n\n        self._initialize_weights()\n        \n        if self.training:\n            self.flownets = PWCNet.__dict__['pwc_dc_net'](\"PWCNet/pwc_net.pth.tar\")\n        else:\n            self.flownets = PWCNet.__dict__['pwc_dc_net']()\n        self.div_flow = 20.0\n\n        #extract depth information\n        if self.training:\n            self.depthNet=MegaDepth.__dict__['HourGlass'](\"MegaDepth/checkpoints/test_local/best_generalization_net_G.pth\")\n        else:\n            self.depthNet=MegaDepth.__dict__['HourGlass']()\n\n        return\n\n    def 
_initialize_weights(self):\n        count = 0\n        for m in self.modules():\n            if isinstance(m, nn.Conv2d):\n                # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels\n                # m.weight.data.normal_(0, math.sqrt(2. / n))\n                # print(m)\n                count+=1\n                # print(count)\n                # weight_init.xavier_uniform(m.weight.data)\n                nn.init.xavier_uniform_(m.weight.data)\n                # weight_init.kaiming_uniform(m.weight.data, a = 0, mode='fan_in')\n                if m.bias is not None:\n                    m.bias.data.zero_()\n            elif isinstance(m, nn.BatchNorm2d):\n                m.weight.data.fill_(1)\n                m.bias.data.zero_()\n            elif isinstance(m, nn.Linear):\n                m.weight.data.normal_(0, 0.01)\n                m.bias.data.zero_()\n            # else:\n            #     print(m)\n\n\n    def forward(self, input):\n\n        \"\"\"\n        Parameters\n        ----------\n        input: shape (3, batch, 3, width, height)\n        -----------\n        \"\"\"\n        losses = []\n        offsets= []\n        filters = []\n        occlusions = []\n\n        device = torch.cuda.current_device()\n        # s1 = torch.cuda.Stream(device=device, priority=5)\n        # s2 = torch.cuda.Stream(device=device, priority=10) #PWC-Net is slow, need to have higher priority\n        s1 = torch.cuda.current_stream()\n        s2 = torch.cuda.current_stream()\n\n        '''\n            STEP 1: sequeeze the input \n        '''\n        if self.training == True:\n            assert input.size(0) == 3\n            input_0,input_1,input_2 = torch.squeeze(input,dim=0)\n        else:\n            assert input.size(0) ==2\n            input_0,input_2 = torch.squeeze(input,dim=0)\n\n\n        #prepare the input data of current scale\n        cur_input_0 = input_0\n        if self.training == True:\n            cur_input_1 = input_1\n        
cur_input_2 =  input_2\n\n        '''\n            STEP 3.2: concatenating the inputs.\n        '''\n        cur_offset_input = torch.cat((cur_input_0, cur_input_2), dim=1)\n        cur_filter_input = cur_offset_input # torch.cat((cur_input_0, cur_input_2), dim=1)\n\n        '''\n            STEP 3.3: perform the estimation by the Three subpath Network \n        '''\n        time_offsets = [ kk * self.timestep for kk in range(1, 1+self.numFrames,1)]\n\n        with torch.cuda.stream(s1):\n            temp  = self.depthNet(torch.cat((cur_filter_input[:, :3, ...],\n                                             cur_filter_input[:, 3:, ...]),dim=0))\n            log_depth = [temp[:cur_filter_input.size(0)], temp[cur_filter_input.size(0):]]\n\n            cur_ctx_output = [\n                torch.cat((self.ctxNet(cur_filter_input[:, :3, ...]),\n                       log_depth[0].detach()), dim=1),\n                    torch.cat((self.ctxNet(cur_filter_input[:, 3:, ...]),\n                   log_depth[1].detach()), dim=1)\n                    ]\n            temp = self.forward_singlePath(self.initScaleNets_filter, cur_filter_input, 'filter')\n            cur_filter_output = [self.forward_singlePath(self.initScaleNets_filter1, temp, name=None),\n                             self.forward_singlePath(self.initScaleNets_filter2, temp, name=None)]\n\n\n            depth_inv = [1e-6 + 1 / torch.exp(d) for d in log_depth]\n\n        with torch.cuda.stream(s2):\n            for _ in range(1):\n                cur_offset_outputs = [\n                        self.forward_flownets(self.flownets, cur_offset_input, time_offsets=time_offsets),\n                        self.forward_flownets(self.flownets, torch.cat((cur_offset_input[:, 3:, ...],\n                                            cur_offset_input[:, 0:3, ...]), dim=1),\n                                  time_offsets=time_offsets[::-1])\n                        ]\n\n        torch.cuda.synchronize() #synchronize s1 and s2\n\n    
    cur_offset_outputs = [\n            self.FlowProject(cur_offset_outputs[0],depth_inv[0]),\n            self.FlowProject(cur_offset_outputs[1],depth_inv[1])\n                ]\n\n        '''\n            STEP 3.4: perform the frame interpolation process \n        '''\n        cur_offset_output = [cur_offset_outputs[0][0], cur_offset_outputs[1][0]]\n        ctx0,ctx2 = self.FilterInterpolate_ctx(cur_ctx_output[0],cur_ctx_output[1],\n                                                   cur_offset_output,cur_filter_output)\n\n        cur_output,ref0,ref2 = self.FilterInterpolate(cur_input_0, cur_input_2,cur_offset_output,cur_filter_output,self.filter_size**2)\n\n        rectify_input = torch.cat((cur_output,ref0,ref2,\n                                    cur_offset_output[0],cur_offset_output[1],\n                                    cur_filter_output[0],cur_filter_output[1],\n                                    ctx0,ctx2\n        ),dim =1)\n        cur_output_rectified = self.rectifyNet(rectify_input) + cur_output\n\n        '''\n            STEP 3.5: for training phase, we collect the variables to be penalized.\n        '''\n        if self.training == True:\n                losses +=[cur_output - cur_input_1]\n                losses += [cur_output_rectified - cur_input_1]                \n                offsets +=[cur_offset_output]\n                filters += [cur_filter_output]\n        '''\n            STEP 4: return the results\n        '''\n        if self.training == True:\n            # if in the training phase, we output the losses to be minimized.\n            # return losses, loss_occlusion\n            return losses, offsets,filters,occlusions\n        else:\n            cur_outputs = [cur_output,cur_output_rectified]\n            return cur_outputs,cur_offset_output,cur_filter_output\n\n    def forward_flownets(self, model, input, time_offsets = None):\n\n        if time_offsets == None :\n            time_offsets = [0.5]\n        elif 
type(time_offsets) == float:\n            time_offsets = [time_offsets]\n        elif type(time_offsets) == list:\n            pass\n        temp = model(input)  # this is a single direction motion results, but not a bidirectional one\n\n        temps = [self.div_flow * temp * time_offset for time_offset in time_offsets]# single direction to bidirection should haven it.\n        temps = [nn.Upsample(scale_factor=4, mode='bilinear')(temp)  for temp in temps]# nearest interpolation won't be better i think\n        return temps\n\n    '''keep this function'''\n    def forward_singlePath(self, modulelist, input, name):\n        stack = Stack()\n\n        k = 0\n        temp = []\n        for layers in modulelist:  # self.initScaleNets_offset:\n            # print(type(layers).__name__)\n            # print(k)\n            # if k == 27:\n            #     print(k)\n            #     pass\n            # use the pop-pull logic, looks like a stack.\n            if k == 0:\n                temp = layers(input)\n            else:\n                # met a pooling layer, take its input\n                if isinstance(layers, nn.AvgPool2d) or isinstance(layers,nn.MaxPool2d):\n                    stack.push(temp)\n\n                temp = layers(temp)\n\n                # met a unpooling layer, take its output\n                if isinstance(layers, nn.Upsample):\n                    if name == 'offset':\n                        temp = torch.cat((temp,stack.pop()),dim=1)  # short cut here, but optical flow should concat instead of add\n                    else:\n                        temp += stack.pop()  # short cut here, but optical flow should concat instead of add\n            k += 1\n        return temp\n\n    '''keep this funtion'''\n    def get_MonoNet5(self, channel_in, channel_out, name):\n\n        '''\n        Generally, the MonoNet is aimed to provide a basic module for generating either offset, or filter, or occlusion.\n\n        :param channel_in: number of channels 
that composed of multiple useful information like reference frame, previous coarser-scale result\n        :param channel_out: number of output the offset or filter or occlusion\n        :param name: to distinguish between offset, filter and occlusion, since they should use different activations in the last network layer\n\n        :return: output the network model\n        '''\n        model = []\n\n        # block1\n        model += self.conv_relu(channel_in * 2, 16, (3, 3), (1, 1))\n        model += self.conv_relu_maxpool(16, 32, (3, 3), (1, 1), (2, 2))  # THE OUTPUT No.5\n        # block2\n        model += self.conv_relu_maxpool(32, 64, (3, 3), (1, 1), (2, 2))  # THE OUTPUT No.4\n        # block3\n        model += self.conv_relu_maxpool(64, 128, (3, 3), (1, 1), (2, 2))  # THE OUTPUT No.3\n        # block4\n        model += self.conv_relu_maxpool(128, 256, (3, 3), (1, 1), (2, 2))  # THE OUTPUT No.2\n        # block5\n        model += self.conv_relu_maxpool(256, 512, (3, 3), (1, 1), (2, 2))\n\n        # intermediate block5_5\n        model += self.conv_relu(512, 512, (3, 3), (1, 1))\n\n        # block 6\n        model += self.conv_relu_unpool(512, 256, (3, 3), (1, 1), 2)  # THE OUTPUT No.1 UP\n        # block 7\n        model += self.conv_relu_unpool(256, 128, (3, 3), (1, 1), 2)  # THE OUTPUT No.2 UP\n        # block 8\n        model += self.conv_relu_unpool(128, 64, (3, 3), (1, 1), 2)  # THE OUTPUT No.3 UP\n\n        # block 9\n        model += self.conv_relu_unpool(64, 32, (3, 3), (1, 1), 2)  # THE OUTPUT No.4 UP\n\n        # block 10\n        model += self.conv_relu_unpool(32,  16, (3, 3), (1, 1), 2)  # THE OUTPUT No.5 UP\n\n        # output our final purpose\n        branch1 = []\n        branch2 = []\n        branch1 += self.conv_relu_conv(16, channel_out,  (3, 3), (1, 1))\n        branch2 += self.conv_relu_conv(16, channel_out,  (3, 3), (1, 1))\n\n        return  (nn.ModuleList(model), nn.ModuleList(branch1), nn.ModuleList(branch2))\n\n    '''keep this 
function'''\n    @staticmethod\n    def FlowProject(inputs, depth = None):\n        if depth is not None:\n            outputs = [DepthFlowProjectionModule(input.requires_grad)(input,depth) for input in inputs]\n        else:\n            outputs = [ FlowProjectionModule(input.requires_grad)(input) for input in inputs]\n        return outputs\n\n\n    '''keep this function'''\n    @staticmethod\n    def FilterInterpolate_ctx(ctx0,ctx2,offset,filter):\n        ##TODO: which way should I choose\n\n        ctx0_offset = FilterInterpolationModule()(ctx0,offset[0].detach(),filter[0].detach())\n        ctx2_offset = FilterInterpolationModule()(ctx2,offset[1].detach(),filter[1].detach())\n\n        return ctx0_offset, ctx2_offset\n        # ctx0_offset = FilterInterpolationModule()(ctx0.detach(), offset[0], filter[0])\n        # ctx2_offset = FilterInterpolationModule()(ctx2.detach(), offset[1], filter[1])\n        #\n        # return ctx0_offset, ctx2_offset\n    '''Keep this function'''\n    @staticmethod\n    def FilterInterpolate(ref0, ref2, offset, filter,filter_size2):\n        ref0_offset = FilterInterpolationModule()(ref0, offset[0],filter[0])\n        ref2_offset = FilterInterpolationModule()(ref2, offset[1],filter[1])\n        return ref0_offset/2.0 + ref2_offset/2.0, ref0_offset,ref2_offset\n\n    '''keep this function'''\n    @staticmethod\n    def conv_relu_conv(input_filter, output_filter, kernel_size,\n                        padding):\n\n        # we actually don't need to use so much layer in the last stages.\n        layers = nn.Sequential(\n            nn.Conv2d(input_filter, input_filter, kernel_size, 1, padding),\n            nn.ReLU(inplace=False),\n            nn.Conv2d(input_filter, output_filter, kernel_size, 1, padding),\n            # nn.ReLU(inplace=False),\n            # nn.Conv2d(output_filter, output_filter, kernel_size, 1, padding),\n            # nn.ReLU(inplace=False),\n            # nn.Conv2d(output_filter, output_filter, kernel_size, 1, 
padding),\n        )\n        return layers\n\n\n    '''keep this function'''\n    @staticmethod\n    def conv_relu(input_filter, output_filter, kernel_size,\n                        padding):\n        layers = nn.Sequential(*[\n            nn.Conv2d(input_filter,output_filter,kernel_size,1, padding),\n\n            nn.ReLU(inplace=False)\n        ])\n        return layers\n\n    '''keep this function'''\n    @staticmethod\n    def conv_relu_maxpool(input_filter, output_filter, kernel_size,\n                            padding,kernel_size_pooling):\n\n        layers = nn.Sequential(*[\n            nn.Conv2d(input_filter,output_filter,kernel_size,1, padding),\n\n            nn.ReLU(inplace=False),\n\n            # nn.BatchNorm2d(output_filter),\n\n            nn.MaxPool2d(kernel_size_pooling)\n        ])\n        return layers\n\n    '''keep this function'''\n    @staticmethod\n    def conv_relu_unpool(input_filter, output_filter, kernel_size,\n                            padding,unpooling_factor):\n\n        layers = nn.Sequential(*[\n\n            nn.Upsample(scale_factor=unpooling_factor, mode='bilinear'),\n\n            nn.Conv2d(input_filter,output_filter,kernel_size,1, padding),\n\n            nn.ReLU(inplace=False),\n\n            # nn.BatchNorm2d(output_filter),\n\n\n            # nn.UpsamplingBilinear2d(unpooling_size,scale_factor=unpooling_size[0])\n        ])\n        return layers\n"
  },
  {
    "path": "networks/DAIN_slowmotion.py",
    "content": "# -*- coding: utf-8 -*-\nimport torch\nimport torch.nn as nn\nfrom my_package.FilterInterpolation import  FilterInterpolationModule\nfrom my_package.FlowProjection import  FlowProjectionModule #,FlowFillholeModule\nfrom my_package.DepthFlowProjection import DepthFlowProjectionModule\n\nfrom Stack import Stack\n\nimport PWCNet\nimport S2D_models\nimport Resblock\nimport MegaDepth\nimport time\n\nclass DAIN_slowmotion(torch.nn.Module):\n    def __init__(self,\n                 channel = 3,\n                 filter_size = 4,\n                 timestep=0.5,\n                 training=True):\n\n        # base class initialization\n        super(DAIN_slowmotion, self).__init__()\n        \n        self.filter_size = filter_size\n        self.training = training\n        self.timestep = timestep        \n        self.numFrames =int(1.0/timestep) - 1\n        print(\"Interpolate \" +str( self.numFrames )+ \" frames\")\n        i = 0\n        self.initScaleNets_filter,self.initScaleNets_filter1,self.initScaleNets_filter2 = \\\n            self.get_MonoNet5(channel if i == 0 else channel + filter_size * filter_size, filter_size * filter_size, \"filter\")\n\n        self.ctxNet = S2D_models.__dict__['S2DF_3dense']()\n        self.ctx_ch = 3 * 64 + 3\n\n        self.rectifyNet = Resblock.__dict__['MultipleBasicBlock_4'](3 + 3 + 3 +2*1+ 2*2 +16*2+ 2 * self.ctx_ch,128)\n\n        self._initialize_weights()\n        \n        if self.training:\n            self.flownets = PWCNet.__dict__['pwc_dc_net'](\"PWCNet/pwc_net.pth.tar\")\n        else:\n            self.flownets = PWCNet.__dict__['pwc_dc_net']()\n        self.div_flow = 20.0\n\n        #extract depth information\n        if self.training:\n            self.depthNet=MegaDepth.__dict__['HourGlass'](\"MegaDepth/checkpoints/test_local/best_generalization_net_G.pth\")\n        else:\n            self.depthNet=MegaDepth.__dict__['HourGlass']()\n\n        return\n\n    def _initialize_weights(self):\n        
count = 0\n        for m in self.modules():\n            if isinstance(m, nn.Conv2d):\n                # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels\n                # m.weight.data.normal_(0, math.sqrt(2. / n))\n                # print(m)\n                count+=1\n                # print(count)\n                # weight_init.xavier_uniform(m.weight.data)\n                nn.init.xavier_uniform_(m.weight.data)\n                # weight_init.kaiming_uniform(m.weight.data, a = 0, mode='fan_in')\n                if m.bias is not None:\n                    m.bias.data.zero_()\n            elif isinstance(m, nn.BatchNorm2d):\n                m.weight.data.fill_(1)\n                m.bias.data.zero_()\n            elif isinstance(m, nn.Linear):\n                m.weight.data.normal_(0, 0.01)\n                m.bias.data.zero_()\n            # else:\n            #     print(m)\n\n\n    def forward(self, input):\n\n        \"\"\"\n        Parameters\n        ----------\n        input: shape (3, batch, 3, width, height)\n        -----------\n        \"\"\"\n        losses = []\n        offsets= []\n        filters = []\n        occlusions = []\n\n        device = torch.cuda.current_device()\n        # s1 = torch.cuda.Stream(device=device, priority=5)\n        # s2 = torch.cuda.Stream(device=device, priority=10) #PWC-Net is slow, need to have higher priority\n        s1 = torch.cuda.current_stream()\n        s2 = torch.cuda.current_stream()\n\n        '''\n            STEP 1: sequeeze the input \n        '''\n        if self.training == True:\n            assert input.size(0) == 3\n            input_0,input_1,input_2 = torch.squeeze(input,dim=0)\n        else:\n            assert input.size(0) ==2\n            input_0,input_2 = torch.squeeze(input,dim=0)\n\n\n        #prepare the input data of current scale\n        cur_input_0 = input_0\n        if self.training == True:\n            cur_input_1 = input_1\n        cur_input_2 =  input_2\n\n        '''\n        
    STEP 3.2: concatenating the inputs.\n        '''\n        cur_offset_input = torch.cat((cur_input_0, cur_input_2), dim=1)\n        cur_filter_input = cur_offset_input # torch.cat((cur_input_0, cur_input_2), dim=1)\n\n        '''\n            STEP 3.3: perform the estimation by the Three subpath Network \n        '''\n        time_offsets = [ kk * self.timestep for kk in range(1, 1+self.numFrames,1)]\n\n        with torch.cuda.stream(s1):\n            temp  = self.depthNet(torch.cat((cur_filter_input[:, :3, ...],\n                                             cur_filter_input[:, 3:, ...]),dim=0))\n            log_depth = [temp[:cur_filter_input.size(0)], temp[cur_filter_input.size(0):]]\n\n            cur_ctx_output = [\n                torch.cat((self.ctxNet(cur_filter_input[:, :3, ...]),\n                       log_depth[0].detach()), dim=1),\n                    torch.cat((self.ctxNet(cur_filter_input[:, 3:, ...]),\n                   log_depth[1].detach()), dim=1)\n                    ]\n            temp = self.forward_singlePath(self.initScaleNets_filter, cur_filter_input, 'filter')\n            cur_filter_output = [self.forward_singlePath(self.initScaleNets_filter1, temp, name=None),\n                             self.forward_singlePath(self.initScaleNets_filter2, temp, name=None)]\n\n\n            depth_inv = [1e-6 + 1 / torch.exp(d) for d in log_depth]\n\n        with torch.cuda.stream(s2):\n            for _ in range(1):\n                cur_offset_outputs = [\n                        self.forward_flownets(self.flownets, cur_offset_input, time_offsets=time_offsets),\n                        self.forward_flownets(self.flownets, torch.cat((cur_offset_input[:, 3:, ...],\n                                            cur_offset_input[:, 0:3, ...]), dim=1),\n                                  time_offsets=[1 - t for t in time_offsets])\n                        ]\n\n        torch.cuda.synchronize() #synchronize s1 and s2\n\n        cur_offset_outputs = [\n        
    self.FlowProject(cur_offset_outputs[0],depth_inv[0]),\n            self.FlowProject(cur_offset_outputs[1],depth_inv[1])\n                ]\n\n        '''\n            STEP 3.4: perform the frame interpolation process \n        '''\n        cur_output_rectified = []\n        cur_output = []\n        \n        for temp_0,temp_1, timeoffset in zip(cur_offset_outputs[0], cur_offset_outputs[1], time_offsets):\n            cur_offset_output = [temp_0,temp_1] #[cur_offset_outputs[0][0], cur_offset_outputs[1][0]]\n            ctx0,ctx2 = self.FilterInterpolate_ctx(cur_ctx_output[0],cur_ctx_output[1],\n                               cur_offset_output,cur_filter_output, timeoffset)\n\n\n            cur_output_temp ,ref0,ref2 = self.FilterInterpolate(cur_input_0, cur_input_2,cur_offset_output,\n                                          cur_filter_output,self.filter_size**2, timeoffset)\n            cur_output.append(cur_output_temp)\n\n            rectify_input = torch.cat((cur_output_temp,ref0,ref2,\n                                        cur_offset_output[0],cur_offset_output[1],\n                                        cur_filter_output[0],cur_filter_output[1],\n                                        ctx0,ctx2\n                                        ),dim =1)\n            cur_output_rectified_temp = self.rectifyNet(rectify_input) + cur_output_temp\n            cur_output_rectified.append(cur_output_rectified_temp)\n\n        '''\n            STEP 3.5: for training phase, we collect the variables to be penalized.\n        '''\n        if self.training == True:\n                losses +=[cur_output - cur_input_1]\n                losses += [cur_output_rectified - cur_input_1]                \n                offsets +=[cur_offset_output]\n                filters += [cur_filter_output]\n        '''\n            STEP 4: return the results\n        '''\n        if self.training == True:\n            # if in the training phase, we output the losses to be minimized.\n      
      # return losses, loss_occlusion\n            return losses, offsets,filters,occlusions\n        else:\n            cur_outputs = [cur_output,cur_output_rectified]\n            return cur_outputs,cur_offset_output,cur_filter_output\n\n    def forward_flownets(self, model, input, time_offsets = None):\n\n        if time_offsets == None :\n            time_offsets = [0.5]\n        elif type(time_offsets) == float:\n            time_offsets = [time_offsets]\n        elif type(time_offsets) == list:\n            pass\n        temp = model(input)  # this is a single direction motion results, but not a bidirectional one\n\n        temps = [self.div_flow * temp * time_offset for time_offset in time_offsets]# single direction to bidirection should haven it.\n        temps = [nn.Upsample(scale_factor=4, mode='bilinear')(temp)  for temp in temps]# nearest interpolation won't be better i think\n        return temps\n\n    '''keep this function'''\n    def forward_singlePath(self, modulelist, input, name):\n        stack = Stack()\n\n        k = 0\n        temp = []\n        for layers in modulelist:  # self.initScaleNets_offset:\n            # print(type(layers).__name__)\n            # print(k)\n            # if k == 27:\n            #     print(k)\n            #     pass\n            # use the pop-pull logic, looks like a stack.\n            if k == 0:\n                temp = layers(input)\n            else:\n                # met a pooling layer, take its input\n                if isinstance(layers, nn.AvgPool2d) or isinstance(layers,nn.MaxPool2d):\n                    stack.push(temp)\n\n                temp = layers(temp)\n\n                # met a unpooling layer, take its output\n                if isinstance(layers, nn.Upsample):\n                    if name == 'offset':\n                        temp = torch.cat((temp,stack.pop()),dim=1)  # short cut here, but optical flow should concat instead of add\n                    else:\n                        temp += 
stack.pop()  # short cut here, but optical flow should concat instead of add\n            k += 1\n        return temp\n\n    '''keep this funtion'''\n    def get_MonoNet5(self, channel_in, channel_out, name):\n\n        '''\n        Generally, the MonoNet is aimed to provide a basic module for generating either offset, or filter, or occlusion.\n\n        :param channel_in: number of channels that composed of multiple useful information like reference frame, previous coarser-scale result\n        :param channel_out: number of output the offset or filter or occlusion\n        :param name: to distinguish between offset, filter and occlusion, since they should use different activations in the last network layer\n\n        :return: output the network model\n        '''\n        model = []\n\n        # block1\n        model += self.conv_relu(channel_in * 2, 16, (3, 3), (1, 1))\n        model += self.conv_relu_maxpool(16, 32, (3, 3), (1, 1), (2, 2))  # THE OUTPUT No.5\n        # block2\n        model += self.conv_relu_maxpool(32, 64, (3, 3), (1, 1), (2, 2))  # THE OUTPUT No.4\n        # block3\n        model += self.conv_relu_maxpool(64, 128, (3, 3), (1, 1), (2, 2))  # THE OUTPUT No.3\n        # block4\n        model += self.conv_relu_maxpool(128, 256, (3, 3), (1, 1), (2, 2))  # THE OUTPUT No.2\n        # block5\n        model += self.conv_relu_maxpool(256, 512, (3, 3), (1, 1), (2, 2))\n\n        # intermediate block5_5\n        model += self.conv_relu(512, 512, (3, 3), (1, 1))\n\n        # block 6\n        model += self.conv_relu_unpool(512, 256, (3, 3), (1, 1), 2)  # THE OUTPUT No.1 UP\n        # block 7\n        model += self.conv_relu_unpool(256, 128, (3, 3), (1, 1), 2)  # THE OUTPUT No.2 UP\n        # block 8\n        model += self.conv_relu_unpool(128, 64, (3, 3), (1, 1), 2)  # THE OUTPUT No.3 UP\n\n        # block 9\n        model += self.conv_relu_unpool(64, 32, (3, 3), (1, 1), 2)  # THE OUTPUT No.4 UP\n\n        # block 10\n        model += 
self.conv_relu_unpool(32,  16, (3, 3), (1, 1), 2)  # THE OUTPUT No.5 UP\n\n        # output our final purpose\n        branch1 = []\n        branch2 = []\n        branch1 += self.conv_relu_conv(16, channel_out,  (3, 3), (1, 1))\n        branch2 += self.conv_relu_conv(16, channel_out,  (3, 3), (1, 1))\n\n        return  (nn.ModuleList(model), nn.ModuleList(branch1), nn.ModuleList(branch2))\n\n    '''keep this function'''\n    @staticmethod\n    def FlowProject(inputs, depth = None):\n        if depth is not None:\n            outputs = [DepthFlowProjectionModule(input.requires_grad)(input,depth) for input in inputs]\n        else:\n            outputs = [ FlowProjectionModule(input.requires_grad)(input) for input in inputs]\n        return outputs\n\n\n    '''keep this function'''\n    @staticmethod\n    def FilterInterpolate_ctx(ctx0,ctx2,offset,filter, timeoffset):\n        ##TODO: which way should I choose\n\n        ctx0_offset = FilterInterpolationModule()(ctx0,offset[0].detach(),filter[0].detach())\n        ctx2_offset = FilterInterpolationModule()(ctx2,offset[1].detach(),filter[1].detach())\n\n        return ctx0_offset, ctx2_offset\n        # ctx0_offset = FilterInterpolationModule()(ctx0.detach(), offset[0], filter[0])\n        # ctx2_offset = FilterInterpolationModule()(ctx2.detach(), offset[1], filter[1])\n        #\n        # return ctx0_offset, ctx2_offset\n    '''Keep this function'''\n    @staticmethod\n    def FilterInterpolate(ref0, ref2, offset, filter,filter_size2, time_offset):\n        ref0_offset = FilterInterpolationModule()(ref0, offset[0],filter[0])\n        ref2_offset = FilterInterpolationModule()(ref2, offset[1],filter[1])\n\n        # occlusion0, occlusion2 = torch.split(occlusion, 1, dim=1)\n        # print((occlusion0[0,0,1,1] + occlusion2[0,0,1,1]))\n        # output = (occlusion0 * ref0_offset + occlusion2 * ref2_offset) / (occlusion0 + occlusion2)\n        # output = * ref0_offset + occlusion[1] * ref2_offset\n        # 
automatically broadcasting the occlusion to the three channels of an image.\n        # return output\n        # return ref0_offset/2.0 + ref2_offset/2.0, ref0_offset,ref2_offset\n        return ref0_offset*(1.0 - time_offset) + ref2_offset*(time_offset), ref0_offset, ref2_offset\n\n    '''keep this function'''\n    @staticmethod\n    def conv_relu_conv(input_filter, output_filter, kernel_size,\n                        padding):\n\n        # we actually don't need to use so much layer in the last stages.\n        layers = nn.Sequential(\n            nn.Conv2d(input_filter, input_filter, kernel_size, 1, padding),\n            nn.ReLU(inplace=False),\n            nn.Conv2d(input_filter, output_filter, kernel_size, 1, padding),\n            # nn.ReLU(inplace=False),\n            # nn.Conv2d(output_filter, output_filter, kernel_size, 1, padding),\n            # nn.ReLU(inplace=False),\n            # nn.Conv2d(output_filter, output_filter, kernel_size, 1, padding),\n        )\n        return layers\n\n\n    '''keep this function'''\n    @staticmethod\n    def conv_relu(input_filter, output_filter, kernel_size,\n                        padding):\n        layers = nn.Sequential(*[\n            nn.Conv2d(input_filter,output_filter,kernel_size,1, padding),\n\n            nn.ReLU(inplace=False)\n        ])\n        return layers\n\n    '''keep this function'''\n    @staticmethod\n    def conv_relu_maxpool(input_filter, output_filter, kernel_size,\n                            padding,kernel_size_pooling):\n\n        layers = nn.Sequential(*[\n            nn.Conv2d(input_filter,output_filter,kernel_size,1, padding),\n\n            nn.ReLU(inplace=False),\n\n            # nn.BatchNorm2d(output_filter),\n\n            nn.MaxPool2d(kernel_size_pooling)\n        ])\n        return layers\n\n    '''keep this function'''\n    @staticmethod\n    def conv_relu_unpool(input_filter, output_filter, kernel_size,\n                            padding,unpooling_factor):\n\n        layers = 
nn.Sequential(*[\n\n            nn.Upsample(scale_factor=unpooling_factor, mode='bilinear'),\n\n            nn.Conv2d(input_filter,output_filter,kernel_size,1, padding),\n\n            nn.ReLU(inplace=False),\n\n            # nn.BatchNorm2d(output_filter),\n\n\n            # nn.UpsamplingBilinear2d(unpooling_size,scale_factor=unpooling_size[0])\n        ])\n        return layers"
  },
  {
    "path": "networks/__init__.py",
    "content": "from .DAIN import DAIN\nfrom .DAIN_slowmotion import DAIN_slowmotion\n__all__ = (\n           'DAIN',\n           'DAIN_slowmotion'\n)\n\n"
  },
  {
    "path": "train.py",
    "content": "import sys\nimport os\n\nimport threading\nimport torch\nfrom torch.autograd import Variable\nimport torch.utils.data\nfrom lr_scheduler import *\n\nimport numpy\nfrom AverageMeter import  *\nfrom loss_function import *\nimport datasets\nimport balancedsampler\nimport networks\nfrom my_args import args\n\n\n\ndef train():\n    torch.manual_seed(args.seed)\n\n    model = networks.__dict__[args.netName](channel=args.channels,\n                            filter_size = args.filter_size ,\n                            timestep=args.time_step,\n                            training=True)\n    if args.use_cuda:\n        print(\"Turn the model into CUDA\")\n        model = model.cuda()\n\n    if not args.SAVED_MODEL==None:\n        # args.SAVED_MODEL ='../model_weights/'+ args.SAVED_MODEL + \"/best\" + \".pth\"\n        args.SAVED_MODEL ='./model_weights/best.pth'\n        print(\"Fine tuning on \" +  args.SAVED_MODEL)\n        if not  args.use_cuda:\n            pretrained_dict = torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage)\n            # model.load_state_dict(torch.load(args.SAVED_MODEL, map_location=lambda storage, loc: storage))\n        else:\n            pretrained_dict = torch.load(args.SAVED_MODEL)\n            # model.load_state_dict(torch.load(args.SAVED_MODEL))\n        #print([k for k,v in      pretrained_dict.items()])\n\n        model_dict = model.state_dict()\n        # 1. filter out unnecessary keys\n        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}\n        # 2. overwrite entries in the existing state dict\n        model_dict.update(pretrained_dict)\n        # 3. 
load the new state dict\n        model.load_state_dict(model_dict)\n        pretrained_dict = None\n\n    if type(args.datasetName) == list:\n        train_sets, test_sets = [],[]\n        for ii, jj in zip(args.datasetName, args.datasetPath):\n            tr_s, te_s = datasets.__dict__[ii](jj, split = args.dataset_split,single = args.single_output, task = args.task)\n            train_sets.append(tr_s)\n            test_sets.append(te_s)\n        train_set = torch.utils.data.ConcatDataset(train_sets)\n        test_set = torch.utils.data.ConcatDataset(test_sets)\n    else:\n        train_set, test_set = datasets.__dict__[args.datasetName](args.datasetPath)\n    train_loader = torch.utils.data.DataLoader(\n        train_set, batch_size = args.batch_size,\n        sampler=balancedsampler.RandomBalancedSampler(train_set, int(len(train_set) / args.batch_size )),\n        num_workers= args.workers, pin_memory=True if args.use_cuda else False)\n\n    val_loader = torch.utils.data.DataLoader(test_set, batch_size=args.batch_size,\n                                             num_workers=args.workers, pin_memory=True if args.use_cuda else False)\n    print('{} samples found, {} train samples and {} test samples '.format(len(test_set)+len(train_set),\n                                                                           len(train_set),\n                                                                           len(test_set)))\n\n\n    # if not args.lr == 0:\n    print(\"train the interpolation net\")\n    optimizer = torch.optim.Adamax([\n                {'params': model.initScaleNets_filter.parameters(), 'lr': args.filter_lr_coe * args.lr},\n                {'params': model.initScaleNets_filter1.parameters(), 'lr': args.filter_lr_coe * args.lr},\n                {'params': model.initScaleNets_filter2.parameters(), 'lr': args.filter_lr_coe * args.lr},\n                {'params': model.ctxNet.parameters(), 'lr': args.ctx_lr_coe * args.lr},\n                {'params': 
model.flownets.parameters(), 'lr': args.flow_lr_coe * args.lr},\n                {'params': model.depthNet.parameters(), 'lr': args.depth_lr_coe * args.lr},\n                {'params': model.rectifyNet.parameters(), 'lr': args.rectify_lr}\n            ],\n                lr=args.lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=args.weight_decay)\n\n\n    scheduler = ReduceLROnPlateau(optimizer, 'min',factor=args.factor, patience=args.patience,verbose=True)\n\n    print(\"*********Start Training********\")\n    print(\"LR is: \"+ str(float(optimizer.param_groups[0]['lr'])))\n    print(\"EPOCH is: \"+ str(int(len(train_set) / args.batch_size )))\n    print(\"Num of EPOCH is: \"+ str(args.numEpoch))\n    def count_network_parameters(model):\n\n        parameters = filter(lambda p: p.requires_grad, model.parameters())\n        N = sum([numpy.prod(p.size()) for p in parameters])\n\n        return N\n    print(\"Num. of model parameters is :\" + str(count_network_parameters(model)))\n    if hasattr(model,'flownets'):\n        print(\"Num. of flow model parameters is :\" +\n              str(count_network_parameters(model.flownets)))\n    if hasattr(model,'initScaleNets_occlusion'):\n        print(\"Num. of initScaleNets_occlusion model parameters is :\" +\n              str(count_network_parameters(model.initScaleNets_occlusion) +\n                  count_network_parameters(model.initScaleNets_occlusion1) +\n        count_network_parameters(model.initScaleNets_occlusion2)))\n    if hasattr(model,'initScaleNets_filter'):\n        print(\"Num. of initScaleNets_filter model parameters is :\" +\n              str(count_network_parameters(model.initScaleNets_filter) +\n                  count_network_parameters(model.initScaleNets_filter1) +\n        count_network_parameters(model.initScaleNets_filter2)))\n    if hasattr(model, 'ctxNet'):\n        print(\"Num. 
of ctxNet model parameters is :\" +\n              str(count_network_parameters(model.ctxNet)))\n    if hasattr(model, 'depthNet'):\n        print(\"Num. of depthNet model parameters is :\" +\n              str(count_network_parameters(model.depthNet)))\n    if hasattr(model,'rectifyNet'):\n        print(\"Num. of rectifyNet model parameters is :\" +\n              str(count_network_parameters(model.rectifyNet)))\n\n    training_losses = AverageMeter()\n    auxiliary_data = []\n    saved_total_loss = 10e10\n    saved_total_PSNR = -1\n    ikk = 0\n    for kk in optimizer.param_groups:\n        if kk['lr'] > 0:\n            ikk = kk\n            break\n\n    for t in range(args.numEpoch):\n        print(\"The id of this in-training network is \" + str(args.uid))\n        print(args)\n        #Turn into training mode\n        model = model.train()\n\n        for i, (X0_half,X1_half, y_half) in enumerate(train_loader):\n\n            if i >= int(len(train_set) / args.batch_size ):\n                #(0 if t == 0 else EPOCH):#\n                break\n\n            X0_half = X0_half.cuda() if args.use_cuda else X0_half\n            X1_half = X1_half.cuda() if args.use_cuda else X1_half\n            y_half = y_half.cuda() if args.use_cuda else y_half\n\n            X0 = Variable(X0_half, requires_grad= False)\n            X1 = Variable(X1_half, requires_grad= False)\n            y  = Variable(y_half,requires_grad= False)\n\n            diffs, offsets,filters,occlusions = model(torch.stack((X0,y,X1),dim = 0))\n\n            pixel_loss, offset_loss, sym_loss = part_loss(diffs,offsets,occlusions, [X0,X1],epsilon=args.epsilon)\n\n            total_loss = sum(x*y if x > 0 else 0 for x,y in zip(args.alpha, pixel_loss))\n\n            training_losses.update(total_loss.item(), args.batch_size)\n            if i % max(1, int(int(len(train_set) / args.batch_size )/500.0)) == 0:\n\n                print(\"Ep [\" + str(t) +\"/\" + str(i) +\n                                    
\"]\\tl.r.: \" + str(round(float(ikk['lr']),7))+\n                                    \"\\tPix: \" + str([round(x.item(),5) for x in pixel_loss]) +\n                                    \"\\tTV: \" + str([round(x.item(),4)  for x in offset_loss]) +\n                                    \"\\tSym: \" + str([round(x.item(), 4) for x in sym_loss]) +\n                                    \"\\tTotal: \" + str([round(x.item(),5) for x in [total_loss]]) +\n                                    \"\\tAvg. Loss: \" + str([round(training_losses.avg, 5)]))\n\n            optimizer.zero_grad()\n            total_loss.backward()\n            optimizer.step()\n\n        if t == 1:\n            # delete the pre validation weights for cleaner workspace\n            if os.path.exists(args.save_path + \"/epoch\" + str(0) +\".pth\" ):\n                os.remove(args.save_path + \"/epoch\" + str(0) +\".pth\")\n\n        if os.path.exists(args.save_path + \"/epoch\" + str(t-1) +\".pth\"):\n            os.remove(args.save_path + \"/epoch\" + str(t-1) +\".pth\")\n        torch.save(model.state_dict(), args.save_path + \"/epoch\" + str(t) +\".pth\")\n\n        # print(\"\\t\\t**************Start Validation*****************\")\n        #Turn into evaluation mode\n\n        val_total_losses = AverageMeter()\n        val_total_pixel_loss = AverageMeter()\n        val_total_PSNR_loss = AverageMeter()\n        val_total_tv_loss = AverageMeter()\n        val_total_pws_loss = AverageMeter()\n        val_total_sym_loss = AverageMeter()\n\n        for i, (X0,X1,y) in enumerate(val_loader):\n            if i >=  int(len(test_set)/ args.batch_size):\n                break\n\n            with torch.no_grad():\n                X0 = X0.cuda() if args.use_cuda else X0\n                X1 = X1.cuda() if args.use_cuda else X1\n                y = y.cuda() if args.use_cuda else y\n\n                diffs, offsets,filters,occlusions = model(torch.stack((X0,y,X1),dim = 0))\n\n                pixel_loss, 
offset_loss,sym_loss = part_loss(diffs, offsets, occlusions, [X0,X1],epsilon=args.epsilon)\n\n                val_total_loss = sum(x * y for x, y in zip(args.alpha, pixel_loss))\n\n                per_sample_pix_error = torch.mean(torch.mean(torch.mean(diffs[args.save_which] ** 2,\n                                                                    dim=1),dim=1),dim=1)\n                per_sample_pix_error = per_sample_pix_error.data # extract tensor\n                psnr_loss = torch.mean(20 * torch.log(1.0/torch.sqrt(per_sample_pix_error)))/torch.log(torch.Tensor([10]))\n                #\n\n                val_total_losses.update(val_total_loss.item(),args.batch_size)\n                val_total_pixel_loss.update(pixel_loss[args.save_which].item(), args.batch_size)\n                val_total_tv_loss.update(offset_loss[0].item(), args.batch_size)\n                val_total_sym_loss.update(sym_loss[0].item(), args.batch_size)\n                val_total_PSNR_loss.update(psnr_loss[0],args.batch_size)\n                print(\".\",end='',flush=True)\n\n        print(\"\\nEpoch \" + str(int(t)) +\n              \"\\tlearning rate: \" + str(float(ikk['lr'])) +\n              \"\\tAvg Training Loss: \" + str(round(training_losses.avg,5)) +\n              \"\\tValidate Loss: \" + str([round(float(val_total_losses.avg), 5)]) +\n              \"\\tValidate PSNR: \" + str([round(float(val_total_PSNR_loss.avg), 5)]) +\n              \"\\tPixel Loss: \" + str([round(float(val_total_pixel_loss.avg), 5)]) +\n              \"\\tTV Loss: \" + str([round(float(val_total_tv_loss.avg), 4)]) +\n              \"\\tPWS Loss: \" + str([round(float(val_total_pws_loss.avg), 4)]) +\n              \"\\tSym Loss: \" + str([round(float(val_total_sym_loss.avg), 4)])\n              )\n\n        auxiliary_data.append([t, float(ikk['lr']),\n                                   training_losses.avg, val_total_losses.avg, val_total_pixel_loss.avg,\n                                   
val_total_tv_loss.avg,val_total_pws_loss.avg,val_total_sym_loss.avg])\n\n        numpy.savetxt(args.log, numpy.array(auxiliary_data), fmt='%.8f', delimiter=',')\n        training_losses.reset()\n\n        print(\"\\t\\tFinished an epoch, Check and Save the model weights\")\n            # we check the validation loss instead of training loss. OK~\n        if saved_total_loss >= val_total_losses.avg:\n            saved_total_loss = val_total_losses.avg\n            torch.save(model.state_dict(), args.save_path + \"/best\"+\".pth\")\n            print(\"\\t\\tBest Weights updated for decreased validation loss\\n\")\n\n        else:\n            print(\"\\t\\tWeights Not updated for undecreased validation loss\\n\")\n\n        #schedule the learning rate\n        scheduler.step(val_total_losses.avg)\n\n\n    print(\"*********Finish Training********\")\n\nif __name__ == '__main__':\n    sys.setrecursionlimit(100000)# 0xC00000FD exception for the recursive detach of gradients.\n    threading.stack_size(200000000)# 0xC00000FD exception for the recursive detach of gradients.\n    thread = threading.Thread(target=train)\n    thread.start()\n    thread.join()\n\n    exit(0)\n"
  }
]