Repository: visinf/irr
Branch: master
Commit: dacd07b1dc96
Files: 89
Total size: 240.7 MB

Directory structure:
gitextract_pwdsdpvy/

├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── augmentations.py
├── commandline.py
├── configuration.py
├── datasets/
│   ├── __init__.py
│   ├── common.py
│   ├── flyingThings3D.py
│   ├── flyingchairs.py
│   ├── flyingchairsOcc.py
│   ├── kitti_combined.py
│   ├── sintel.py
│   └── transforms.py
├── flyingchairsocc/
│   └── README.md
├── install.sh
├── logger.py
├── losses.py
├── main.py
├── models/
│   ├── IRR_FlowNet.py
│   ├── IRR_PWC.py
│   ├── __init__.py
│   ├── correlation_package/
│   │   ├── __init__.py
│   │   ├── correlation.py
│   │   ├── correlation_cuda.cc
│   │   ├── correlation_cuda_kernel.cu
│   │   ├── correlation_cuda_kernel.cuh
│   │   └── setup.py
│   ├── correlation_package_cu9/
│   │   ├── __init__.py
│   │   ├── correlation.py
│   │   ├── correlation_cuda.cc
│   │   ├── correlation_cuda_kernel.cu
│   │   ├── correlation_cuda_kernel.cuh
│   │   └── setup.py
│   ├── flownet1s.py
│   ├── flownet1s_irr.py
│   ├── flownet1s_irr_bi.py
│   ├── flownet1s_irr_occ.py
│   ├── flownet1s_irr_occ_bi.py
│   ├── flownet_modules.py
│   ├── irr_modules.py
│   ├── pwc_modules.py
│   ├── pwcnet.py
│   ├── pwcnet_bi.py
│   ├── pwcnet_irr.py
│   ├── pwcnet_irr_bi.py
│   ├── pwcnet_irr_occ.py
│   ├── pwcnet_irr_occ_bi.py
│   ├── pwcnet_occ.py
│   └── pwcnet_occ_bi.py
├── optim/
│   └── __init__.py
├── runtime.py
├── saved_check_point/
│   └── pwcnet/
│       ├── IRR-PWC_flyingchairsOcc/
│       │   ├── checkpoint_best.ckpt
│       │   └── checkpoint_latest.ckpt
│       ├── IRR-PWC_kitti/
│       │   ├── checkpoint_best.ckpt
│       │   └── checkpoint_latest.ckpt
│       ├── IRR-PWC_sintel/
│       │   ├── checkpoint_best.ckpt
│       │   └── checkpoint_latest.ckpt
│       ├── IRR-PWC_things3d/
│       │   ├── checkpoint_best.ckpt
│       │   └── checkpoint_latest.ckpt
│       ├── PWCNet/
│       │   └── checkpoint_best.ckpt
│       └── PWCNet-irr/
│           └── checkpoint_best.ckpt
├── scripts/
│   ├── IRR-FlowNet_flyingChairsOcc.sh
│   ├── IRR-PWC_flyingChairsOcc.sh
│   ├── IRR-PWC_kitti_train.sh
│   ├── IRR-PWC_kitti_train_full.sh
│   ├── IRR-PWC_sintel_train.sh
│   ├── IRR-PWC_sintel_train_full.sh
│   ├── IRR-PWC_things3d.sh
│   ├── flownet1s.sh
│   ├── flownet1s_irr1.sh
│   ├── flownet1s_irr2.sh
│   ├── pwcnet.sh
│   ├── pwcnet_irr.sh
│   └── validation/
│       ├── IRR-FlowNet_flyingChairs.sh
│       ├── IRR-PWC_flyingChairs.sh
│       ├── IRR-PWC_kitti.sh
│       ├── IRR-PWC_sintel.sh
│       ├── IRR-PWC_things3d.sh
│       ├── flownet1s.sh
│       ├── flownet1s_irr1.sh
│       ├── flownet1s_irr2.sh
│       ├── pwcnet.sh
│       └── pwcnet_irr.sh
├── tools.py
└── utils/
    ├── __init__.py
    ├── flow.py
    └── interpolation.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
*.pyc
*.so
*.o

================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# Iterative Residual Refinement <br/> for Joint Optical Flow and Occlusion Estimation

<img src=output.gif>

This repository is the PyTorch implementation of the paper:

**Iterative Residual Refinement for Joint Optical Flow and Occlusion Estimation (CVPR 2019)**  
[Junhwa Hur](https://sites.google.com/site/hurjunhwa) and [Stefan Roth](https://www.visinf.tu-darmstadt.de/team_members/sroth/sroth.en.jsp)  
Department of Computer Science, TU Darmstadt  
[[Preprint]](https://arxiv.org/pdf/1904.05290.pdf) &ensp; [[Proceeding]](http://openaccess.thecvf.com/content_CVPR_2019/papers/Hur_Iterative_Residual_Refinement_for_Joint_Optical_Flow_and_Occlusion_Estimation_CVPR_2019_paper.pdf) &ensp; [[Supplemental]](http://openaccess.thecvf.com/content_CVPR_2019/supplemental/Hur_Iterative_Residual_Refinement_CVPR_2019_supplemental.pdf)


Please cite the paper below if you find our paper and source codes are useful.  

    @inproceedings{Hur:2019:IRR,  
      Author = {Junhwa Hur and Stefan Roth},  
      Booktitle = {CVPR},  
      Title = {Iterative Residual Refinement for Joint Optical Flow and Occlusion Estimation},  
      Year = {2019}  
    }

Contact: junhwa.hur[at]visinf.tu-darmstadt.de

## Getting started
This code has been orginally developed under Anaconda(Python 3.6), PyTorch 0.4.1 and CUDA 8.0 on Ubuntu 16.04.

1. Please install the followings:

   - Anaconda
   - PyTorch (now compatible with __PyTorch 1.5.0__)
   - tqdm (`conda install -c conda-forge tqdm==4.40.0`)
   - (any missing packages that the code requires)


2. The datasets used for this projects are followings:
    - [FlyingChairsOcc dataset](https://github.com/visinf/irr/tree/master/flyingchairsocc)
    - [FlyingThings3D subset](https://lmb.informatik.uni-freiburg.de/resources/datasets/SceneFlowDatasets.en.html)
    - [MPI Sintel Dataset](http://sintel.is.tue.mpg.de/downloads) + [revised occlusion GT](https://download.visinf.tu-darmstadt.de/data/flyingchairs_occ/occlusions_rev.zip)
    - [KITTI Optical Flow 2015](http://www.cvlibs.net/datasets/kitti/eval_scene_flow.php?benchmark=flow) and [KITTI Optical Flow 2012](http://www.cvlibs.net/datasets/kitti/eval_stereo_flow.php?benchmark=flow)
    

## Training

The `scripts` folder contains training scripts of experiments demonstrated in the paper.  
To train the model, you can simply run the script file, e.g., `./IRR-PWC_flyingChairsOcc.sh`.  
In script files, please configure your own experiment directory (EXPERIMENTS_HOME) and dataset directory in your local system (e.g., SINTEL_HOME or KITTI_HOME).


## Pretrained Models

The `saved_check_point` contains the pretrained models of *i)* baseline, *ii)* baseline + irr, and *iii)* full models.  
Additional pretrained models in the ablations study (Table 1 in the main paper) and their training scripts are available upon request.

  
## Inference

The scripts for testing the pre-trained models are located in `scripts/validation`.


## Acknowledgement

Portions of the source code (e.g., training pipeline, runtime, argument parser, and logger) are from [Jochen Gast](https://scholar.google.com/citations?user=tmRcFacAAAAJ&hl=en)


================================================
FILE: __init__.py
================================================


================================================
FILE: augmentations.py
================================================
## Portions of Code from, copyright 2018 Jochen Gast

from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn
from utils.interpolation import Interp2, Interp2MaskBinary
from utils.interpolation import Meshgrid
import numpy as np


def denormalize_coords(xx, yy, width, height):
    """ scale indices from [-1, 1] to [0, width/height] """
    xx = 0.5 * (width - 1.0) * (xx.float() + 1.0)
    yy = 0.5 * (height - 1.0) * (yy.float() + 1.0)
    return xx, yy


def normalize_coords(xx, yy, width, height):
    """ scale indices from [0, width/height] to [-1, 1] """
    xx = (2.0 / (width - 1.0)) * xx.float() - 1.0
    yy = (2.0 / (height - 1.0)) * yy.float() - 1.0
    return xx, yy


def apply_transform_to_params(theta0, theta_transform):
    a1 = theta0[:, 0]
    a2 = theta0[:, 1]
    a3 = theta0[:, 2]
    a4 = theta0[:, 3]
    a5 = theta0[:, 4]
    a6 = theta0[:, 5]
    #
    b1 = theta_transform[:, 0]
    b2 = theta_transform[:, 1]
    b3 = theta_transform[:, 2]
    b4 = theta_transform[:, 3]
    b5 = theta_transform[:, 4]
    b6 = theta_transform[:, 5]
    #
    c1 = a1 * b1 + a4 * b2
    c2 = a2 * b1 + a5 * b2
    c3 = b3 + a3 * b1 + a6 * b2
    c4 = a1 * b4 + a4 * b5
    c5 = a2 * b4 + a5 * b5
    c6 = b6 + a3 * b4 + a6 * b5
    #
    new_theta = torch.stack([c1, c2, c3, c4, c5, c6], dim=1)
    return new_theta


class _IdentityParams(nn.Module):
    def __init__(self):
        super(_IdentityParams, self).__init__()
        self._batch_size = 0
        self.register_buffer("_o", torch.FloatTensor())
        self.register_buffer("_i", torch.FloatTensor())

    def _update(self, batch_size):
        torch.zeros([batch_size, 1], out=self._o)
        torch.ones([batch_size, 1], out=self._i)
        return torch.cat([self._i, self._o, self._o, self._o, self._i, self._o], dim=1)

    def forward(self, batch_size):
        if self._batch_size != batch_size:
            self._identity_params = self._update(batch_size)
            self._batch_size = batch_size
        return self._identity_params


class RandomMirror(nn.Module):
    def __init__(self, vertical=True, p=0.5):
        super(RandomMirror, self).__init__()
        self._batch_size = 0
        self._p = p
        self._vertical = vertical
        self.register_buffer("_mirror_probs", torch.FloatTensor())

    def update_probs(self, batch_size):
        torch.ones([batch_size, 1], out=self._mirror_probs)
        self._mirror_probs *= self._p

    def forward(self, theta1, theta2):
        batch_size = theta1.size(0)
        if batch_size != self._batch_size:
            self.update_probs(batch_size)
            self._batch_size = batch_size

        # apply random sign to a1 a2 a3 (these are the guys responsible for x)
        sign = torch.sign(2.0 * torch.bernoulli(self._mirror_probs) - 1.0)
        i = torch.ones_like(sign)
        horizontal_mirror = torch.cat([sign, sign, sign, i, i, i], dim=1)
        theta1 *= horizontal_mirror
        theta2 *= horizontal_mirror

        # apply random sign to a4 a5 a6 (these are the guys responsible for y)
        if self._vertical:
            sign = torch.sign(2.0 * torch.bernoulli(self._mirror_probs) - 1.0)
            vertical_mirror = torch.cat([i, i, i, sign, sign, sign], dim=1)
            theta1 *= vertical_mirror
            theta2 *= vertical_mirror

        return theta1, theta2


class RandomCrop(nn.Module):
    """Crops the given PIL.Image at a random location to have a region of
    the given size. size can be a tuple (target_height, target_width)
    or an integer, in which case the target will be of a square shape (size, size)
    """

    def __init__(self, crop):
        super(RandomCrop, self).__init__()
        self._crop_size = crop
        self.register_buffer("_x", torch.LongTensor())
        self.register_buffer("_y", torch.LongTensor())

    def forward(self, im1, im2, flo):
        batch_size, _, height, width = im1.size()
        crop_height, crop_width = self._crop_size

        # check whether there is anything to do
        if any(self._size < 1):
            return im1, im2, flo

        # get starting positions
        self._x.random_(0, width - crop_width)
        self._y.random_(0, height - crop_height)

        im1 = im1[:, :, self._y:self._y + crop_height, self._x:self._x + crop_width]
        im2 = im2[:, :, self._y:self._y + crop_height, self._x:self._x + crop_width]
        flo = flo[:, :, self._y:self._y + crop_height, self._x:self._x + crop_width]


class RandomAffineFlow(nn.Module):
    def __init__(self, args, addnoise=True):
        super(RandomAffineFlow, self).__init__()
        self._args = args
        self._interp2 = Interp2(clamp=False)
        self._flow_interp2 = Interp2(clamp=False)
        self._meshgrid = Meshgrid()
        self._identity = _IdentityParams()
        self._random_mirror = RandomMirror()
        self._addnoise = addnoise
        self.register_buffer("_noise1", torch.FloatTensor())
        self.register_buffer("_noise2", torch.FloatTensor())
        self.register_buffer("_xbounds", torch.FloatTensor([-1, -1, 1, 1]))
        self.register_buffer("_ybounds", torch.FloatTensor([-1, 1, -1, 1]))

    def inverse_transform_coords(self, width, height, thetas, offset_x=None, offset_y=None):
        xx, yy = self._meshgrid(width=width, height=height, device=thetas.device, dtype=thetas.dtype)

        xx = torch.unsqueeze(xx, dim=0).float()
        yy = torch.unsqueeze(yy, dim=0).float()

        if offset_x is not None:
            xx = xx + offset_x
        if offset_y is not None:
            yy = yy + offset_y

        a1 = thetas[:, 0].contiguous().view(-1, 1, 1)
        a2 = thetas[:, 1].contiguous().view(-1, 1, 1)
        a3 = thetas[:, 2].contiguous().view(-1, 1, 1)
        a4 = thetas[:, 3].contiguous().view(-1, 1, 1)
        a5 = thetas[:, 4].contiguous().view(-1, 1, 1)
        a6 = thetas[:, 5].contiguous().view(-1, 1, 1)

        xx, yy = normalize_coords(xx, yy, width=width, height=height)
        xq = a1 * xx + a2 * yy + a3
        yq = a4 * xx + a5 * yy + a6
        xq, yq = denormalize_coords(xq, yq, width=width, height=height)
        return xq, yq

    def transform_coords(self, width, height, thetas):
        xx1, yy1 = self._meshgrid(width=width, height=height, device=thetas.device, dtype=thetas.dtype)
        xx, yy = normalize_coords(xx1, yy1, width=width, height=height)

        def _unsqueeze12(u):
            return torch.unsqueeze(torch.unsqueeze(u, dim=1), dim=1)

        a1 = _unsqueeze12(thetas[:, 0])
        a2 = _unsqueeze12(thetas[:, 1])
        a3 = _unsqueeze12(thetas[:, 2])
        a4 = _unsqueeze12(thetas[:, 3])
        a5 = _unsqueeze12(thetas[:, 4])
        a6 = _unsqueeze12(thetas[:, 5])
        #
        z = a1 * a5 - a2 * a4
        b1 = a5 / z
        b2 = - a2 / z
        b4 = - a4 / z
        b5 = a1 / z
        #
        xhat = xx - a3
        yhat = yy - a6
        xq = b1 * xhat + b2 * yhat
        yq = b4 * xhat + b5 * yhat

        xq, yq = denormalize_coords(xq, yq, width=width, height=height)
        return xq, yq

    def find_invalid(self, width, height, thetas):
        x = self._xbounds
        y = self._ybounds
        #
        a1 = torch.unsqueeze(thetas[:, 0], dim=1)
        a2 = torch.unsqueeze(thetas[:, 1], dim=1)
        a3 = torch.unsqueeze(thetas[:, 2], dim=1)
        a4 = torch.unsqueeze(thetas[:, 3], dim=1)
        a5 = torch.unsqueeze(thetas[:, 4], dim=1)
        a6 = torch.unsqueeze(thetas[:, 5], dim=1)
        #
        z = a1 * a5 - a2 * a4
        b1 = a5 / z
        b2 = - a2 / z
        b4 = - a4 / z
        b5 = a1 / z
        #
        xhat = x - a3
        yhat = y - a6
        xq = b1 * xhat + b2 * yhat
        yq = b4 * xhat + b5 * yhat
        xq, yq = denormalize_coords(xq, yq, width=width, height=height)
        #
        invalid = (
                      (xq < 0) | (yq < 0) | (xq >= width) | (yq >= height)
                  ).sum(dim=1, keepdim=True) > 0

        return invalid

    def apply_random_transforms_to_params(self,
                                          theta0,
                                          max_translate,
                                          min_zoom, max_zoom,
                                          min_squeeze, max_squeeze,
                                          min_rotate, max_rotate,
                                          validate_size=None):
        max_translate *= 0.5
        batch_size = theta0.size(0)
        height, width = validate_size

        # collect valid params here
        thetas = torch.zeros_like(theta0)

        zoom = theta0.new(batch_size, 1).zero_()
        squeeze = torch.zeros_like(zoom)
        tx = torch.zeros_like(zoom)
        ty = torch.zeros_like(zoom)
        phi = torch.zeros_like(zoom)
        invalid = torch.ones_like(zoom).byte()

        while invalid.sum() > 0:
            # random sampling
            zoom.uniform_(min_zoom, max_zoom)
            squeeze.uniform_(min_squeeze, max_squeeze)
            tx.uniform_(-max_translate, max_translate)
            ty.uniform_(-max_translate, max_translate)
            phi.uniform_(min_rotate, max_rotate)

            # construct affine parameters
            sx = zoom * squeeze
            sy = zoom / squeeze
            sin_phi = torch.sin(phi)
            cos_phi = torch.cos(phi)
            b1 = cos_phi * sx
            b2 = sin_phi * sy
            b3 = tx
            b4 = - sin_phi * sx
            b5 = cos_phi * sy
            b6 = ty

            theta_transform = torch.cat([b1, b2, b3, b4, b5, b6], dim=1)
            theta_try = apply_transform_to_params(theta0, theta_transform)
            thetas = invalid.float() * theta_try + (1 - invalid.float()) * thetas

            # compute new invalid ones
            invalid = self.find_invalid(width=width, height=height, thetas=thetas)

        # here we should have good thetas within borders
        return thetas

    def transform_image(self, images, thetas):
        batch_size, channels, height, width = images.size()
        xq, yq = self.transform_coords(width=width, height=height, thetas=thetas)
        transformed = self._interp2(images, xq, yq)
        return transformed

    def transform_flow(self, flow, theta1, theta2):
        batch_size, channels, height, width = flow.size()
        u = flow[:, 0, :, :]
        v = flow[:, 1, :, :]

        # inverse transform coords
        x0, y0 = self.inverse_transform_coords(
            width=width, height=height, thetas=theta1)

        x1, y1 = self.inverse_transform_coords(
            width=width, height=height, thetas=theta2, offset_x=u, offset_y=v)

        # subtract and create new flow
        u = x1 - x0
        v = y1 - y0
        new_flow = torch.stack([u, v], dim=1)

        # transform coords
        xq, yq = self.transform_coords(width=width, height=height, thetas=theta1)

        # interp2
        transformed = self._flow_interp2(new_flow, xq, yq)
        return transformed

    def forward(self, example_dict):
        im1 = example_dict["input1"]
        im2 = example_dict["input2"]
        flo = example_dict["target1"]

        batch_size = im1.size(0)
        height = im1.size(2)
        width = im1.size(3)

        # identity = no transform
        theta0 = self._identity(batch_size)

        # # global transform
        theta1 = self.apply_random_transforms_to_params(
            theta0,
            max_translate=0.2,
            min_zoom=1.0, max_zoom=1.5,
            min_squeeze=0.86, max_squeeze=1.16,
            min_rotate=-0.2, max_rotate=0.2,
            validate_size=[height, width])

        # relative transform
        theta2 = self.apply_random_transforms_to_params(
            theta1,
            max_translate=0.015,
            min_zoom=0.985, max_zoom=1.015,
            min_squeeze=1.0, max_squeeze=1.0,
            min_rotate=-0.015, max_rotate=0.015,
            validate_size=[height, width])

        # random flip images
        theta1, theta2 = self._random_mirror(theta1, theta2)

        im1 = self.transform_image(im1, theta1)
        im2 = self.transform_image(im2, theta2)
        flo = self.transform_flow(flo, theta1, theta2)

        if self._addnoise:
            stddev = np.random.uniform(0.0, 0.04)
            self._noise1.resize_as_(im1)
            self._noise2.resize_as_(im2)
            self._noise1.normal_(std=stddev)
            self._noise2.normal_(std=stddev)
            im1 += self._noise1
            im2 += self._noise2
            im1.clamp_(0.0, 1.0)
            im2.clamp_(0.0, 1.0)

        # construct updated dictionaries
        example_dict["input1"] = im1
        example_dict["input2"] = im2
        example_dict["target1"] = flo

        return example_dict


class RandomAffineFlowOcc(nn.Module):
    def __init__(self, args, addnoise=True, crop=None):
        super(RandomAffineFlowOcc, self).__init__()
        self._args = args
        self._interp2 = Interp2(clamp=False)
        self._flow_interp2 = Interp2(clamp=False)
        self._meshgrid = Meshgrid()
        self._identity = _IdentityParams()
        self._random_mirror = RandomMirror()
        self._addnoise = addnoise
        self._crop = crop

        self.register_buffer("_noise1", torch.FloatTensor())
        self.register_buffer("_noise2", torch.FloatTensor())
        self.register_buffer("_xbounds", torch.FloatTensor([-1, -1, 1, 1]))
        self.register_buffer("_ybounds", torch.FloatTensor([-1, 1, -1, 1]))
        self.register_buffer("_x", torch.IntTensor(1))
        self.register_buffer("_y", torch.IntTensor(1))

    def inverse_transform_coords(self, width, height, thetas, offset_x=None, offset_y=None):
        xx, yy = self._meshgrid(width=width, height=height, device=thetas.device, dtype=thetas.dtype)

        xx = torch.unsqueeze(xx, dim=0).float()
        yy = torch.unsqueeze(yy, dim=0).float()

        if offset_x is not None:
            xx = xx + offset_x
        if offset_y is not None:
            yy = yy + offset_y

        a1 = thetas[:, 0].contiguous().view(-1, 1, 1)
        a2 = thetas[:, 1].contiguous().view(-1, 1, 1)
        a3 = thetas[:, 2].contiguous().view(-1, 1, 1)
        a4 = thetas[:, 3].contiguous().view(-1, 1, 1)
        a5 = thetas[:, 4].contiguous().view(-1, 1, 1)
        a6 = thetas[:, 5].contiguous().view(-1, 1, 1)

        xx, yy = normalize_coords(xx, yy, width=width, height=height)
        xq = a1 * xx + a2 * yy + a3
        yq = a4 * xx + a5 * yy + a6
        xq, yq = denormalize_coords(xq, yq, width=width, height=height)
        return xq, yq

    def transform_coords(self, width, height, thetas):
        xx1, yy1 = self._meshgrid(width=width, height=height, device=thetas.device, dtype=thetas.dtype)
        xx, yy = normalize_coords(xx1, yy1, width=width, height=height)

        def _unsqueeze12(u):
            return torch.unsqueeze(torch.unsqueeze(u, dim=1), dim=1)

        a1 = _unsqueeze12(thetas[:, 0])
        a2 = _unsqueeze12(thetas[:, 1])
        a3 = _unsqueeze12(thetas[:, 2])
        a4 = _unsqueeze12(thetas[:, 3])
        a5 = _unsqueeze12(thetas[:, 4])
        a6 = _unsqueeze12(thetas[:, 5])
        #
        z = a1 * a5 - a2 * a4
        b1 = a5 / z
        b2 = - a2 / z
        b4 = - a4 / z
        b5 = a1 / z
        #
        xhat = xx - a3
        yhat = yy - a6
        xq = b1 * xhat + b2 * yhat
        yq = b4 * xhat + b5 * yhat

        xq, yq = denormalize_coords(xq, yq, width=width, height=height)
        return xq, yq

    def find_invalid(self, width, height, thetas):
        x = self._xbounds
        y = self._ybounds
        #
        a1 = torch.unsqueeze(thetas[:, 0], dim=1)
        a2 = torch.unsqueeze(thetas[:, 1], dim=1)
        a3 = torch.unsqueeze(thetas[:, 2], dim=1)
        a4 = torch.unsqueeze(thetas[:, 3], dim=1)
        a5 = torch.unsqueeze(thetas[:, 4], dim=1)
        a6 = torch.unsqueeze(thetas[:, 5], dim=1)
        #
        z = a1 * a5 - a2 * a4
        b1 = a5 / z
        b2 = - a2 / z
        b4 = - a4 / z
        b5 = a1 / z
        #
        xhat = x - a3
        yhat = y - a6
        xq = b1 * xhat + b2 * yhat
        yq = b4 * xhat + b5 * yhat
        xq, yq = denormalize_coords(xq, yq, width=width, height=height)
        #
        invalid = (
                      (xq < 0) | (yq < 0) | (xq >= width) | (yq >= height)
                  ).sum(dim=1, keepdim=True) > 0

        return invalid

    def apply_random_transforms_to_params(self,
                                          theta0,
                                          max_translate,
                                          min_zoom, max_zoom,
                                          min_squeeze, max_squeeze,
                                          min_rotate, max_rotate,
                                          validate_size=None):
        max_translate *= 0.5
        batch_size = theta0.size(0)
        height, width = validate_size

        # collect valid params here
        thetas = torch.zeros_like(theta0)

        zoom = theta0.new(batch_size, 1).zero_()
        squeeze = torch.zeros_like(zoom)
        tx = torch.zeros_like(zoom)
        ty = torch.zeros_like(zoom)
        phi = torch.zeros_like(zoom)
        invalid = torch.ones_like(zoom).byte()

        while invalid.sum() > 0:
            # random sampling
            zoom.uniform_(min_zoom, max_zoom)
            squeeze.uniform_(min_squeeze, max_squeeze)
            tx.uniform_(-max_translate, max_translate)
            ty.uniform_(-max_translate, max_translate)
            phi.uniform_(min_rotate, max_rotate)

            # construct affine parameters
            sx = zoom * squeeze
            sy = zoom / squeeze
            sin_phi = torch.sin(phi)
            cos_phi = torch.cos(phi)
            b1 = cos_phi * sx
            b2 = sin_phi * sy
            b3 = tx
            b4 = - sin_phi * sx
            b5 = cos_phi * sy
            b6 = ty

            theta_transform = torch.cat([b1, b2, b3, b4, b5, b6], dim=1)
            theta_try = apply_transform_to_params(theta0, theta_transform)
            thetas = invalid.float() * theta_try + (1. - invalid.float()) * thetas

            # compute new invalid ones
            invalid = self.find_invalid(width=width, height=height, thetas=thetas)

        # here we should have good thetas within borders
        return thetas

    def transform_image(self, images, thetas):
        batch_size, channels, height, width = images.size()
        xq, yq = self.transform_coords(width=width, height=height, thetas=thetas)
        transformed = self._interp2(images, xq, yq)
        return transformed

    def transform_flow(self, flow, theta1, theta2):
        batch_size, channels, height, width = flow.size()
        u = flow[:, 0, :, :]
        v = flow[:, 1, :, :]

        # inverse transform coords
        x0, y0 = self.inverse_transform_coords(
            width=width, height=height, thetas=theta1)

        x1, y1 = self.inverse_transform_coords(
            width=width, height=height, thetas=theta2, offset_x=u, offset_y=v)

        # subtract and create new flow
        u = x1 - x0
        v = y1 - y0
        new_flow = torch.stack([u, v], dim=1)

        # transform coords
        xq, yq = self.transform_coords(width=width, height=height, thetas=theta1)

        # interp2
        transformed = self._flow_interp2(new_flow, xq, yq)
        return transformed

    def check_out_of_bound(self, flow, occ, batch_size):
        _, _, height, width = flow.size()
        u = flow[:, 0, :, :]
        v = flow[:, 1, :, :]
        xx, yy = self._meshgrid(width=width, height=height, device=flow.device, dtype=flow.dtype)
        xx = torch.unsqueeze(xx, dim=0).float()
        yy = torch.unsqueeze(yy, dim=0).float()
        xx = xx.expand(batch_size, -1, -1) + u
        yy = yy.expand(batch_size, -1, -1) + v

        out_of_bound = ((xx < 0) | (yy < 0) | (xx >= width) | (yy >= height)).float().unsqueeze(1)
        occ = torch.clamp(out_of_bound + occ, 0, 1)

        return occ

    def random_crop(self, im1, im2, flo_f, flo_b, occ1, occ2):

        _, _, height, width = im1.size()
        crop_height, crop_width = self._crop

        # get starting positions
        self._x.random_(0, width - crop_width + 1)
        self._y.random_(0, height - crop_height + 1)
        str_x = int(self._x)
        str_y = int(self._y)
        end_x = int(self._x + crop_width)
        end_y = int(self._y + crop_height)

        im1 = im1[:, :, str_y:end_y, str_x:end_x]
        im2 = im2[:, :, str_y:end_y, str_x:end_x]
        flo_f = flo_f[:, :, str_y:end_y, str_x:end_x]
        flo_b = flo_b[:, :, str_y:end_y, str_x:end_x]
        occ1 = occ1[:, :, str_y:end_y, str_x:end_x]
        occ2 = occ2[:, :, str_y:end_y, str_x:end_x]

        return im1, im2, flo_f, flo_b, occ1, occ2

    def forward(self, example_dict):
        im1 = example_dict["input1"]
        im2 = example_dict["input2"]
        flo_f = example_dict["target1"]
        flo_b = example_dict["target2"]
        occ1 = example_dict["target_occ1"]
        occ2 = example_dict["target_occ2"]

        batch_size = im1.size(0)
        height = im1.size(2)
        width = im1.size(3)

        # identity = no transform
        theta0 = self._identity(batch_size)

        # # global transform
        theta1 = self.apply_random_transforms_to_params(
            theta0,
            max_translate=0.2,
            min_zoom=1.0, max_zoom=1.5,
            min_squeeze=0.86, max_squeeze=1.16,
            min_rotate=-0.2, max_rotate=0.2,
            validate_size=[height, width])

        # relative transform
        theta2 = self.apply_random_transforms_to_params(
            theta1,
            max_translate=0.015,
            min_zoom=0.985, max_zoom=1.015,
            min_squeeze=1.0, max_squeeze=1.0,
            min_rotate=-0.015, max_rotate=0.015,
            validate_size=[height, width])

        # random flip images
        theta1, theta2 = self._random_mirror(theta1, theta2)

        im1 = self.transform_image(im1, theta1)
        im2 = self.transform_image(im2, theta2)
        flo_f = self.transform_flow(flo_f, theta1, theta2)
        flo_b = self.transform_flow(flo_b, theta2, theta1)
        occ1 = self.transform_image(occ1, theta1)
        occ2 = self.transform_image(occ2, theta2)

        if self._addnoise:
            stddev = np.random.uniform(0.0, 0.04)
            self._noise1.resize_as_(im1)
            self._noise2.resize_as_(im2)
            self._noise1.normal_(std=stddev)
            self._noise2.normal_(std=stddev)
            im1 += self._noise1
            im2 += self._noise2
            im1.clamp_(0.0, 1.0)
            im2.clamp_(0.0, 1.0)

        if self._crop is not None:
            im1, im2, flo_f, flo_b, occ1, occ2 = self.random_crop(im1, im2, flo_f, flo_b, occ1, occ2)

        occ1 = self.check_out_of_bound(flo_f, occ1, batch_size)
        occ2 = self.check_out_of_bound(flo_b, occ2, batch_size)

        example_dict["input1"] = im1
        example_dict["input2"] = im2
        example_dict["target1"] = flo_f
        example_dict["target2"] = flo_b
        example_dict["target_occ1"] = occ1
        example_dict["target_occ2"] = occ2

        return example_dict


class RandomAffineFlowOccSintel(nn.Module):
    def __init__(self, args, addnoise=True, crop=None):
        super(RandomAffineFlowOccSintel, self).__init__()
        self._args = args
        self._interp2 = Interp2(clamp=False)
        self._flow_interp2 = Interp2(clamp=False)
        self._meshgrid = Meshgrid()
        self._identity = _IdentityParams()
        self._random_mirror = RandomMirror()
        self._addnoise = addnoise
        self._crop = crop

        self.register_buffer("_noise1", torch.FloatTensor())
        self.register_buffer("_noise2", torch.FloatTensor())
        self.register_buffer("_xbounds", torch.FloatTensor([-1, -1, 1, 1]))
        self.register_buffer("_ybounds", torch.FloatTensor([-1, 1, -1, 1]))
        self.register_buffer("_x", torch.IntTensor(1))
        self.register_buffer("_y", torch.IntTensor(1))

    def inverse_transform_coords(self, width, height, thetas, offset_x=None, offset_y=None):
        xx, yy = self._meshgrid(width=width, height=height, device=thetas.device, dtype=thetas.dtype)

        xx = torch.unsqueeze(xx, dim=0).float()
        yy = torch.unsqueeze(yy, dim=0).float()

        if offset_x is not None:
            xx = xx + offset_x
        if offset_y is not None:
            yy = yy + offset_y

        a1 = thetas[:, 0].contiguous().view(-1, 1, 1)
        a2 = thetas[:, 1].contiguous().view(-1, 1, 1)
        a3 = thetas[:, 2].contiguous().view(-1, 1, 1)
        a4 = thetas[:, 3].contiguous().view(-1, 1, 1)
        a5 = thetas[:, 4].contiguous().view(-1, 1, 1)
        a6 = thetas[:, 5].contiguous().view(-1, 1, 1)

        xx, yy = normalize_coords(xx, yy, width=width, height=height)
        xq = a1 * xx + a2 * yy + a3
        yq = a4 * xx + a5 * yy + a6
        xq, yq = denormalize_coords(xq, yq, width=width, height=height)
        return xq, yq

    def transform_coords(self, width, height, thetas):
        xx1, yy1 = self._meshgrid(width=width, height=height, device=thetas.device, dtype=thetas.dtype)
        xx, yy = normalize_coords(xx1, yy1, width=width, height=height)

        def _unsqueeze12(u):
            return torch.unsqueeze(torch.unsqueeze(u, dim=1), dim=1)

        a1 = _unsqueeze12(thetas[:, 0])
        a2 = _unsqueeze12(thetas[:, 1])
        a3 = _unsqueeze12(thetas[:, 2])
        a4 = _unsqueeze12(thetas[:, 3])
        a5 = _unsqueeze12(thetas[:, 4])
        a6 = _unsqueeze12(thetas[:, 5])
        #
        z = a1 * a5 - a2 * a4
        b1 = a5 / z
        b2 = - a2 / z
        b4 = - a4 / z
        b5 = a1 / z
        #
        xhat = xx - a3
        yhat = yy - a6
        xq = b1 * xhat + b2 * yhat
        yq = b4 * xhat + b5 * yhat

        xq, yq = denormalize_coords(xq, yq, width=width, height=height)
        return xq, yq

    def find_invalid(self, width, height, thetas):
        x = self._xbounds
        y = self._ybounds
        #
        a1 = torch.unsqueeze(thetas[:, 0], dim=1)
        a2 = torch.unsqueeze(thetas[:, 1], dim=1)
        a3 = torch.unsqueeze(thetas[:, 2], dim=1)
        a4 = torch.unsqueeze(thetas[:, 3], dim=1)
        a5 = torch.unsqueeze(thetas[:, 4], dim=1)
        a6 = torch.unsqueeze(thetas[:, 5], dim=1)
        #
        z = a1 * a5 - a2 * a4
        b1 = a5 / z
        b2 = - a2 / z
        b4 = - a4 / z
        b5 = a1 / z
        #
        xhat = x - a3
        yhat = y - a6
        xq = b1 * xhat + b2 * yhat
        yq = b4 * xhat + b5 * yhat
        xq, yq = denormalize_coords(xq, yq, width=width, height=height)
        #
        invalid = (
                      (xq < 0) | (yq < 0) | (xq >= width) | (yq >= height)
                  ).sum(dim=1, keepdim=True) > 0

        return invalid

    def apply_random_transforms_to_params(self,
                                          theta0,
                                          max_translate,
                                          min_zoom, max_zoom,
                                          min_squeeze, max_squeeze,
                                          min_rotate, max_rotate,
                                          validate_size=None):
        max_translate *= 0.5
        batch_size = theta0.size(0)
        height, width = validate_size

        # collect valid params here
        thetas = torch.zeros_like(theta0)

        zoom = theta0.new(batch_size, 1).zero_()
        squeeze = torch.zeros_like(zoom)
        tx = torch.zeros_like(zoom)
        ty = torch.zeros_like(zoom)
        phi = torch.zeros_like(zoom)
        invalid = torch.ones_like(zoom).byte()

        while invalid.sum() > 0:
            # random sampling
            zoom.uniform_(min_zoom, max_zoom)
            squeeze.uniform_(min_squeeze, max_squeeze)
            tx.uniform_(-max_translate, max_translate)
            ty.uniform_(-max_translate, max_translate)
            phi.uniform_(min_rotate, max_rotate)

            # construct affine parameters
            sx = zoom * squeeze
            sy = zoom / squeeze
            sin_phi = torch.sin(phi)
            cos_phi = torch.cos(phi)
            b1 = cos_phi * sx
            b2 = sin_phi * sy
            b3 = tx
            b4 = - sin_phi * sx
            b5 = cos_phi * sy
            b6 = ty

            theta_transform = torch.cat([b1, b2, b3, b4, b5, b6], dim=1)
            theta_try = apply_transform_to_params(theta0, theta_transform)
            thetas = invalid.float() * theta_try + (1 - invalid.float()) * thetas

            # compute new invalid ones
            invalid = self.find_invalid(width=width, height=height, thetas=thetas)

        # here we should have good thetas within borders
        return thetas

    def transform_image(self, images, thetas):
        batch_size, channels, height, width = images.size()
        xq, yq = self.transform_coords(width=width, height=height, thetas=thetas)
        transformed = self._interp2(images, xq, yq)
        return transformed

    def transform_flow(self, flow, theta1, theta2):
        batch_size, channels, height, width = flow.size()
        u = flow[:, 0, :, :]
        v = flow[:, 1, :, :]

        # inverse transform coords
        x0, y0 = self.inverse_transform_coords(
            width=width, height=height, thetas=theta1)

        x1, y1 = self.inverse_transform_coords(
            width=width, height=height, thetas=theta2, offset_x=u, offset_y=v)

        # subtract and create new flow
        u = x1 - x0
        v = y1 - y0
        new_flow = torch.stack([u, v], dim=1)

        # transform coords
        xq, yq = self.transform_coords(width=width, height=height, thetas=theta1)

        # interp2
        transformed = self._flow_interp2(new_flow, xq, yq)
        return transformed

    def check_out_of_bound(self, flow, occ, batch_size):
        _, _, height, width = flow.size()
        u = flow[:, 0, :, :]
        v = flow[:, 1, :, :]
        xx, yy = self._meshgrid(width=width, height=height, device=flow.device, dtype=flow.dtype)
        xx = torch.unsqueeze(xx, dim=0)
        yy = torch.unsqueeze(yy, dim=0)
        xx = xx.expand(batch_size, -1, -1) + u
        yy = yy.expand(batch_size, -1, -1) + v

        out_of_bound = ((xx < 0) | (yy < 0) | (xx >= width) | (yy >= height)).float().unsqueeze(1)
        occ = torch.clamp(out_of_bound + occ, 0, 1)

        return occ

    def random_crop(self, im1, im2, flo_f, occ1):

        _, _, height, width = im1.size()
        crop_height, crop_width = self._crop

        # get starting positions
        self._x.random_(0, width - crop_width + 1)
        self._y.random_(0, height - crop_height + 1)
        str_x = int(self._x)
        str_y = int(self._y)
        end_x = int(self._x + crop_width)
        end_y = int(self._y + crop_height)

        im1 = im1[:, :, str_y:end_y, str_x:end_x]
        im2 = im2[:, :, str_y:end_y, str_x:end_x]
        flo_f = flo_f[:, :, str_y:end_y, str_x:end_x]
        occ1 = occ1[:, :, str_y:end_y, str_x:end_x]

        return im1, im2, flo_f, occ1

    def forward(self, example_dict):
        im1 = example_dict["input1"]
        im2 = example_dict["input2"]
        flo_f = example_dict["target1"]
        occ1 = example_dict["target_occ1"]

        batch_size = im1.size(0)
        height = im1.size(2)
        width = im1.size(3)

        # identity = no transform
        theta0 = self._identity(batch_size)

        # # global transform
        theta1 = self.apply_random_transforms_to_params(
            theta0,
            max_translate=0.2,
            min_zoom=1.0, max_zoom=1.5,
            min_squeeze=0.86, max_squeeze=1.16,
            min_rotate=-0.2, max_rotate=0.2,
            validate_size=[height, width])

        # relative transform
        theta2 = self.apply_random_transforms_to_params(
            theta1,
            max_translate=0.015,
            min_zoom=0.985, max_zoom=1.015,
            min_squeeze=1.0, max_squeeze=1.0,
            min_rotate=-0.015, max_rotate=0.015,
            validate_size=[height, width])

        # random flip images
        theta1, theta2 = self._random_mirror(theta1, theta2)

        im1 = self.transform_image(im1, theta1)
        im2 = self.transform_image(im2, theta2)
        flo_f = self.transform_flow(flo_f, theta1, theta2)
        occ1 = self.transform_image(occ1, theta1)

        if self._addnoise:
            stddev = np.random.uniform(0.0, 0.04)
            self._noise1.resize_as_(im1)
            self._noise2.resize_as_(im2)
            self._noise1.normal_(std=stddev)
            self._noise2.normal_(std=stddev)
            im1 += self._noise1
            im2 += self._noise2
            im1.clamp_(0.0, 1.0)
            im2.clamp_(0.0, 1.0)

        if self._crop is not None:
            im1, im2, flo_f, occ1 = self.random_crop(im1, im2, flo_f, occ1)

        occ1 = self.check_out_of_bound(flo_f, occ1, batch_size)

        example_dict["input1"] = im1
        example_dict["input2"] = im2
        example_dict["target1"] = flo_f
        example_dict["target_occ1"] = occ1

        return example_dict


class RandomAffineFlowOccKITTI(nn.Module):
    def __init__(self, args, addnoise=True, crop=None):
        super(RandomAffineFlowOccKITTI, self).__init__()
        self._args = args
        self._interp2 = Interp2(clamp=False)
        self._flow_interp2 = Interp2MaskBinary(clamp=False)
        self._meshgrid = Meshgrid()
        self._identity = _IdentityParams()
        self._random_mirror = RandomMirror(vertical=False)
        self._addnoise = addnoise
        self._crop = crop

        self.register_buffer("_noise1", torch.FloatTensor())
        self.register_buffer("_noise2", torch.FloatTensor())
        self.register_buffer("_xbounds", torch.FloatTensor([-1, -1, 1, 1]))
        self.register_buffer("_ybounds", torch.FloatTensor([-1, 1, -1, 1]))
        self.register_buffer("_x", torch.IntTensor(1))
        self.register_buffer("_y", torch.IntTensor(1))

    def inverse_transform_coords(self, width, height, thetas, offset_x=None, offset_y=None):
        xx, yy = self._meshgrid(width=width, height=height, device=thetas.device, dtype=thetas.dtype)

        xx = torch.unsqueeze(xx, dim=0).float()
        yy = torch.unsqueeze(yy, dim=0).float()

        if offset_x is not None:
            xx = xx + offset_x
        if offset_y is not None:
            yy = yy + offset_y

        a1 = thetas[:, 0].contiguous().view(-1, 1, 1)
        a2 = thetas[:, 1].contiguous().view(-1, 1, 1)
        a3 = thetas[:, 2].contiguous().view(-1, 1, 1)
        a4 = thetas[:, 3].contiguous().view(-1, 1, 1)
        a5 = thetas[:, 4].contiguous().view(-1, 1, 1)
        a6 = thetas[:, 5].contiguous().view(-1, 1, 1)

        xx, yy = normalize_coords(xx, yy, width=width, height=height)
        xq = a1 * xx + a2 * yy + a3
        yq = a4 * xx + a5 * yy + a6
        xq, yq = denormalize_coords(xq, yq, width=width, height=height)
        return xq, yq

    def transform_coords(self, width, height, thetas):
        xx1, yy1 = self._meshgrid(width=width, height=height, device=thetas.device, dtype=thetas.dtype)
        xx, yy = normalize_coords(xx1, yy1, width=width, height=height)

        def _unsqueeze12(u):
            return torch.unsqueeze(torch.unsqueeze(u, dim=1), dim=1)

        a1 = _unsqueeze12(thetas[:, 0])
        a2 = _unsqueeze12(thetas[:, 1])
        a3 = _unsqueeze12(thetas[:, 2])
        a4 = _unsqueeze12(thetas[:, 3])
        a5 = _unsqueeze12(thetas[:, 4])
        a6 = _unsqueeze12(thetas[:, 5])
        #
        z = a1 * a5 - a2 * a4
        b1 = a5 / z
        b2 = - a2 / z
        b4 = - a4 / z
        b5 = a1 / z
        #
        xhat = xx - a3
        yhat = yy - a6
        xq = b1 * xhat + b2 * yhat
        yq = b4 * xhat + b5 * yhat

        xq, yq = denormalize_coords(xq, yq, width=width, height=height)
        return xq, yq

    def find_invalid(self, width, height, thetas):
        x = self._xbounds
        y = self._ybounds
        #
        a1 = torch.unsqueeze(thetas[:, 0], dim=1)
        a2 = torch.unsqueeze(thetas[:, 1], dim=1)
        a3 = torch.unsqueeze(thetas[:, 2], dim=1)
        a4 = torch.unsqueeze(thetas[:, 3], dim=1)
        a5 = torch.unsqueeze(thetas[:, 4], dim=1)
        a6 = torch.unsqueeze(thetas[:, 5], dim=1)
        #
        z = a1 * a5 - a2 * a4
        b1 = a5 / z
        b2 = - a2 / z
        b4 = - a4 / z
        b5 = a1 / z
        #
        xhat = x - a3
        yhat = y - a6
        xq = b1 * xhat + b2 * yhat
        yq = b4 * xhat + b5 * yhat
        xq, yq = denormalize_coords(xq, yq, width=width, height=height)
        #
        invalid = (
                      (xq < 0) | (yq < 0) | (xq >= width) | (yq >= height)
                  ).sum(dim=1, keepdim=True) > 0

        return invalid

    def apply_random_transforms_to_params(self,
                                          theta0,
                                          max_translate,
                                          min_zoom, max_zoom,
                                          min_squeeze, max_squeeze,
                                          min_rotate, max_rotate,
                                          validate_size=None):
        max_translate *= 0.5
        batch_size = theta0.size(0)
        height, width = validate_size

        # collect valid params here
        thetas = torch.zeros_like(theta0)

        zoom = theta0.new(batch_size, 1).zero_()
        squeeze = torch.zeros_like(zoom)
        tx = torch.zeros_like(zoom)
        ty = torch.zeros_like(zoom)
        phi = torch.zeros_like(zoom)
        invalid = torch.ones_like(zoom).byte()

        while invalid.sum() > 0:
            # random sampling
            zoom.uniform_(min_zoom, max_zoom)
            squeeze.uniform_(min_squeeze, max_squeeze)
            tx.uniform_(-max_translate, max_translate)
            ty.uniform_(-max_translate, max_translate)
            phi.uniform_(min_rotate, max_rotate)

            # construct affine parameters
            sx = zoom * squeeze
            sy = zoom / squeeze
            sin_phi = torch.sin(phi)
            cos_phi = torch.cos(phi)
            b1 = cos_phi * sx
            b2 = sin_phi * sy
            b3 = tx
            b4 = - sin_phi * sx
            b5 = cos_phi * sy
            b6 = ty

            theta_transform = torch.cat([b1, b2, b3, b4, b5, b6], dim=1)
            theta_try = apply_transform_to_params(theta0, theta_transform)
            thetas = invalid.float() * theta_try + (1 - invalid.float()) * thetas

            # compute new invalid ones
            invalid = self.find_invalid(width=width, height=height, thetas=thetas)

        # here we should have good thetas within borders
        return thetas

    def transform_image(self, images, thetas):
        batch_size, channels, height, width = images.size()
        xq, yq = self.transform_coords(width=width, height=height, thetas=thetas)
        transformed = self._interp2(images, xq, yq)
        return transformed

    def transform_flow(self, flow, theta1, theta2, valid_mask):
        batch_size, channels, height, width = flow.size()
        u = flow[:, 0, :, :]
        v = flow[:, 1, :, :]

        # inverse transform coords
        x0, y0 = self.inverse_transform_coords(
            width=width, height=height, thetas=theta1)

        x1, y1 = self.inverse_transform_coords(
            width=width, height=height, thetas=theta2, offset_x=u, offset_y=v)

        # subtract and create new flow
        u = x1 - x0
        v = y1 - y0
        new_flow = torch.stack([u, v], dim=1)

        # transform coords
        xq, yq = self.transform_coords(width=width, height=height, thetas=theta1)

        # interp2
        # transformed = self._interp2(new_flow, xq, yq)
        transformed, valid_mask = self._flow_interp2(new_flow, xq, yq, valid_mask)
        return transformed, valid_mask

    def check_out_of_bound(self, flow, occ, batch_size):
        _, _, height, width = flow.size()
        u = flow[:, 0, :, :]
        v = flow[:, 1, :, :]
        xx, yy = self._meshgrid(width=width, height=height, device=flow.device, dtype=flow.dtype)
        xx = torch.unsqueeze(xx, dim=0).float()
        yy = torch.unsqueeze(yy, dim=0).float()
        xx = xx.expand(batch_size, -1, -1) + u
        yy = yy.expand(batch_size, -1, -1) + v

        out_of_bound = ((xx < 0) | (yy < 0) | (xx >= width) | (yy >= height)).float().unsqueeze(1)
        occ = torch.clamp(out_of_bound + occ, 0, 1)

        return occ

    def random_crop(self, im1, im2, flo_f, valid_mask):

        _, _, height, width = im1.size()
        crop_height, crop_width = self._crop

        # get starting positions
        self._x.random_(0, width - crop_width + 1)
        self._y.random_(0, height - crop_height + 1)
        str_x = int(self._x)
        str_y = int(self._y)
        end_x = int(self._x + crop_width)
        end_y = int(self._y + crop_height)

        im1 = im1[:, :, str_y:end_y, str_x:end_x]
        im2 = im2[:, :, str_y:end_y, str_x:end_x]
        flo_f = flo_f[:, :, str_y:end_y, str_x:end_x]
        valid_mask = valid_mask[:, :, str_y:end_y, str_x:end_x]

        return im1, im2, flo_f, valid_mask

    def forward(self, example_dict):
        im1 = example_dict["input1"]
        im2 = example_dict["input2"]
        flo_f = example_dict["target1"]
        valid_mask = example_dict["input_valid"]

        batch_size = im1.size(0)
        height = im1.size(2)
        width = im1.size(3)

        # identity = no transform
        theta0 = self._identity(batch_size)

        # # global transform
        theta1 = self.apply_random_transforms_to_params(
            theta0,
            max_translate=0.04,
            min_zoom=0.98, max_zoom=1.02,
            min_squeeze=1.0, max_squeeze=1.0,
            min_rotate=-0.01, max_rotate=0.01,
            validate_size=[height, width])

        # relative transform
        theta2 = self.apply_random_transforms_to_params(
            theta1,
            max_translate=0.005,
            min_zoom=0.99, max_zoom=1.01,
            min_squeeze=1.0, max_squeeze=1.0,
            min_rotate=-0.01, max_rotate=0.01,
            validate_size=[height, width])

        # random flip images
        theta1, theta2 = self._random_mirror(theta1, theta2)

        im1 = self.transform_image(im1, theta1)
        im2 = self.transform_image(im2, theta2)
        flo_f, valid_mask = self.transform_flow(flo_f, theta1, theta2, valid_mask)


        if self._addnoise:
            stddev = np.random.uniform(0.0, 0.04)
            self._noise1.resize_as_(im1)
            self._noise2.resize_as_(im2)
            self._noise1.normal_(std=stddev)
            self._noise2.normal_(std=stddev)
            im1 += self._noise1
            im2 += self._noise2
            im1.clamp_(0.0, 1.0)
            im2.clamp_(0.0, 1.0)

        if self._crop is not None:
            im1, im2, flo_f, valid_mask = self.random_crop(im1, im2, flo_f, valid_mask)

        example_dict["input1"] = im1
        example_dict["input2"] = im2
        example_dict["target1"] = flo_f
        example_dict["input_valid"] = valid_mask

        return example_dict


================================================
FILE: commandline.py
================================================
## Portions of Code from, copyright 2018 Jochen Gast

from __future__ import absolute_import, division, print_function

import argparse
import colorama
import inspect
import os
import sys
import torch

import datasets
import losses
import models
import augmentations
import tools
import logger
import logging
import optim


def _get_type_from_arg(arg):
    if isinstance(arg, bool):
        return tools.str2bool
    else:
        return type(arg)


def _add_arguments_for_module(parser,
                              module,
                              name,
                              default_class,
                              add_class_argument=True,   # whether to add class choice as argument
                              include_classes="*",
                              exclude_classes=[],
                              exclude_params=["self","args"],
                              param_defaults={},          # allows to overwrite any default param
                              forced_default_types={},    # allows to set types for known arguments
                              unknown_default_types={}):  # allows to set types for unknown arguments

    # -------------------------------------------------------------------------
    # Determine possible choices from class names in module, possibly apply include/exclude filters
    # -------------------------------------------------------------------------
    module_dict = tools.module_classes_to_dict(
        module, include_classes=include_classes, exclude_classes=exclude_classes)

    # -------------------------------------------------------------------------
    # Parse known arguments to determine choice for argument name
    # -------------------------------------------------------------------------
    if add_class_argument:
        parser.add_argument(
            "--%s" % name, type=str, default=default_class, choices=module_dict.keys())
        known_args = parser.parse_known_args(sys.argv[1:])[0]
    else:
        # build a temporary parser, and do not add the class as argument
        tmp_parser = argparse.ArgumentParser()
        tmp_parser.add_argument(
            "--%s" % name, type=str, default=default_class, choices=module_dict.keys())
        known_args = tmp_parser.parse_known_args(sys.argv[1:])[0]

    class_name = vars(known_args)[name]

    # -------------------------------------------------------------------------
    # If class is None, there is no point in trying to parse further arguments
    # -------------------------------------------------------------------------
    if class_name is None:
        return

    # -------------------------------------------------------------------------
    # Get constructor of that argument choice
    # -------------------------------------------------------------------------
    class_constructor = module_dict[class_name]

    # -------------------------------------------------------------------------
    # Determine constructor argument names and defaults
    # -------------------------------------------------------------------------
    try:
        argspec = inspect.getargspec(class_constructor.__init__)
        argspec_defaults = argspec.defaults if argspec.defaults is not None else []
        full_args = argspec.args
        default_args_dict = dict(zip(argspec.args[-len(argspec_defaults):], argspec_defaults))
    except TypeError:
        print(argspec)
        print(argspec.defaults)
        raise ValueError("unknown_default_types should be adjusted for module: '%s.py'" % name)

    # -------------------------------------------------------------------------
    # Add sub_arguments
    # -------------------------------------------------------------------------
    for argname in full_args:

        # ---------------------------------------------------------------------
        # Skip
        # ---------------------------------------------------------------------
        if argname in exclude_params:
            continue

        # ---------------------------------------------------------------------
        # Sub argument name
        # ---------------------------------------------------------------------
        sub_arg_name = "%s_%s" % (name, argname)

        # ---------------------------------------------------------------------
        # If a default argument is given, take that one
        # ---------------------------------------------------------------------
        if argname in param_defaults.keys():
            parser.add_argument(
                "--%s" % sub_arg_name,
                type=_get_type_from_arg(param_defaults[argname]),
                default=param_defaults[argname])

        # ---------------------------------------------------------------------
        # If a default parameter can be inferred from the module, pick that one
        # ---------------------------------------------------------------------
        elif argname in default_args_dict.keys():

            # -----------------------------------------------------------------
            # Check for forced default types
            # -----------------------------------------------------------------
            if argname in forced_default_types.keys():
                argtype = forced_default_types[argname]
            else:
                argtype = _get_type_from_arg(default_args_dict[argname])
            parser.add_argument(
                "--%s" % sub_arg_name, type=argtype, default=default_args_dict[argname])

        # ---------------------------------------------------------------------
        # Take from the unkowns list
        # ---------------------------------------------------------------------
        elif argname in unknown_default_types.keys():
            parser.add_argument("--%s" % sub_arg_name, type=unknown_default_types[argname])

        else:
            raise ValueError(
                "Do not know how to handle argument '%s' for class '%s'" % (argname, name))


def _add_special_arguments(parser):
    # -------------------------------------------------------------------------
    # Known arguments so far
    # -------------------------------------------------------------------------
    known_args = vars(parser.parse_known_args(sys.argv[1:])[0])

    # -------------------------------------------------------------------------
    # Add special arguments for training
    # -------------------------------------------------------------------------
    training_loss = known_args["training_loss"]
    if training_loss is not None:
        parser.add_argument("--training_key", type=str, default="total_loss")

    # -------------------------------------------------------------------------
    # Add special arguments for validation
    # -------------------------------------------------------------------------
    validation_loss = known_args["validation_loss"]
    if validation_loss is not None:
        parser.add_argument("--validation_key", type=str, default="total_loss")
        parser.add_argument("--validation_key_minimize", type=tools.str2bool, default=True)

    # -------------------------------------------------------------------------
    # Add special arguments for checkpoints
    # -------------------------------------------------------------------------
    checkpoint = known_args["checkpoint"]
    if checkpoint is not None:
        parser.add_argument(
            "--checkpoint_mode", type=str, default="resume_from_latest",
            choices=["resume_from_latest", "resume_from_best"])

        parser.add_argument(
            "--checkpoint_include_params", type=tools.str2list, default="[*]")
        parser.add_argument(
            "--checkpoint_exclude_params", type=tools.str2list, default="[]")

    # -------------------------------------------------------------------------
    # Add special arguments for optimizer groups
    # -------------------------------------------------------------------------
    parser.add_argument("--optimizer_group", action="append", type=tools.str2dict, default=None)


def _parse_arguments():

    # -------------------------------------------------------------------------
    # Argument parser and shortcut function to add arguments
    # -------------------------------------------------------------------------
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # -------------------------------------------------------------------------
    # Standard arguments
    # -------------------------------------------------------------------------
    add("--batch_size", type=int, default=1)
    add("--batch_size_val", type=int, default=1)
    add("--checkpoint", type=tools.str2str_or_none, default=None)
    add("--cuda", type=tools.str2bool, default=True)
    add("--evaluation", type=tools.str2bool, default=False)
    add("--name", default="run", type=str)
    add("--num_workers", type=int, default=4)
    add("--save", "-s", default="/tmp/work", type=str)
    add("--seed", type=int, default=1)
    add("--start_epoch", type=int, default=1)
    add("--total_epochs", type=int, default=10)
    add("--save_result_path_name", default="", type=str)
    add("--save_result_img", type=tools.str2bool, default=False)
    add("--save_result_occ", type=tools.str2bool, default=False)
    add("--save_result_flo", type=tools.str2bool, default=False)
    add("--save_result_png", type=tools.str2bool, default=False)
    add("--save_result_bidirection", type=tools.str2bool, default=False)
    add("--num_iters", type=int, default=1)

    # -------------------------------------------------------------------------
    # Arguments inferred from losses
    # -------------------------------------------------------------------------
    _add_arguments_for_module(
        parser,
        losses,
        name="training_loss",
        default_class=None,
        exclude_classes=["_*", "Variable"],
        exclude_params=["self","args"])

    _add_arguments_for_module(
        parser,
        losses,
        name="validation_loss",
        default_class=None,
        exclude_classes=["_*", "Variable"],
        exclude_params=["self","args"])

    # -------------------------------------------------------------------------
    # Arguments inferred from models
    # -------------------------------------------------------------------------
    _add_arguments_for_module(
        parser,
        models,
        name="model",
        default_class="FlowNet1S",
        exclude_classes=["_*", "Variable"],
        exclude_params=["self","args"])

    # -------------------------------------------------------------------------
    # Arguments inferred from augmentations for training
    # -------------------------------------------------------------------------
    _add_arguments_for_module(
        parser,
        augmentations,
        name="training_augmentation",
        default_class=None,
        exclude_classes=["_*"],
        exclude_params=["self","args"],
        forced_default_types={"crop": tools.str2intlist})

    # -------------------------------------------------------------------------
    # Arguments inferred from augmentations for validation
    # -------------------------------------------------------------------------
    _add_arguments_for_module(
        parser,
        augmentations,
        name="validation_augmentation",
        default_class=None,
        exclude_classes=["_*"],
        exclude_params=["self","args"])

    # -------------------------------------------------------------------------
    # Arguments inferred from datasets for training
    # -------------------------------------------------------------------------
    _add_arguments_for_module(
        parser,
        datasets,
        name="training_dataset",
        default_class=None,
        exclude_params=["self", "args", "is_cropped"],
        exclude_classes=["_*"],
        unknown_default_types={"root": str})

    # -------------------------------------------------------------------------
    # Arguments inferred from datasets for validation
    # -------------------------------------------------------------------------
    _add_arguments_for_module(
        parser,
        datasets,
        name="validation_dataset",
        default_class=None,
        exclude_params=["self", "args", "is_cropped"],
        exclude_classes=["_*"],
        unknown_default_types={"root": str})

    # -------------------------------------------------------------------------
    # Arguments inferred from PyTorch optimizers
    # -------------------------------------------------------------------------
    _add_arguments_for_module(
        parser,
        optim,
        name="optimizer",
        default_class="Adam",
        exclude_classes=["_*","Optimizer", "constructor"],
        exclude_params=["self", "args", "params"],
        forced_default_types={"lr": float,
                              "momentum": float,
                              "dampening": float,
                              "weight_decay": float,
                              "nesterov": tools.str2bool})

    # -------------------------------------------------------------------------
    # Arguments inferred from PyTorch lr schedulers
    # -------------------------------------------------------------------------
    _add_arguments_for_module(
        parser,
        torch.optim.lr_scheduler,
        name="lr_scheduler",
        default_class=None,
        exclude_classes=["_*","Optimizer"],
        exclude_params=["self", "args", "optimizer"],
        unknown_default_types={"T_max": int,
                               "lr_lambda": str,
                               "step_size": int,
                               "milestones": tools.str2intlist,
                               "gamma": float})

    # -------------------------------------------------------------------------
    # Special arguments
    # -------------------------------------------------------------------------
    _add_special_arguments(parser)

    # -------------------------------------------------------------------------
    # Parse arguments
    # -------------------------------------------------------------------------
    args = parser.parse_args()

    # -------------------------------------------------------------------------
    # Parse default arguments from a dummy commandline not specifying any args
    # -------------------------------------------------------------------------
    defaults = vars(parser.parse_known_args(['--dummy'])[0])

    # -------------------------------------------------------------------------
    # Consistency checks
    # -------------------------------------------------------------------------
    args.cuda = args.cuda and torch.cuda.is_available()

    return args, defaults


def postprocess_args(args):

    # ----------------------------------------------------------------------------
    # Get appropriate class constructors from modules
    # ----------------------------------------------------------------------------
    args.model_class = tools.module_classes_to_dict(models)[args.model]

    if args.optimizer is not None:
        optimizer_classes = tools.module_classes_to_dict(optim)
        args.optimizer_class = optimizer_classes[args.optimizer]

    if args.training_loss is not None:
        loss_classes = tools.module_classes_to_dict(losses)
        args.training_loss_class = loss_classes[args.training_loss]

    if args.validation_loss is not None:
        loss_classes = tools.module_classes_to_dict(losses)
        args.validation_loss_class = loss_classes[args.validation_loss]

    if args.lr_scheduler is not None:
        scheduler_classes = tools.module_classes_to_dict(torch.optim.lr_scheduler)
        args.lr_scheduler_class = scheduler_classes[args.lr_scheduler]

    if args.training_dataset is not None:
        dataset_classes = tools.module_classes_to_dict(datasets)
        args.training_dataset_class = dataset_classes[args.training_dataset]

    if args.validation_dataset is not None:
        dataset_classes = tools.module_classes_to_dict(datasets)
        args.validation_dataset_class = dataset_classes[args.validation_dataset]

    if args.training_augmentation is not None:
        augmentation_classes = tools.module_classes_to_dict(augmentations)
        args.training_augmentation_class = augmentation_classes[args.training_augmentation]

    if args.validation_augmentation is not None:
        augmentation_classes = tools.module_classes_to_dict(augmentations)
        args.validation_augmentation_class = augmentation_classes[args.validation_augmentation]

    return args


def setup_logging_and_parse_arguments(blocktitle):
    # ----------------------------------------------------------------------------
    # Get parse commandline and default arguments
    # ----------------------------------------------------------------------------
    args, defaults = _parse_arguments()

    # ----------------------------------------------------------------------------
    # Setup logbook before everything else
    # ----------------------------------------------------------------------------
    logger.configure_logging(os.path.join(args.save, 'logbook.txt'))

    # ----------------------------------------------------------------------------
    # Write arguments to file, as txt
    # ----------------------------------------------------------------------------
    tools.write_dictionary_to_file(
        sorted(vars(args).items()),
        filename=os.path.join(args.save, 'args.txt'))

    # ----------------------------------------------------------------------------
    # Log arguments
    # ----------------------------------------------------------------------------
    with logger.LoggingBlock(blocktitle, emph=True):
        for argument, value in sorted(vars(args).items()):
            reset = colorama.Style.RESET_ALL
            color = reset if value == defaults[argument] else colorama.Fore.CYAN
            logging.info('{}{}: {}{}'.format(color, argument, value, reset))

    # ----------------------------------------------------------------------------
    # Postprocess
    # ----------------------------------------------------------------------------
    args = postprocess_args(args)

    return args


================================================
FILE: configuration.py
================================================
## Portions of Code from, copyright 2018 Jochen Gast

from __future__ import absolute_import, division, print_function

import os
import torch
from torch import nn
import numpy as np
from torch.utils.data import DataLoader
import logger
import tools
import logging
import shutil
import random
import fnmatch

# ---------------------------------------------------
# Class that contains both the network model and loss
# ---------------------------------------------------
class ModelAndLoss(nn.Module):
    def __init__(self, args, model, training_loss, evaluation_loss=None):
        super(ModelAndLoss, self).__init__()
        self._model = model
        self._training_loss = training_loss
        self._evaluation_loss = evaluation_loss

    @property
    def training_loss(self):
        return self._training_loss

    @property
    def evaluation_loss(self):
        return self._evaluation_loss

    @property
    def model(self):
        return self._model

    def num_parameters(self):
        return sum([p.data.nelement() if p.requires_grad else 0 for p in self.parameters()])

    # -------------------------------------------------------------
    # Note: We merge inputs and targets into a single dictionary !
    # -------------------------------------------------------------
    def forward(self, example_dict):
        # -------------------------------------
        # Run forward pass
        # -------------------------------------
        output_dict = self._model(example_dict)

        # -------------------------------------
        # Compute losses
        # -------------------------------------
        if self.training:
            loss_dict = self._training_loss(output_dict, example_dict)
        else:
            loss_dict = self._evaluation_loss(output_dict, example_dict)

        # -------------------------------------
        # Return losses and outputs
        # -------------------------------------
        return loss_dict, output_dict


def configure_runtime_augmentations(args):
    with logger.LoggingBlock("Runtime Augmentations", emph=True):

        training_augmentation = None
        validation_augmentation = None

        # ----------------------------------------------------
        # Training Augmentation
        # ----------------------------------------------------
        if args.training_augmentation is not None:
            kwargs = tools.kwargs_from_args(args, "training_augmentation")
            logging.info("training_augmentation: %s" % args.training_augmentation)
            for param, default in sorted(kwargs.items()):
                logging.info("  %s: %s" % (param, default))
            kwargs["args"] = args
            training_augmentation = tools.instance_from_kwargs(
                args.training_augmentation_class, kwargs)
            if args.cuda:
                training_augmentation = training_augmentation.cuda()

        else:
            logging.info("training_augmentation: None")

        # ----------------------------------------------------
        # Validation Augmentation
        # ----------------------------------------------------
        if args.validation_augmentation is not None:
            kwargs = tools.kwargs_from_args(args, "validation_augmentation")
            logging.info("validation_augmentation: %s" % args.validation_augmentation)
            for param, default in sorted(kwargs.items()):
                logging.info("  %s: %s" % (param, default))
            kwargs["args"] = args
            validation_augmentation = tools.instance_from_kwargs(
                args.validation_augmentation_class, kwargs)
            if args.cuda:
                validation_augmentation = validation_augmentation.cuda()

        else:
            logging.info("validation_augmentation: None")

    return training_augmentation, validation_augmentation


def configure_model_and_loss(args):

    # ----------------------------------------------------
    # Dynamically load model and loss class with parameters
    # passed in via "--model_[param]=[value]" or "--loss_[param]=[value]" arguments
    # ----------------------------------------------------
    with logger.LoggingBlock("Model and Loss", emph=True):

        # ----------------------------------------------------
        # Model
        # ----------------------------------------------------
        kwargs = tools.kwargs_from_args(args, "model")
        kwargs["args"] = args
        model = tools.instance_from_kwargs(args.model_class, kwargs)

        # ----------------------------------------------------
        # Training loss
        # ----------------------------------------------------
        training_loss = None
        if args.training_loss is not None:
            kwargs = tools.kwargs_from_args(args, "training_loss")
            kwargs["args"] = args
            training_loss = tools.instance_from_kwargs(args.training_loss_class, kwargs)

        # ----------------------------------------------------
        # Validation loss
        # ----------------------------------------------------
        validation_loss = None
        if args.validation_loss is not None:
            kwargs = tools.kwargs_from_args(args, "validation_loss")
            kwargs["args"] = args
            validation_loss = tools.instance_from_kwargs(args.validation_loss_class, kwargs)

        # ----------------------------------------------------
        # Model and loss
        # ----------------------------------------------------
        model_and_loss = ModelAndLoss(args, model, training_loss, validation_loss)

        # -----------------------------------------------------------
        # If Cuda, transfer model to Cuda and wrap with DataParallel.
        # -----------------------------------------------------------
        if args.cuda:
            model_and_loss = model_and_loss.cuda()

        # ---------------------------------------------------------------
        # Report some network statistics
        # ---------------------------------------------------------------
        logging.info("Batch Size: %i" % args.batch_size)
        logging.info("GPGPU: Cuda") if args.cuda else logging.info("GPGPU: off")
        logging.info("Network: %s" % args.model)
        logging.info("Number of parameters: %i" % tools.x2module(model_and_loss).num_parameters())
        if training_loss is not None:
            logging.info("Training Key: %s" % args.training_key)
            logging.info("Training Loss: %s" % args.training_loss)
        if validation_loss is not None:
            logging.info("Validation Key: %s" % args.validation_key)
            logging.info("Validation Loss: %s" % args.validation_loss)

    return model_and_loss


def configure_random_seed(args):
    with logger.LoggingBlock("Random Seeds", emph=True):
        # python
        seed = args.seed
        random.seed(seed)
        logging.info("Python seed: %i" % seed)
        # numpy
        seed += 1
        np.random.seed(seed)
        logging.info("Numpy seed: %i" % seed)
        # torch
        seed += 1
        torch.manual_seed(seed)
        logging.info("Torch CPU seed: %i" % seed)
        # torch cuda
        seed += 1
        torch.cuda.manual_seed(seed)
        logging.info("Torch CUDA seed: %i" % seed)


# --------------------------------------------------------------------------
# Checkpoint loader/saver.
# --------------------------------------------------------------------------
class CheckpointSaver:
    def __init__(self,
                 prefix="checkpoint",
                 latest_postfix="_latest",
                 best_postfix="_best",
                 model_key="state_dict",
                 extension=".ckpt"):

        self._prefix = prefix
        self._model_key = model_key
        self._latest_postfix = latest_postfix
        self._best_postfix = best_postfix
        self._extension = extension

    # the purpose of rewriting the loading function is we sometimes want to
    # initialize parameters in modules without knowing the dimensions at runtime
    #
    # This function here will resize these parameters to whatever size required.
    
    def _load_state_dict_into_module(self, state_dict, module, strict=True):
        own_state = module.state_dict()

        for name, param in state_dict.items():
            if name in own_state:
                if isinstance(param, nn.Parameter):
                    # backwards compatibility for serialized parameters
                    param = param.data
                try:
                    own_state[name].resize_as_(param)
                    own_state[name].copy_(param)
                except Exception:
                    raise RuntimeError('While copying the parameter named {}, '
                                       'whose dimensions in the model are {} and '
                                       'whose dimensions in the checkpoint are {}.'
                                       .format(name, own_state[name].size(), param.size()))
            elif strict:
                raise KeyError('unexpected key "{}" in state_dict'
                               .format(name))
        if strict:
            missing = set(own_state.keys()) - set(state_dict.keys())
            if len(missing) > 0:
                raise KeyError('missing keys in state_dict: "{}"'.format(missing))

    def restore(self, filename, model_and_loss, include_params="*", exclude_params=()):
        # -----------------------------------------------------------------------------------------
        # Make sure file exists
        # -----------------------------------------------------------------------------------------
        if not os.path.isfile(filename):
            logging.info("Could not find checkpoint file '%s'!" % filename)
            quit()

        # -----------------------------------------------------------------------------------------
        # Load checkpoint from file including the state_dict
        # -----------------------------------------------------------------------------------------
        checkpoint_with_state = torch.load(filename)

        # -----------------------------------------------------------------------------------------
        # Load filtered state dictionary
        # -----------------------------------------------------------------------------------------
        state_dict = checkpoint_with_state[self._model_key]
        restore_keys = tools.filter_list_of_strings(
            state_dict.keys(),
            include=include_params,
            exclude=exclude_params)
        state_dict = {key: value for key, value in state_dict.items() if key in restore_keys}
        self._load_state_dict_into_module(state_dict, model_and_loss)
        # logging.info("  Restore keys:")
        # for key in restore_keys:
        #     logging.info("    %s" % key)

        # -----------------------------------------------------------------------------------------
        # Get checkpoint statistics without the state dict
        # -----------------------------------------------------------------------------------------
        checkpoint_stats = {
            key: value for key, value in checkpoint_with_state.items() if key != self._model_key
        }

        return checkpoint_stats, filename

    def restore_latest(self, directory, model_and_loss, include_params="*", exclude_params=()):
        latest_checkpoint_filename = os.path.join(
            directory, self._prefix + self._latest_postfix + self._extension)
        return self.restore(latest_checkpoint_filename, model_and_loss, include_params, exclude_params)

    def restore_best(self, directory, model_and_loss, include_params="*", exclude_params=()):
        best_checkpoint_filename = os.path.join(
            directory, self._prefix + self._best_postfix + self._extension)
        return self.restore(best_checkpoint_filename, model_and_loss, include_params, exclude_params)

    def save_latest(self, directory, model_and_loss, stats_dict, store_as_best=False):
        # -----------------------------------------------------------------------------------------
        # Make sure directory exists
        # -----------------------------------------------------------------------------------------
        tools.ensure_dir(directory)

        # -----------------------------------------------------------------------------------------
        # Save
        # -----------------------------------------------------------------------------------------
        save_dict = dict(stats_dict)
        save_dict[self._model_key] = model_and_loss.state_dict()

        latest_checkpoint_filename = os.path.join(
            directory, self._prefix + self._latest_postfix + self._extension)

        latest_statistics_filename = os.path.join(
            directory, self._prefix + self._latest_postfix + ".json")

        torch.save(save_dict, latest_checkpoint_filename)
        tools.write_json(data_dict=stats_dict, filename=latest_statistics_filename)

        # -----------------------------------------------------------------------------------------
        # Possibly store as best
        # -----------------------------------------------------------------------------------------
        if store_as_best:
            best_checkpoint_filename = os.path.join(
                directory, self._prefix + self._best_postfix + self._extension)

            best_statistics_filename = os.path.join(
                directory, self._prefix + self._best_postfix + ".json")

            logging.info("Saved checkpoint as best model..")
            shutil.copyfile(latest_checkpoint_filename, best_checkpoint_filename)
            shutil.copyfile(latest_statistics_filename, best_statistics_filename)


def configure_checkpoint_saver(args, model_and_loss):
    with logger.LoggingBlock("Checkpoint", emph=True):
        checkpoint_saver = CheckpointSaver()
        checkpoint_stats = None

        if args.checkpoint is None:
            logging.info("No checkpoint given.")
            logging.info("Starting from scratch with random initialization.")

        elif os.path.isfile(args.checkpoint):
            checkpoint_stats, filename = checkpoint_saver.restore(
                filename=args.checkpoint,
                model_and_loss=model_and_loss,
                include_params=args.checkpoint_include_params,
                exclude_params=args.checkpoint_exclude_params)

        elif os.path.isdir(args.checkpoint):
            if args.checkpoint_mode in ["resume_from_best"]:
                logging.info("Loading best checkpoint in %s" % args.checkpoint)
                checkpoint_stats, filename = checkpoint_saver.restore_best(
                    directory=args.checkpoint,
                    model_and_loss=model_and_loss,
                    include_params=args.checkpoint_include_params,
                    exclude_params=args.checkpoint_exclude_params)

            elif args.checkpoint_mode in ["resume_from_latest"]:
                logging.info("Loading latest checkpoint in %s" % args.checkpoint)
                checkpoint_stats, filename = checkpoint_saver.restore_latest(
                    directory=args.checkpoint,
                    model_and_loss=model_and_loss,
                    include_params=args.checkpoint_include_params,
                    exclude_params=args.checkpoint_exclude_params)
            else:
                logging.info("Unknown checkpoint_restore '%s' given!" % args.checkpoint_restore)
                quit()
        else:
            logging.info("Could not find checkpoint file or directory '%s'" % args.checkpoint)
            quit()

    return checkpoint_saver, checkpoint_stats


# -------------------------------------------------------------------------------------------------
# Configure data loading
# -------------------------------------------------------------------------------------------------
def configure_data_loaders(args):
    with logger.LoggingBlock("Datasets", emph=True):

        def _sizes_to_str(value):
            if np.isscalar(value):
                return '[1L]'
            else:
                return ' '.join([str([d for d in value.size()])])

        def _log_statistics(dataset, prefix, name):
            with logger.LoggingBlock("%s Dataset: %s" % (prefix, name)):
                example_dict = dataset[0]  # get sizes from first dataset example
                for key, value in sorted(example_dict.items()):
                    if key in ["index", "basename"]:  # no need to display these
                        continue
                    if isinstance(value, str):
                        logging.info("{}: {}".format(key, value))
                    else:
                        logging.info("%s: %s" % (key, _sizes_to_str(value)))
                logging.info("num_examples: %i" % len(dataset))

        # -----------------------------------------------------------------------------------------
        # GPU parameters -- turning off pin_memory? for resolving the deadlock?
        # -----------------------------------------------------------------------------------------
        gpuargs = {"num_workers": args.num_workers, "pin_memory": False} if args.cuda else {}

        train_loader = None
        validation_loader = None
        inference_loader = None

        # -----------------------------------------------------------------------------------------
        # Training dataset
        # -----------------------------------------------------------------------------------------
        if args.training_dataset is not None:

            # ----------------------------------------------
            # Figure out training_dataset arguments
            # ----------------------------------------------
            kwargs = tools.kwargs_from_args(args, "training_dataset")
            kwargs["is_cropped"] = True
            kwargs["args"] = args

            # ----------------------------------------------
            # Create training dataset
            # ----------------------------------------------
            train_dataset = tools.instance_from_kwargs(args.training_dataset_class, kwargs)

            # ----------------------------------------------
            # Create training loader
            # ----------------------------------------------
            train_loader = DataLoader(
                train_dataset,
                batch_size=args.batch_size,
                shuffle=True,
                drop_last=False,
                **gpuargs)

            _log_statistics(train_dataset, prefix="Training", name=args.training_dataset)

        # -----------------------------------------------------------------------------------------
        # Validation dataset
        # -----------------------------------------------------------------------------------------
        if args.validation_dataset is not None:

            # ----------------------------------------------
            # Figure out validation_dataset arguments
            # ----------------------------------------------
            kwargs = tools.kwargs_from_args(args, "validation_dataset")
            kwargs["is_cropped"] = True
            kwargs["args"] = args

            # ----------------------------------------------
            # Create validation dataset
            # ----------------------------------------------
            validation_dataset = tools.instance_from_kwargs(args.validation_dataset_class, kwargs)

            # ----------------------------------------------
            # Create validation loader
            # ----------------------------------------------
            validation_loader = DataLoader(
                validation_dataset,
                batch_size=args.batch_size_val,
                shuffle=False,
                drop_last=False,
                **gpuargs)

            _log_statistics(validation_dataset, prefix="Validation", name=args.validation_dataset)

    return train_loader, validation_loader, inference_loader


# ------------------------------------------------------------
# Generator for trainable parameters by pattern matching
# ------------------------------------------------------------
def _print_trainable_params(model_and_loss, match="*"):
    sum = 0
    for name, p in model_and_loss.named_parameters():
        if fnmatch.fnmatch(name, match):
            if p.requires_grad:
                logging.info(name)
                logging.info(str(p.numel()))
                print(name)
                print(p.numel())
                sum += p.numel()
    logging.info(str(sum))

def _generate_trainable_params(model_and_loss, match="*"):
    for name, p in model_and_loss.named_parameters():
        if fnmatch.fnmatch(name, match):
            if p.requires_grad:
                yield p


def _param_names_and_trainable_generator(model_and_loss, match="*"):
    names = []
    for name, p in model_and_loss.named_parameters():
        if fnmatch.fnmatch(name, match):
            if p.requires_grad:
                names.append(name)

    return names, _generate_trainable_params(model_and_loss, match=match)


# -------------------------------------------------------------------------------------------------
# Build optimizer:
# -------------------------------------------------------------------------------------------------
def configure_optimizer(args, model_and_loss):
    optimizer = None
    with logger.LoggingBlock("Optimizer", emph=True):
        if args.optimizer is not None:
            if model_and_loss.num_parameters() == 0:
                logging.info("No trainable parameters detected.")
                logging.info("Setting optimizer to None.")
            else:
                logging.info(args.optimizer)

                # -------------------------------------------
                # Figure out all optimizer arguments
                # -------------------------------------------
                all_kwargs = tools.kwargs_from_args(args, "optimizer")

                # -------------------------------------------
                # Get the split of param groups
                # -------------------------------------------
                kwargs_without_groups = {
                    key: value for key,value in all_kwargs.items() if key != "group"
                }
                param_groups = all_kwargs["group"]

                # ----------------------------------------------------------------------
                # Print arguments (without groups)
                # ----------------------------------------------------------------------
                for param, default in sorted(kwargs_without_groups.items()):
                    logging.info("%s: %s" % (param, default))

                # ----------------------------------------------------------------------
                # Construct actual optimizer params
                # ----------------------------------------------------------------------
                kwargs = dict(kwargs_without_groups)
                if param_groups is None:
                    # ---------------------------------------------------------
                    # Add all trainable parameters if there is no param groups
                    # ---------------------------------------------------------
                    all_trainable_parameters = _generate_trainable_params(model_and_loss)
                    kwargs["params"] = all_trainable_parameters
                else:
                    # -------------------------------------------
                    # Add list of parameter groups instead
                    # -------------------------------------------
                    trainable_parameter_groups = []
                    dnames, dparams = _param_names_and_trainable_generator(model_and_loss)
                    dnames = set(dnames)
                    dparams = set(list(dparams))
                    with logger.LoggingBlock("parameter_groups:"):
                        for group in param_groups:
                            #  log group settings
                            group_match = group["params"]
                            group_args = {
                                key: value for key, value in group.items() if key != "params"
                            }

                            with logger.LoggingBlock("%s: %s" % (group_match, group_args)):
                                # retrieve parameters by matching name
                                gnames, gparams = _param_names_and_trainable_generator(
                                    model_and_loss, match=group_match)
                                # log all names affected
                                for n in sorted(gnames):
                                    logging.info(n)
                                # set generator for group
                                group_args["params"] = gparams
                                # append parameter group
                                trainable_parameter_groups.append(group_args)
                                # update remaining trainable parameters
                                dnames -= set(gnames)
                                dparams -= set(list(gparams))

                        # append default parameter group
                        trainable_parameter_groups.append({"params": list(dparams)})
                        # and log its parameter names
                        with logger.LoggingBlock("default:"):
                            for dname in sorted(dnames):
                                logging.info(dname)

                    # set params in optimizer kwargs
                    kwargs["params"] = trainable_parameter_groups

                # -------------------------------------------
                # Create optimizer instance
                # -------------------------------------------
                optimizer = tools.instance_from_kwargs(args.optimizer_class, kwargs)

    return optimizer


# -------------------------------------------------------------------------------------------------
# Configure learning rate scheduler
# -------------------------------------------------------------------------------------------------
def configure_lr_scheduler(args, optimizer):
    lr_scheduler = None

    with logger.LoggingBlock("Learning Rate Scheduler", emph=True):
        logging.info("class: %s" % args.lr_scheduler)

        if args.lr_scheduler is not None:

            # ----------------------------------------------
            # Figure out lr_scheduler arguments
            # ----------------------------------------------
            kwargs = tools.kwargs_from_args(args, "lr_scheduler")

            # -------------------------------------------
            # Print arguments
            # -------------------------------------------
            for param, default in sorted(kwargs.items()):
                logging.info("%s: %s" % (param, default))

            # -------------------------------------------
            # Add optimizer
            # -------------------------------------------
            kwargs["optimizer"] = optimizer

            # -------------------------------------------
            # Create lr_scheduler instance
            # -------------------------------------------
            lr_scheduler = tools.instance_from_kwargs(args.lr_scheduler_class, kwargs)

    return lr_scheduler


================================================
FILE: datasets/__init__.py
================================================
from . import flyingchairs
from . import flyingchairsOcc
from . import sintel
from . import flyingThings3D
from . import kitti_combined
from . import sintel

## FlyingChairs
FlyingChairsTrain = flyingchairs.FlyingChairsTrain
FlyingChairsValid = flyingchairs.FlyingChairsValid
FlyingChairsFull = flyingchairs.FlyingChairsFull

## Our custom FlyingChairs + Occ
FlyingChairsOccTrain = flyingchairsOcc.FlyingChairsOccTrain
FlyingChairsOccValid = flyingchairsOcc.FlyingChairsOccValid
FlyingChairsOccFull = flyingchairsOcc.FlyingChairsOccFull


## FlyingThings3D_subset
FlyingThings3dFinalTrain = flyingThings3D.FlyingThings3dFinalTrain
FlyingThings3dFinalTest = flyingThings3D.FlyingThings3dFinalTest
FlyingThings3dCleanTrain = flyingThings3D.FlyingThings3dCleanTrain
FlyingThings3dCleanTest = flyingThings3D.FlyingThings3dCleanTest


## Sintel
SintelTestClean = sintel.SintelTestClean
SintelTestFinal = sintel.SintelTestFinal

SintelTrainingCombFull = sintel.SintelTrainingCombFull
SintelTrainingCombTrain = sintel.SintelTrainingCombTrain
SintelTrainingCombValid = sintel.SintelTrainingCombValid

SintelTrainingCleanFull = sintel.SintelTrainingCleanFull
SintelTrainingCleanTrain = sintel.SintelTrainingCleanTrain
SintelTrainingCleanValid = sintel.SintelTrainingCleanValid

SintelTrainingFinalFull = sintel.SintelTrainingFinalFull
SintelTrainingFinalTrain = sintel.SintelTrainingFinalTrain
SintelTrainingFinalValid = sintel.SintelTrainingFinalValid


## KITTI Optical Flow 2012 + 2015
KittiCombTrain = kitti_combined.KittiCombTrain
KittiCombVal = kitti_combined.KittiCombVal
KittiCombFull = kitti_combined.KittiCombFull

KittiComb2012Train = kitti_combined.KittiComb2012Train
KittiComb2012Val = kitti_combined.KittiComb2012Val
KittiComb2012Full = kitti_combined.KittiComb2012Full
KittiComb2012Test = kitti_combined.KittiComb2012Test

KittiComb2015Train = kitti_combined.KittiComb2015Train
KittiComb2015Val = kitti_combined.KittiComb2015Val
KittiComb2015Full = kitti_combined.KittiComb2015Full
KittiComb2015Test = kitti_combined.KittiComb2015Test

================================================
FILE: datasets/common.py
================================================
## Portions of Code from, copyright 2018 Jochen Gast

from __future__ import absolute_import, division, print_function

import torch
import numpy as np
import skimage.io as io


def numpy2torch(array):
    assert(isinstance(array, np.ndarray))
    if array.ndim == 3:
        array = np.transpose(array, (2, 0, 1))
    else:
        array = np.expand_dims(array, axis=0)
    return torch.from_numpy(array.copy()).float()


def read_flo_as_float32(filename):
    with open(filename, 'rb') as file:
        magic = np.fromfile(file, np.float32, count=1)
        assert(202021.25 == magic), "Magic number incorrect. Invalid .flo file"
        w = np.fromfile(file, np.int32, count=1)[0]
        h = np.fromfile(file, np.int32, count=1)[0]
        data = np.fromfile(file, np.float32, count=2*h*w)
    data2D = np.resize(data, (h, w, 2))
    return data2D


def read_occ_image_as_float32(filename):
    occ = io.imread(filename).astype(np.float32) / np.float32(255.0)
    if occ.ndim == 3:
        occ = occ[:, :, 0]
    return occ


def read_image_as_float32(filename):
    return io.imread(filename).astype(np.float32) / np.float32(255.0)


def read_image_as_byte(filename):
    return io.imread(filename)


================================================
FILE: datasets/flyingThings3D.py
================================================
from __future__ import absolute_import, division, print_function

import os
import torch.utils.data as data
from glob import glob

from torchvision import transforms as vision_transforms

from . import transforms
from . import common

import numpy as np


def fillingInNaN(flow):
    h, w, c = flow.shape
    indices = np.argwhere(np.isnan(flow))
    neighbors = [[-1, 0], [1, 0], [0, -1], [0, 1]]
    for ii, idx in enumerate(indices):
        sum_sample = 0
        count = 0
        for jj in range(0, len(neighbors) - 1):
            hh = idx[0] + neighbors[jj][0]
            ww = idx[1] + neighbors[jj][1]
            if hh < 0 or hh >= h:
                continue
            if ww < 0 or ww >= w:
                continue
            sample_flow = flow[hh, ww, idx[2]]
            if np.isnan(sample_flow):
                continue
            sum_sample += sample_flow
            count += 1
        if count is 0:
            print('FATAL ERROR: no sample')
        flow[idx[0], idx[1], idx[2]] = sum_sample / count

    return flow


class FlyingThings3d(data.Dataset):
    def __init__(self,
                 args,
                 images_root,
                 flow_root,
                 occ_root,
                 photometric_augmentations=False):

        self._args = args
        if not os.path.isdir(images_root):
            raise ValueError("Image directory '%s' not found!")
        if flow_root is not None and not os.path.isdir(flow_root):
            raise ValueError("Flow directory '%s' not found!")
        if occ_root is not None and not os.path.isdir(occ_root):
            raise ValueError("Occ directory '%s' not found!")

        if flow_root is not None:
            flow_f_filenames = sorted(glob(os.path.join(flow_root, "into_future/*.flo")))
            flow_b_filenames = sorted(glob(os.path.join(flow_root, "into_past/*.flo")))

        if occ_root is not None:
            occ1_filenames = sorted(glob(os.path.join(occ_root, "into_future/*.png")))
            occ2_filenames = sorted(glob(os.path.join(occ_root, "into_past/*.png")))

        all_img_filenames = sorted(glob(os.path.join(images_root, "*.png")))

        self._image_list = []
        self._flow_list = [] if flow_root is not None else None
        self._occ_list = [] if occ_root is not None else None

        assert len(all_img_filenames) != 0
        assert len(flow_f_filenames) != 0
        assert len(flow_b_filenames) != 0
        assert len(occ1_filenames) != 0
        assert len(occ2_filenames) != 0

        ## path definition
        path_flow_f = os.path.join(flow_root, "into_future")
        path_flow_b = os.path.join(flow_root, "into_past")
        path_occ_f = os.path.join(occ_root, "into_future")
        path_occ_b = os.path.join(occ_root, "into_past")

        # ----------------------------------------------------------
        # Save list of actual filenames for inputs and flows
        # ----------------------------------------------------------

        for ii in range(0, len(flow_f_filenames)):

            flo_f = flow_f_filenames[ii]

            idx_f = os.path.splitext(os.path.basename(flo_f))[0]
            idx_b = str(int(idx_f) + 1).zfill(7)

            flo_b = os.path.join(path_flow_b, idx_b + ".flo")

            im1 = os.path.join(images_root, idx_f + ".png")
            im2 = os.path.join(images_root, idx_b + ".png")
            occ1 = os.path.join(path_occ_f, idx_f + ".png")
            occ2 = os.path.join(path_occ_b, idx_b + ".png")

            if not os.path.isfile(flo_f) or not os.path.isfile(flo_b) or not os.path.isfile(im1) or not os.path.isfile(
                    im2) or not os.path.isfile(occ1) or not os.path.isfile(occ2):
                continue

            self._image_list += [[im1, im2]]
            self._flow_list += [[flo_f, flo_b]]
            self._occ_list += [[occ1, occ2]]

        self._size = len(self._image_list)

        assert len(self._image_list) == len(self._flow_list)
        assert len(self._occ_list) == len(self._flow_list)
        assert len(self._image_list) != 0

        # ----------------------------------------------------------
        # photometric_augmentations
        # ----------------------------------------------------------
        if photometric_augmentations:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> PIL
                vision_transforms.ToPILImage(),
                # PIL -> PIL : random hsv and contrast
                vision_transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
                # PIL -> FloatTensor
                vision_transforms.transforms.ToTensor(),
                transforms.RandomGamma(min_gamma=0.7, max_gamma=1.5, clip_image=True),
            ], from_numpy=True, to_numpy=False)

        else:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> FloatTensor
                vision_transforms.transforms.ToTensor(),
            ], from_numpy=True, to_numpy=False)

    def __getitem__(self, index):
        index = index % self._size

        im1_filename = self._image_list[index][0]
        im2_filename = self._image_list[index][1]
        flo_f_filename = self._flow_list[index][0]
        flo_b_filename = self._flow_list[index][1]
        occ1_filename = self._occ_list[index][0]
        occ2_filename = self._occ_list[index][1]

        # read float32 images and flow
        im1_np0 = common.read_image_as_byte(im1_filename)
        im2_np0 = common.read_image_as_byte(im2_filename)
        flo_f_np0 = common.read_flo_as_float32(flo_f_filename)
        flo_b_np0 = common.read_flo_as_float32(flo_b_filename)
        occ1_np0 = common.read_occ_image_as_float32(occ1_filename)
        occ2_np0 = common.read_occ_image_as_float32(occ2_filename)

        # temp - check isnan
        if np.any(np.isnan(flo_f_np0)):
            flo_f_np0 = fillingInNaN(flo_f_np0)

        if np.any(np.isnan(flo_b_np0)):
            flo_b_np0 = fillingInNaN(flo_b_np0)

        # possibly apply photometric transformations
        im1, im2 = self._photometric_transform(im1_np0, im2_np0)

        # convert flow to FloatTensor
        flo_f = common.numpy2torch(flo_f_np0)
        flo_b = common.numpy2torch(flo_b_np0)

        # convert occ to FloatTensor
        occ1 = common.numpy2torch(occ1_np0)
        occ2 = common.numpy2torch(occ2_np0)

        # example filename
        basename = os.path.basename(im1_filename)[:5]

        example_dict = {
            "input1": im1,
            "input2": im2,
            "target1": flo_f,
            "target2": flo_b,
            "target_occ1": occ1,
            "target_occ2": occ2,
            "index": index,
            "basename": basename
        }

        return example_dict

    def __len__(self):
        return self._size


class FlyingThings3dFinalTrain(FlyingThings3d):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=True):
        images_root = os.path.join(root, "frames_finalpass")
        flow_root = os.path.join(root, "optical_flow")
        occ_root = os.path.join(root, "occlusion")
        super(FlyingThings3dFinalTrain, self).__init__(
            args,
            images_root=images_root,
            flow_root=flow_root,
            occ_root=occ_root,
            photometric_augmentations=photometric_augmentations)


class FlyingThings3dFinalTest(FlyingThings3d):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=False):
        images_root = os.path.join(root, "frames_finalpass")
        flow_root = os.path.join(root, "optical_flow")
        occ_root = os.path.join(root, "occlusion")
        super(FlyingThings3dFinalTest, self).__init__(
            args,
            images_root=images_root,
            flow_root=flow_root,
            occ_root=occ_root,
            photometric_augmentations=photometric_augmentations)


class FlyingThings3dCleanTrain(FlyingThings3d):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=True):
        images_root = os.path.join(root, "train", "image_clean", "left")
        flow_root = os.path.join(root, "train", "flow", "left")
        occ_root = os.path.join(root, "train", "flow_occlusions", "left")
        super(FlyingThings3dCleanTrain, self).__init__(
            args,
            images_root=images_root,
            flow_root=flow_root,
            occ_root=occ_root,
            photometric_augmentations=photometric_augmentations)


class FlyingThings3dCleanTest(FlyingThings3d):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=False):
        images_root = os.path.join(root, "frames_cleanpass")
        flow_root = os.path.join(root, "optical_flow")
        occ_root = os.path.join(root, "occlusion")
        super(FlyingThings3dCleanTest, self).__init__(
            args,
            images_root=images_root,
            flow_root=flow_root,
            occ_root=occ_root,
            photometric_augmentations=photometric_augmentations)


================================================
FILE: datasets/flyingchairs.py
================================================
from __future__ import absolute_import, division, print_function

import os
import torch.utils.data as data
from glob import glob

from torchvision import transforms as vision_transforms

from . import transforms
from . import common


VALIDATE_INDICES = [
    5, 17, 42, 45, 58, 62, 96, 111, 117, 120, 121, 131, 132,
    152, 160, 248, 263, 264, 291, 293, 295, 299, 316, 320, 336,
    337, 343, 358, 399, 401, 429, 438, 468, 476, 494, 509, 528,
    531, 572, 581, 583, 588, 593, 681, 688, 696, 714, 767, 786,
    810, 825, 836, 841, 883, 917, 937, 942, 970, 974, 980, 1016,
    1043, 1064, 1118, 1121, 1133, 1153, 1155, 1158, 1159, 1173,
    1187, 1219, 1237, 1238, 1259, 1266, 1278, 1296, 1354, 1378,
    1387, 1494, 1508, 1518, 1574, 1601, 1614, 1668, 1673, 1699,
    1712, 1714, 1737, 1841, 1872, 1879, 1901, 1921, 1934, 1961,
    1967, 1978, 2018, 2030, 2039, 2043, 2061, 2113, 2204, 2216,
    2236, 2250, 2274, 2292, 2310, 2342, 2359, 2374, 2382, 2399,
    2415, 2419, 2483, 2502, 2504, 2576, 2589, 2590, 2622, 2624,
    2636, 2651, 2655, 2658, 2659, 2664, 2672, 2706, 2707, 2709,
    2725, 2732, 2761, 2827, 2864, 2866, 2905, 2922, 2929, 2966,
    2972, 2993, 3010, 3025, 3031, 3040, 3041, 3070, 3113, 3124,
    3129, 3137, 3141, 3157, 3183, 3206, 3219, 3247, 3253, 3272,
    3276, 3321, 3328, 3333, 3338, 3341, 3346, 3351, 3396, 3419,
    3430, 3433, 3448, 3455, 3463, 3503, 3526, 3529, 3537, 3555,
    3577, 3584, 3591, 3594, 3597, 3603, 3613, 3615, 3670, 3676,
    3678, 3697, 3723, 3728, 3734, 3745, 3750, 3752, 3779, 3782,
    3813, 3817, 3819, 3854, 3885, 3944, 3947, 3970, 3985, 4011,
    4022, 4071, 4075, 4132, 4158, 4167, 4190, 4194, 4207, 4246,
    4249, 4298, 4307, 4317, 4318, 4319, 4320, 4382, 4399, 4401,
    4407, 4416, 4423, 4484, 4491, 4493, 4517, 4525, 4538, 4578,
    4606, 4609, 4620, 4623, 4637, 4646, 4662, 4668, 4716, 4739,
    4747, 4770, 4774, 4776, 4785, 4800, 4845, 4863, 4891, 4904,
    4922, 4925, 4956, 4963, 4964, 4994, 5011, 5019, 5036, 5038,
    5041, 5055, 5118, 5122, 5130, 5162, 5164, 5178, 5196, 5227,
    5266, 5270, 5273, 5279, 5299, 5310, 5314, 5363, 5375, 5384,
    5393, 5414, 5417, 5433, 5448, 5494, 5505, 5509, 5525, 5566,
    5581, 5602, 5609, 5620, 5653, 5670, 5678, 5690, 5700, 5703,
    5724, 5752, 5765, 5803, 5811, 5860, 5881, 5895, 5912, 5915,
    5940, 5952, 5966, 5977, 5988, 6007, 6037, 6061, 6069, 6080,
    6111, 6127, 6146, 6161, 6166, 6168, 6178, 6182, 6190, 6220,
    6235, 6253, 6270, 6343, 6372, 6379, 6410, 6411, 6442, 6453,
    6481, 6498, 6500, 6509, 6532, 6541, 6543, 6560, 6576, 6580,
    6594, 6595, 6609, 6625, 6629, 6644, 6658, 6673, 6680, 6698,
    6699, 6702, 6705, 6741, 6759, 6785, 6792, 6794, 6809, 6810,
    6830, 6838, 6869, 6871, 6889, 6925, 6995, 7003, 7026, 7029,
    7080, 7082, 7097, 7102, 7116, 7165, 7200, 7232, 7271, 7282,
    7324, 7333, 7335, 7372, 7387, 7407, 7472, 7474, 7482, 7489,
    7499, 7516, 7533, 7536, 7566, 7620, 7654, 7691, 7704, 7722,
    7746, 7750, 7773, 7806, 7821, 7827, 7851, 7873, 7880, 7884,
    7904, 7912, 7948, 7964, 7965, 7984, 7989, 7992, 8035, 8050,
    8074, 8091, 8094, 8113, 8116, 8151, 8159, 8171, 8179, 8194,
    8195, 8239, 8263, 8290, 8295, 8312, 8367, 8374, 8387, 8407,
    8437, 8439, 8518, 8556, 8588, 8597, 8601, 8651, 8657, 8723,
    8759, 8763, 8785, 8802, 8813, 8826, 8854, 8856, 8866, 8918,
    8922, 8923, 8932, 8958, 8967, 9003, 9018, 9078, 9095, 9104,
    9112, 9129, 9147, 9170, 9171, 9197, 9200, 9249, 9253, 9270,
    9282, 9288, 9295, 9321, 9323, 9324, 9347, 9399, 9403, 9417,
    9426, 9427, 9439, 9468, 9486, 9496, 9511, 9516, 9518, 9529,
    9557, 9563, 9564, 9584, 9586, 9591, 9599, 9600, 9601, 9632,
    9654, 9667, 9678, 9696, 9716, 9723, 9740, 9820, 9824, 9825,
    9828, 9863, 9866, 9868, 9889, 9929, 9938, 9953, 9967, 10019,
    10020, 10025, 10059, 10111, 10118, 10125, 10174, 10194,
    10201, 10202, 10220, 10221, 10226, 10242, 10250, 10276,
    10295, 10302, 10305, 10327, 10351, 10360, 10369, 10393,
    10407, 10438, 10455, 10463, 10465, 10470, 10478, 10503,
    10508, 10509, 10809, 11080, 11331, 11607, 11610, 11864,
    12390, 12393, 12396, 12399, 12671, 12921, 12930, 13178,
    13453, 13717, 14499, 14517, 14775, 15297, 15556, 15834,
    15839, 16126, 16127, 16386, 16633, 16644, 16651, 17166,
    17169, 17958, 17959, 17962, 18224, 21176, 21180, 21190,
    21802, 21803, 21806, 22584, 22857, 22858, 22866]


class FlyingChairs(data.Dataset):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=False,
                 dstype="train"):

        self._args = args

        # -------------------------------------------------------------
        # filenames for all input images and target flows
        # -------------------------------------------------------------
        image_filenames = sorted( glob( os.path.join(root, "*.ppm")) )
        flow_filenames = sorted( glob( os.path.join(root, "*.flo")) )
        assert (len(image_filenames)/2 == len(flow_filenames))
        num_flows = len(flow_filenames)

        # -------------------------------------------------------------
        # Remove invalid validation indices
        # -------------------------------------------------------------
        validate_indices = [x for x in VALIDATE_INDICES if x in range(num_flows)]

        # ----------------------------------------------------------
        # Construct list of indices for training/validation
        # ----------------------------------------------------------
        list_of_indices = None
        if dstype == "train":
            list_of_indices = [x for x in range(num_flows) if x not in validate_indices]
        elif dstype == "valid":
            list_of_indices = validate_indices
        elif dstype == "full":
            list_of_indices = range(num_flows)
        else:
            raise ValueError("FlyingChairs: dstype '%s' unknown!", dstype)


        # ----------------------------------------------------------
        # Save list of actual filenames for inputs and flows
        # ----------------------------------------------------------
        self._image_list = []
        self._flow_list = []
        for i in list_of_indices:
            flo = flow_filenames[i]
            im1 = image_filenames[2*i]
            im2 = image_filenames[2*i + 1]
            self._image_list += [ [ im1, im2 ] ]
            self._flow_list += [ flo ]
        self._size = len(self._image_list)
        assert len(self._image_list) == len(self._flow_list)

        # ----------------------------------------------------------
        # photometric_augmentations
        # ----------------------------------------------------------
        if photometric_augmentations:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> PIL
                vision_transforms.ToPILImage(),
                # PIL -> PIL : random hsv and contrast
                vision_transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
                # PIL -> FloatTensor
                vision_transforms.transforms.ToTensor(),
                transforms.RandomGamma(min_gamma=0.7, max_gamma=1.5, clip_image=True),
            ], from_numpy=True, to_numpy=False)
        else:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> FloatTensor
                vision_transforms.transforms.ToTensor(),
            ], from_numpy=True, to_numpy=False)

    def __getitem__(self, index):
        index = index % self._size

        im1_filename = self._image_list[index][0]
        im2_filename = self._image_list[index][1]
        flo_filename = self._flow_list[index]

        # read float32 images and flow
        im1_np0 = common.read_image_as_byte(im1_filename)
        im2_np0 = common.read_image_as_byte(im2_filename)
        flo_np0 = common.read_flo_as_float32(flo_filename)

        # possibly apply photometric transformations
        im1, im2 = self._photometric_transform(im1_np0, im2_np0)

        # convert flow to FloatTensor
        flo = common.numpy2torch(flo_np0)

        # target_occ: initialized by zero (not used)
        target_occ = common.numpy2torch(common.read_occ_image_as_float32(im1_filename)) * 0

        # example filename
        basename = os.path.basename(im1_filename)[:5]

        example_dict = {
            "input1": im1,
            "input2": im2,
            "target1": flo,
            "target_occ1": target_occ,
            "index": index,
            "basename": basename
        }

        return example_dict

    def __len__(self):
        return self._size


class FlyingChairsTrain(FlyingChairs):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=True):
        super(FlyingChairsTrain, self).__init__(
            args,
            root=root,
            photometric_augmentations=photometric_augmentations,
            dstype="train")


class FlyingChairsValid(FlyingChairs):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=False):
        super(FlyingChairsValid, self).__init__(
            args,
            root=root,
            photometric_augmentations=photometric_augmentations,
            dstype="valid")


class FlyingChairsFull(FlyingChairs):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=False):
        super(FlyingChairsFull, self).__init__(
            args,
            root=root,
            photometric_augmentations=photometric_augmentations,
            dstype="full")


================================================
FILE: datasets/flyingchairsOcc.py
================================================
from __future__ import absolute_import, division, print_function

import os
import torch.utils.data as data
from glob import glob

from torchvision import transforms as vision_transforms

from . import transforms
from . import common


VALIDATE_INDICES = [
    5, 17, 42, 45, 58, 62, 96, 111, 117, 120, 121, 131, 132,
    152, 160, 248, 263, 264, 291, 293, 295, 299, 316, 320, 336,
    337, 343, 358, 399, 401, 429, 438, 468, 476, 494, 509, 528,
    531, 572, 581, 583, 588, 593, 681, 688, 696, 714, 767, 786,
    810, 825, 836, 841, 883, 917, 937, 942, 970, 974, 980, 1016,
    1043, 1064, 1118, 1121, 1133, 1153, 1155, 1158, 1159, 1173,
    1187, 1219, 1237, 1238, 1259, 1266, 1278, 1296, 1354, 1378,
    1387, 1494, 1508, 1518, 1574, 1601, 1614, 1668, 1673, 1699,
    1712, 1714, 1737, 1841, 1872, 1879, 1901, 1921, 1934, 1961,
    1967, 1978, 2018, 2030, 2039, 2043, 2061, 2113, 2204, 2216,
    2236, 2250, 2274, 2292, 2310, 2342, 2359, 2374, 2382, 2399,
    2415, 2419, 2483, 2502, 2504, 2576, 2589, 2590, 2622, 2624,
    2636, 2651, 2655, 2658, 2659, 2664, 2672, 2706, 2707, 2709,
    2725, 2732, 2761, 2827, 2864, 2866, 2905, 2922, 2929, 2966,
    2972, 2993, 3010, 3025, 3031, 3040, 3041, 3070, 3113, 3124,
    3129, 3137, 3141, 3157, 3183, 3206, 3219, 3247, 3253, 3272,
    3276, 3321, 3328, 3333, 3338, 3341, 3346, 3351, 3396, 3419,
    3430, 3433, 3448, 3455, 3463, 3503, 3526, 3529, 3537, 3555,
    3577, 3584, 3591, 3594, 3597, 3603, 3613, 3615, 3670, 3676,
    3678, 3697, 3723, 3728, 3734, 3745, 3750, 3752, 3779, 3782,
    3813, 3817, 3819, 3854, 3885, 3944, 3947, 3970, 3985, 4011,
    4022, 4071, 4075, 4132, 4158, 4167, 4190, 4194, 4207, 4246,
    4249, 4298, 4307, 4317, 4318, 4319, 4320, 4382, 4399, 4401,
    4407, 4416, 4423, 4484, 4491, 4493, 4517, 4525, 4538, 4578,
    4606, 4609, 4620, 4623, 4637, 4646, 4662, 4668, 4716, 4739,
    4747, 4770, 4774, 4776, 4785, 4800, 4845, 4863, 4891, 4904,
    4922, 4925, 4956, 4963, 4964, 4994, 5011, 5019, 5036, 5038,
    5041, 5055, 5118, 5122, 5130, 5162, 5164, 5178, 5196, 5227,
    5266, 5270, 5273, 5279, 5299, 5310, 5314, 5363, 5375, 5384,
    5393, 5414, 5417, 5433, 5448, 5494, 5505, 5509, 5525, 5566,
    5581, 5602, 5609, 5620, 5653, 5670, 5678, 5690, 5700, 5703,
    5724, 5752, 5765, 5803, 5811, 5860, 5881, 5895, 5912, 5915,
    5940, 5952, 5966, 5977, 5988, 6007, 6037, 6061, 6069, 6080,
    6111, 6127, 6146, 6161, 6166, 6168, 6178, 6182, 6190, 6220,
    6235, 6253, 6270, 6343, 6372, 6379, 6410, 6411, 6442, 6453,
    6481, 6498, 6500, 6509, 6532, 6541, 6543, 6560, 6576, 6580,
    6594, 6595, 6609, 6625, 6629, 6644, 6658, 6673, 6680, 6698,
    6699, 6702, 6705, 6741, 6759, 6785, 6792, 6794, 6809, 6810,
    6830, 6838, 6869, 6871, 6889, 6925, 6995, 7003, 7026, 7029,
    7080, 7082, 7097, 7102, 7116, 7165, 7200, 7232, 7271, 7282,
    7324, 7333, 7335, 7372, 7387, 7407, 7472, 7474, 7482, 7489,
    7499, 7516, 7533, 7536, 7566, 7620, 7654, 7691, 7704, 7722,
    7746, 7750, 7773, 7806, 7821, 7827, 7851, 7873, 7880, 7884,
    7904, 7912, 7948, 7964, 7965, 7984, 7989, 7992, 8035, 8050,
    8074, 8091, 8094, 8113, 8116, 8151, 8159, 8171, 8179, 8194,
    8195, 8239, 8263, 8290, 8295, 8312, 8367, 8374, 8387, 8407,
    8437, 8439, 8518, 8556, 8588, 8597, 8601, 8651, 8657, 8723,
    8759, 8763, 8785, 8802, 8813, 8826, 8854, 8856, 8866, 8918,
    8922, 8923, 8932, 8958, 8967, 9003, 9018, 9078, 9095, 9104,
    9112, 9129, 9147, 9170, 9171, 9197, 9200, 9249, 9253, 9270,
    9282, 9288, 9295, 9321, 9323, 9324, 9347, 9399, 9403, 9417,
    9426, 9427, 9439, 9468, 9486, 9496, 9511, 9516, 9518, 9529,
    9557, 9563, 9564, 9584, 9586, 9591, 9599, 9600, 9601, 9632,
    9654, 9667, 9678, 9696, 9716, 9723, 9740, 9820, 9824, 9825,
    9828, 9863, 9866, 9868, 9889, 9929, 9938, 9953, 9967, 10019,
    10020, 10025, 10059, 10111, 10118, 10125, 10174, 10194,
    10201, 10202, 10220, 10221, 10226, 10242, 10250, 10276,
    10295, 10302, 10305, 10327, 10351, 10360, 10369, 10393,
    10407, 10438, 10455, 10463, 10465, 10470, 10478, 10503,
    10508, 10509, 10809, 11080, 11331, 11607, 11610, 11864,
    12390, 12393, 12396, 12399, 12671, 12921, 12930, 13178,
    13453, 13717, 14499, 14517, 14775, 15297, 15556, 15834,
    15839, 16126, 16127, 16386, 16633, 16644, 16651, 17166,
    17169, 17958, 17959, 17962, 18224, 21176, 21180, 21190,
    21802, 21803, 21806, 22584, 22857, 22858, 22866]


class FlyingChairsOcc(data.Dataset):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=False,
                 dstype="train"):

        self._args = args

        # -------------------------------------------------------------
        # filenames for all input images and target flows
        # -------------------------------------------------------------
        image1_filenames = sorted(glob(os.path.join(root, "*_img1.png")))
        image2_filenames = sorted(glob(os.path.join(root, "*_img2.png")))
        occ1_filenames = sorted(glob(os.path.join(root, "*_occ1.png")))
        occ2_filenames = sorted(glob(os.path.join(root, "*_occ2.png")))
        flow_f_filenames = sorted(glob(os.path.join(root, "*_flow.flo")))
        flow_b_filenames = sorted(glob(os.path.join(root, "*_flow_b.flo")))
        assert (len(image1_filenames) == len(image2_filenames))
        assert (len(image2_filenames) == len(occ1_filenames))
        assert (len(occ1_filenames) == len(occ2_filenames))
        assert (len(occ2_filenames) == len(flow_f_filenames))
        assert (len(flow_f_filenames) == len(flow_b_filenames))

        num_flows = len(flow_f_filenames)

        # -------------------------------------------------------------
        # Remove invalid validation indices
        # -------------------------------------------------------------
        validate_indices = [x for x in VALIDATE_INDICES if x in range(num_flows)]

        # ----------------------------------------------------------
        # Construct list of indices for training/validation
        # ----------------------------------------------------------
        list_of_indices = None
        if dstype == "train":
            list_of_indices = [x for x in range(num_flows) if x not in validate_indices]
        elif dstype == "valid":
            list_of_indices = validate_indices
        elif dstype == "full":
            list_of_indices = range(num_flows)
        else:
            raise ValueError("FlyingChairs: dstype '%s' unknown!", dstype)

        # ----------------------------------------------------------
        # Save list of actual filenames for inputs and flows
        # ----------------------------------------------------------
        self._image_list = []
        self._flow_list = []
        self._occ_list = []
        for i in list_of_indices:
            flo_f = flow_f_filenames[i]
            flo_b = flow_b_filenames[i]
            im1 = image1_filenames[i]
            im2 = image2_filenames[i]
            occ1 = occ1_filenames[i]
            occ2 = occ2_filenames[i]
            self._image_list += [[im1, im2]]
            self._flow_list += [[flo_f, flo_b]]
            self._occ_list += [[occ1, occ2]]
        self._size = len(self._image_list)
        assert len(self._image_list) == len(self._flow_list)
        assert len(self._occ_list) == len(self._flow_list)

        # ----------------------------------------------------------
        # photometric_augmentations
        # ----------------------------------------------------------
        if photometric_augmentations:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> PIL
                vision_transforms.ToPILImage(),
                # PIL -> PIL : random hsv and contrast
                vision_transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
                # PIL -> FloatTensor
                vision_transforms.transforms.ToTensor(),
                transforms.RandomGamma(min_gamma=0.7, max_gamma=1.5, clip_image=True),
            ], from_numpy=True, to_numpy=False)

        else:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> FloatTensor
                vision_transforms.transforms.ToTensor(),
            ], from_numpy=True, to_numpy=False)

    def __getitem__(self, index):
        index = index % self._size

        im1_filename = self._image_list[index][0]
        im2_filename = self._image_list[index][1]
        flo_f_filename = self._flow_list[index][0]
        flo_b_filename = self._flow_list[index][1]
        occ1_filename = self._occ_list[index][0]
        occ2_filename = self._occ_list[index][1]

        # read float32 images and flow
        im1_np0 = common.read_image_as_byte(im1_filename)
        im2_np0 = common.read_image_as_byte(im2_filename)
        flo_f_np0 = common.read_flo_as_float32(flo_f_filename)
        flo_b_np0 = common.read_flo_as_float32(flo_b_filename)
        occ1_np0 = common.read_occ_image_as_float32(occ1_filename)
        occ2_np0 = common.read_occ_image_as_float32(occ2_filename)

        # possibly apply photometric transformations
        im1, im2 = self._photometric_transform(im1_np0, im2_np0)

        # convert flow to FloatTensor
        flo_f = common.numpy2torch(flo_f_np0)
        flo_b = common.numpy2torch(flo_b_np0)

        # convert occ to FloatTensor
        occ1 = common.numpy2torch(occ1_np0)
        occ2 = common.numpy2torch(occ2_np0)

        # example filename
        basename = os.path.basename(im1_filename)[:5]

        example_dict = {
            "input1": im1,
            "input2": im2,
            "target1": flo_f,
            "target2": flo_b,
            "target_occ1": occ1,
            "target_occ2": occ2,
            "index": index,
            "basename": basename
        }

        return example_dict

    def __len__(self):
        return self._size


class FlyingChairsOccTrain(FlyingChairsOcc):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=True):
        super(FlyingChairsOccTrain, self).__init__(
            args,
            root=root,
            photometric_augmentations=photometric_augmentations,
            dstype="train")


class FlyingChairsOccValid(FlyingChairsOcc):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=False):
        super(FlyingChairsOccValid, self).__init__(
            args,
            root=root,
            photometric_augmentations=photometric_augmentations,
            dstype="valid")


class FlyingChairsOccFull(FlyingChairsOcc):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=False):
        super(FlyingChairsOccFull, self).__init__(
            args,
            root=root,
            photometric_augmentations=photometric_augmentations,
            dstype="full")


================================================
FILE: datasets/kitti_combined.py
================================================
from __future__ import absolute_import, division, print_function

import os
import torch.utils.data as data
from glob import glob

from torchvision import transforms as vision_transforms

from . import transforms
from . import common

import numpy as np
import png

VALIDATE_INDICES_2015 = [10, 11, 12, 25, 26, 30, 31, 40, 41, 42, 46, 52, 53, 72, 73, 74, 75, 76, 80, 81, 85, 86, 95, 96, 97, 98, 104, 116, 117, 120, 121, 126, 127, 153, 172, 175, 183, 184, 190, 199]
VALIDATE_INDICES_2012 = [0, 12, 15, 16, 17, 18, 24, 30, 38, 39, 42, 50, 54, 59, 60, 61, 77, 78, 81, 89, 97, 101, 107, 121, 124, 142, 145, 146, 152, 154, 155, 158, 159, 160, 164, 182, 183, 184, 190]


def read_png_flow(flow_file):
    flow_object = png.Reader(filename=flow_file)
    flow_direct = flow_object.asDirect()
    flow_data = list(flow_direct[2])
    (w, h) = flow_direct[3]['size']
    flow = np.zeros((h, w, 3), dtype=np.float64)
    for i in range(len(flow_data)):
        flow[i, :, 0] = flow_data[i][0::3]
        flow[i, :, 1] = flow_data[i][1::3]
        flow[i, :, 2] = flow_data[i][2::3]

    invalid_idx = (flow[:, :, 2] == 0)
    flow[:, :, 0:2] = (flow[:, :, 0:2] - 2 ** 15) / 64.0
    flow[invalid_idx, 0] = 0
    flow[invalid_idx, 1] = 0
    return flow[:, :, 0:2], (1 - invalid_idx * 1)[:, :, None]


def kitti_random_crop(im1, im2, flo_f, valid_mask, crop_height=370, crop_width=1224):
    height, width, _ = im1.shape
    # get starting positions
    x = np.random.uniform(0, width - crop_width + 1)
    y = np.random.uniform(0, height - crop_height + 1)
    str_x = int(x)
    str_y = int(y)
    end_x = int(x + crop_width)
    end_y = int(y + crop_height)

    im1 = im1[str_y:end_y, str_x:end_x, :]
    im2 = im2[str_y:end_y, str_x:end_x, :]
    flo_f = flo_f[str_y:end_y, str_x:end_x, :]
    valid_mask = valid_mask[str_y:end_y, str_x:end_x, :]

    return im1, im2, flo_f, valid_mask


class Kitti_comb_test(data.Dataset):
    def __init__(self,
                 args,
                 images_root_2015=None,
                 images_root_2012=None,
                 photometric_augmentations=False,
                 preprocessing_crop=True):

        self._args = args
        self.preprocessing_crop = preprocessing_crop

        list_of_indices_2012 = []
        list_of_indices_2015 = []

        # ----------------------------------------------------------
        # KITTI 2015
        # ----------------------------------------------------------        
        if images_root_2015 is not None:

            if not os.path.isdir(images_root_2015):
                raise ValueError("Image directory not found! {}".format(images_root_2015))

            all_img1_2015_filenames = sorted(glob(os.path.join(images_root_2015, "*_10.png")))
            all_img2_2015_filenames = sorted(glob(os.path.join(images_root_2015, "*_11.png")))
            assert len(all_img1_2015_filenames) != 0
            assert len(all_img2_2015_filenames) == len(all_img1_2015_filenames)
            list_of_indices_2015 = range(len(all_img1_2015_filenames))           

        # ----------------------------------------------------------
        # KITTI 2012
        # ----------------------------------------------------------        
        if images_root_2012 is not None:

            if not os.path.isdir(images_root_2012):
                raise ValueError("Image directory not found! {}".format(images_root_2012))

            all_img1_2012_filenames = sorted(glob(os.path.join(images_root_2012, "*_10.png")))
            all_img2_2012_filenames = sorted(glob(os.path.join(images_root_2012, "*_11.png")))
            assert len(all_img1_2012_filenames) != 0
            assert len(all_img2_2012_filenames) == len(all_img1_2012_filenames)
            list_of_indices_2012 = range(len(all_img1_2012_filenames))

        # ----------------------------------------------------------
        # Save list of actual filenames for inputs and flows
        # ----------------------------------------------------------
        self._image_list = []
        self._flow_list = []

        for ii in list_of_indices_2015:

            im1 = all_img1_2015_filenames[ii]
            im2 = all_img2_2015_filenames[ii]
            idx1 = os.path.splitext(os.path.basename(im1))[0][:-3]
            idx2 = os.path.splitext(os.path.basename(im2))[0][:-3]
            assert idx1 == idx2

            if not os.path.isfile(im1) or not os.path.isfile(im2):
                continue

            self._image_list += [[im1, im2]]


        for ii in list_of_indices_2012:

            im1 = all_img1_2012_filenames[ii]
            im2 = all_img2_2012_filenames[ii]
            idx1 = os.path.splitext(os.path.basename(im1))[0][:-3]
            idx2 = os.path.splitext(os.path.basename(im2))[0][:-3]
            assert idx1 == idx2

            if not os.path.isfile(im1) or not os.path.isfile(im2):
                continue

            self._image_list += [[im1, im2]]

        self._size = len(self._image_list)

        assert len(self._image_list) != 0


        # ----------------------------------------------------------
        # photometric_augmentations
        # ----------------------------------------------------------
        if photometric_augmentations:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> PIL
                vision_transforms.ToPILImage(),
                # PIL -> PIL : random hsv and contrast
                vision_transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
                # PIL -> FloatTensor
                vision_transforms.transforms.ToTensor(),
                transforms.RandomGamma(min_gamma=0.7, max_gamma=1.5, clip_image=True),
            ], from_numpy=True, to_numpy=False)

        else:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> FloatTensor
                vision_transforms.transforms.ToTensor(),
            ], from_numpy=True, to_numpy=False)


    def __getitem__(self, index):
        index = index % self._size

        im1_filename = self._image_list[index][0]
        im2_filename = self._image_list[index][1]

        # read float32 images and flow
        im1_np0 = common.read_image_as_byte(im1_filename)
        im2_np0 = common.read_image_as_byte(im2_filename)

        # possibly apply photometric transformations
        im1, im2 = self._photometric_transform(im1_np0, im2_np0)

        # example filename
        basename = os.path.basename(im1_filename)[:6]

        example_dict = {
            "input1": im1,
            "input2": im2,
            "index": index,
            "basename": basename
        }

        return example_dict

    def __len__(self):
        return self._size


class Kitti_comb(data.Dataset):
    def __init__(self,
                 args,
                 images_root_2015=None,
                 flow_root_2015=None,
                 images_root_2012=None,
                 flow_root_2012=None,
                 photometric_augmentations=False,
                 preprocessing_crop=True,
                 dstype="full"):

        self._args = args
        self.preprocessing_crop = preprocessing_crop

        list_of_indices_2012 = []
        list_of_indices_2015 = []

        # ----------------------------------------------------------
        # KITTI 2015
        # ----------------------------------------------------------        
        if images_root_2015 is not None and flow_root_2015 is not None:

            if not os.path.isdir(images_root_2015):
                raise ValueError("Image directory not found!  {}".format(images_root_2015))
            if not os.path.isdir(flow_root_2015):
                raise ValueError("Flow directory not found!  {}".format(flow_root_2015))

            all_img1_2015_filenames = sorted(glob(os.path.join(images_root_2015, "*_10.png")))
            all_img2_2015_filenames = sorted(glob(os.path.join(images_root_2015, "*_11.png")))            
            flow_f_2015_filenames = sorted(glob(os.path.join(flow_root_2015, "*_10.png")))
            assert len(all_img1_2015_filenames) != 0
            assert len(all_img2_2015_filenames) == len(all_img1_2015_filenames)
            assert len(flow_f_2015_filenames) == len(all_img1_2015_filenames)
            num_flows_2015 = len(flow_f_2015_filenames)           
            validate_indices_2015 = [x for x in VALIDATE_INDICES_2015 if x in range(num_flows_2015)]

            if dstype == "train":
                list_of_indices_2015 = [x for x in range(num_flows_2015) if x not in validate_indices_2015]
            elif dstype == "valid":
                list_of_indices_2015 = validate_indices_2015
            elif dstype == "full":
                list_of_indices_2015 = range(len(all_img1_2015_filenames))
            else:
                raise ValueError("KITTI 2015: dstype '%s' unknown!", dstype)


        # ----------------------------------------------------------
        # KITTI 2012
        # ----------------------------------------------------------        
        if images_root_2012 is not None:

            if not os.path.isdir(images_root_2012):
                raise ValueError("Image directory '%s' not found!")
            if not os.path.isdir(flow_root_2012):
                raise ValueError("Flow directory '%s' not found!")

            all_img1_2012_filenames = sorted(glob(os.path.join(images_root_2012, "*_10.png")))
            all_img2_2012_filenames = sorted(glob(os.path.join(images_root_2012, "*_11.png")))            
            flow_f_2012_filenames = sorted(glob(os.path.join(flow_root_2012, "*_10.png")))
            assert len(all_img1_2012_filenames) != 0
            assert len(all_img2_2012_filenames) == len(all_img1_2012_filenames)
            assert len(flow_f_2012_filenames) == len(all_img1_2012_filenames)
            num_flows_2012 = len(flow_f_2012_filenames)           
            validate_indices_2012 = [x for x in VALIDATE_INDICES_2012 if x in range(num_flows_2012)]

            if dstype == "train":
                list_of_indices_2012 = [x for x in range(num_flows_2012) if x not in validate_indices_2012]
            elif dstype == "valid":
                list_of_indices_2012 = validate_indices_2012
            elif dstype == "full":
                list_of_indices_2012 = range(len(all_img1_2012_filenames))
            else:
                raise ValueError("KITTI 2012: dstype '%s' unknown!", dstype)


        # ----------------------------------------------------------
        # Save list of actual filenames for inputs and flows
        # ----------------------------------------------------------
        self._image_list = []
        self._flow_list = []

        for ii in list_of_indices_2015:

            im1 = all_img1_2015_filenames[ii]
            im2 = all_img2_2015_filenames[ii]
            idx1 = os.path.splitext(os.path.basename(im1))[0][:-3]
            idx2 = os.path.splitext(os.path.basename(im2))[0][:-3]
            assert idx1 == idx2

            if not os.path.isfile(im1) or not os.path.isfile(im2):
                continue

            self._image_list += [[im1, im2]]

            if dstype is not "test":
                flo_f = flow_f_2015_filenames[ii]
                idx_f = os.path.splitext(os.path.basename(flo_f))[0][:-3]
                assert idx1 == idx_f                
                if not os.path.isfile(flo_f):
                    continue
                self._flow_list += [[flo_f]]


        for ii in list_of_indices_2012:

            im1 = all_img1_2012_filenames[ii]
            im2 = all_img2_2012_filenames[ii]
            idx1 = os.path.splitext(os.path.basename(im1))[0][:-3]
            idx2 = os.path.splitext(os.path.basename(im2))[0][:-3]
            assert idx1 == idx2

            if not os.path.isfile(im1) or not os.path.isfile(im2):
                continue

            self._image_list += [[im1, im2]]

            if dstype is not "test":
                flo_f = flow_f_2012_filenames[ii]
                idx_f = os.path.splitext(os.path.basename(flo_f))[0][:-3]
                assert idx1 == idx_f                
                if not os.path.isfile(flo_f):
                    continue
                self._flow_list += [[flo_f]]


        self._size = len(self._image_list)

        assert len(self._image_list) != 0
        if dstype is not "test":
            assert len(self._image_list) == len(self._flow_list)

        # ----------------------------------------------------------
        # photometric_augmentations
        # ----------------------------------------------------------
        if photometric_augmentations:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> PIL
                vision_transforms.ToPILImage(),
                # PIL -> PIL : random hsv and contrast
                vision_transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
                # PIL -> FloatTensor
                vision_transforms.transforms.ToTensor(),
                transforms.RandomGamma(min_gamma=0.7, max_gamma=1.5, clip_image=True),
            ], from_numpy=True, to_numpy=False)

        else:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> FloatTensor
                vision_transforms.transforms.ToTensor(),
            ], from_numpy=True, to_numpy=False)


    def __getitem__(self, index):
        index = index % self._size

        im1_filename = self._image_list[index][0]
        im2_filename = self._image_list[index][1]
        flo_f_filename = self._flow_list[index][0]

        # read float32 images and flow
        im1_np0 = common.read_image_as_byte(im1_filename)
        im2_np0 = common.read_image_as_byte(im2_filename)
        flo_f_np0, valid_mask = read_png_flow(flo_f_filename)

        if self.preprocessing_crop:
            im1_np0, im2_np0, flo_f_np0, valid_mask = kitti_random_crop(im1_np0, im2_np0, flo_f_np0, valid_mask)

        # possibly apply photometric transformations
        im1, im2 = self._photometric_transform(im1_np0, im2_np0)

        # convert flow to FloatTensor
        flo_f = common.numpy2torch(flo_f_np0)
        valid_mask_f = common.numpy2torch(valid_mask)

        # example filename
        basename = os.path.basename(im1_filename)[:6]

        example_dict = {
            "input1": im1,
            "input2": im2,
            "target1": flo_f,
            "target2": flo_f,
            "index": index,
            "basename": basename,
            "input_valid": valid_mask_f
        }

        return example_dict

    def __len__(self):
        return self._size


class KittiCombTrain(Kitti_comb):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=True,
                 preprocessing_crop=True):
        images_root_2015 = os.path.join(root, "data_scene_flow", "training", "image_2")
        flow_root_2015 = os.path.join(root, "data_scene_flow", "training", "flow_occ")
        images_root_2012 = os.path.join(root, "data_stereo_flow", "training", "colored_0")
        flow_root_2012 = os.path.join(root, "data_stereo_flow",  "training", "flow_occ")
        super(KittiCombTrain, self).__init__(
            args,
            images_root_2015=images_root_2015,
            flow_root_2015=flow_root_2015,
            images_root_2012=images_root_2012,
            flow_root_2012=flow_root_2012,
            photometric_augmentations=photometric_augmentations,
            preprocessing_crop=preprocessing_crop,
            dstype="train")


class KittiCombVal(Kitti_comb):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=False,
                 preprocessing_crop=False):
        images_root_2015 = os.path.join(root, "data_scene_flow", "training", "image_2")
        flow_root_2015 = os.path.join(root, "data_scene_flow", "training", "flow_occ")
        images_root_2012 = os.path.join(root, "data_stereo_flow", "training", "colored_0")
        flow_root_2012 = os.path.join(root, "data_stereo_flow",  "training", "flow_occ")
        super(KittiCombVal, self).__init__(
            args,
            images_root_2015=images_root_2015,
            flow_root_2015=flow_root_2015,
            images_root_2012=images_root_2012,
            flow_root_2012=flow_root_2012,
            photometric_augmentations=photometric_augmentations,
            preprocessing_crop=preprocessing_crop,
            dstype="valid")


class KittiCombFull(Kitti_comb):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=True,
                 preprocessing_crop=True):
        images_root_2015 = os.path.join(root, "data_scene_flow", "training", "image_2")
        flow_root_2015 = os.path.join(root, "data_scene_flow", "training", "flow_occ")
        images_root_2012 = os.path.join(root, "data_stereo_flow", "training", "colored_0")
        flow_root_2012 = os.path.join(root, "data_stereo_flow",  "training", "flow_occ")
        super(KittiCombFull, self).__init__(
            args,
            images_root_2015=images_root_2015,
            flow_root_2015=flow_root_2015,
            images_root_2012=images_root_2012,
            flow_root_2012=flow_root_2012,
            photometric_augmentations=photometric_augmentations,
            preprocessing_crop=preprocessing_crop,
            dstype="full")


class KittiComb2015Train(Kitti_comb):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=True,
                 preprocessing_crop=True):
        images_root_2015 = os.path.join(root, "data_scene_flow", "training", "image_2")
        flow_root_2015 = os.path.join(root, "data_scene_flow", "training", "flow_occ")
        super(KittiComb2015Train, self).__init__(
            args,
            images_root_2015=images_root_2015,
            flow_root_2015=flow_root_2015,
            photometric_augmentations=photometric_augmentations,
            preprocessing_crop=preprocessing_crop,
            dstype="train")


class KittiComb2015Val(Kitti_comb):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=False,
                 preprocessing_crop=False):
        images_root_2015 = os.path.join(root, "data_scene_flow", "training", "image_2")
        flow_root_2015 = os.path.join(root, "data_scene_flow", "training", "flow_occ")
        super(KittiComb2015Val, self).__init__(
            args,
            images_root_2015=images_root_2015,
            flow_root_2015=flow_root_2015,
            photometric_augmentations=photometric_augmentations,
            preprocessing_crop=preprocessing_crop,
            dstype="valid")


class KittiComb2015Full(Kitti_comb):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=True,
                 preprocessing_crop=True):
        images_root_2015 = os.path.join(root, "data_scene_flow", "training", "image_2")
        flow_root_2015 = os.path.join(root, "data_scene_flow", "training", "flow_occ")
        super(KittiComb2015Full, self).__init__(
            args,
            images_root_2015=images_root_2015,
            flow_root_2015=flow_root_2015,
            photometric_augmentations=photometric_augmentations,
            preprocessing_crop=preprocessing_crop,
            dstype="full")


class KittiComb2015Test(Kitti_comb_test):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=False,
                 preprocessing_crop=False):
        images_root_2015 = os.path.join(root, "data_scene_flow", "testing", "image_2")
        super(KittiComb2015Test, self).__init__(
            args,
            images_root_2015=images_root_2015,
            photometric_augmentations=photometric_augmentations,
            preprocessing_crop=preprocessing_crop)


class KittiComb2012Train(Kitti_comb):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=True,
                 preprocessing_crop=True):
        images_root_2012 = os.path.join(root, "data_stereo_flow", "training", "colored_0")
        flow_root_2012 = os.path.join(root, "data_stereo_flow",  "training", "flow_occ")
        super(KittiComb2012Train, self).__init__(
            args,
            images_root_2012=images_root_2012,
            flow_root_2012=flow_root_2012,
            photometric_augmentations=photometric_augmentations,
            preprocessing_crop=preprocessing_crop,
            dstype="train")


class KittiComb2012Val(Kitti_comb):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=False,
                 preprocessing_crop=False):
        images_root_2012 = os.path.join(root, "data_stereo_flow", "training", "colored_0")
        flow_root_2012 = os.path.join(root, "data_stereo_flow",  "training", "flow_occ")
        super(KittiComb2012Val, self).__init__(
            args,
            images_root_2012=images_root_2012,
            flow_root_2012=flow_root_2012,
            photometric_augmentations=photometric_augmentations,
            preprocessing_crop=preprocessing_crop,
            dstype="valid")


class KittiComb2012Full(Kitti_comb):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=True,
                 preprocessing_crop=True):
        images_root_2012 = os.path.join(root, "data_stereo_flow", "training", "colored_0")
        flow_root_2012 = os.path.join(root, "data_stereo_flow",  "training", "flow_occ")
        super(KittiComb2012Full, self).__init__(
            args,
            images_root_2012=images_root_2012,
            flow_root_2012=flow_root_2012,
            photometric_augmentations=photometric_augmentations,
            preprocessing_crop=preprocessing_crop,
            dstype="full")


class KittiComb2012Test(Kitti_comb_test):
    def __init__(self,
                 args,
                 root,
                 photometric_augmentations=False,
                 preprocessing_crop=False):
        images_root_2012 = os.path.join(root, "data_stereo_flow", "testing", "colored_0")
        super(KittiComb2012Test, self).__init__(
            args,
            images_root_2012=images_root_2012,
            photometric_augmentations=photometric_augmentations,
            preprocessing_crop=preprocessing_crop)

================================================
FILE: datasets/sintel.py
================================================
from __future__ import absolute_import, division, print_function

import os
import torch.utils.data as data
from glob import glob

from torchvision import transforms as vision_transforms

from . import transforms
from . import common

import tools


VALIDATE_INDICES = [
    199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210,
    211, 212, 213, 214, 215, 216, 217, 340, 341, 342, 343, 344,
    345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356,
    357, 358, 359, 360, 361, 362, 363, 364, 536, 537, 538, 539,
    540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551,
    552, 553, 554, 555, 556, 557, 558, 559, 560, 659, 660, 661,
    662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673,
    674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685,
    686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697,
    967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978,
    979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990,
    991]


class _Sintel(data.Dataset):
    def __init__(self,
                 args,
                 dir_root=None,
                 photometric_augmentations=False,
                 imgtype=None,
                 dstype=None):

        self._args = args

        images_root = os.path.join(dir_root, imgtype)
        if imgtype is "comb":
            images_root = os.path.join(dir_root, "clean")
        flow_root = os.path.join(dir_root, "flow")
        occ_root = os.path.join(dir_root, "occlusions_rev")

        if not os.path.isdir(images_root):
            raise ValueError("Image directory '%s' not found!")
        if flow_root is not None and not os.path.isdir(flow_root):
            raise ValueError("Flow directory '%s' not found!")
        if occ_root is not None and not os.path.isdir(occ_root):
            raise ValueError("Occ directory '%s' not found!")
        
        all_flo_filenames = sorted(glob(os.path.join(flow_root, "*/*.flo")))
        all_occ_filenames = sorted(glob(os.path.join(occ_root, "*/*.png")))
        all_img_filenames = sorted(glob(os.path.join(images_root, "*/*.png")))

        # Remember base for substraction at runtime
        # e.g. subtract_base = "/home/user/.../MPI-Sintel-Complete/training/clean"
        self._substract_base = tools.cd_dotdot(images_root)

        # ------------------------------------------------------------------------
        # Get unique basenames
        # ------------------------------------------------------------------------
        # e.g. base_folders = [alley_1", "alley_2", "ambush_2", ...]
        substract_full_base = tools.cd_dotdot(all_img_filenames[0])
        base_folders = sorted(list(set([
            os.path.dirname(fn.replace(substract_full_base, ""))[1:] for fn in all_img_filenames
        ])))

        self._image_list = []
        self._flow_list = []
        self._occ_list = []

        for base_folder in base_folders:            
            img_filenames = [x for x in all_img_filenames if base_folder in x]
            flo_filenames = [x for x in all_flo_filenames if base_folder in x]
            occ_filenames = [x for x in all_occ_filenames if base_folder in x]

            for i in range(len(img_filenames) - 1):

                im1 = img_filenames[i]
                im2 = img_filenames[i + 1]
                flo = flo_filenames[i]
                occ = occ_filenames[i]

                self._image_list += [[im1, im2]]
                self._flow_list += [flo]
                self._occ_list += [occ]

                # Sanity check
                im1_base_filename = os.path.splitext(os.path.basename(im1))[0]
                im2_base_filename = os.path.splitext(os.path.basename(im2))[0]
                flo_base_filename = os.path.splitext(os.path.basename(flo))[0]
                occ_base_filename = os.path.splitext(os.path.basename(occ))[0]
                im1_frame, im1_no = im1_base_filename.split("_")
                im2_frame, im2_no = im2_base_filename.split("_")
                assert(im1_frame == im2_frame)
                assert(int(im1_no) == int(im2_no) - 1)
                
                flo_frame, flo_no = flo_base_filename.split("_")
                assert(im1_frame == flo_frame)
                assert(int(im1_no) == int(flo_no))
                
                occ_frame, occ_no = occ_base_filename.split("_")
                assert(im1_frame == occ_frame)
                assert(int(im1_no) == int(occ_no))
        
        assert len(self._image_list) == len(self._flow_list)        
        assert len(self._image_list) == len(self._occ_list)

        # -------------------------------------------------------------
        # Remove invalid validation indices
        # -------------------------------------------------------------
        full_num_examples = len(self._image_list)
        validate_indices = [x for x in VALIDATE_INDICES if x in range(full_num_examples)]

        # ----------------------------------------------------------
        # Construct list of indices for training/validation
        # ----------------------------------------------------------
        list_of_indices = None
        if dstype == "train":
            list_of_indices = [x for x in range(full_num_examples) if x not in validate_indices]
        elif dstype == "valid":
            list_of_indices = validate_indices
        elif dstype == "full":
            list_of_indices = range(full_num_examples)
        else:
            raise ValueError("dstype '%s' unknown!", dstype)

        # ----------------------------------------------------------
        # Save list of actual filenames for inputs and flows
        # ----------------------------------------------------------
        self._image_list = [self._image_list[i] for i in list_of_indices]
        self._flow_list = [self._flow_list[i] for i in list_of_indices]
        self._occ_list = [self._occ_list[i] for i in list_of_indices]

        if imgtype is "comb":
            image_list_final = [[val[0].replace("clean", "final"), val[1].replace("clean", "final")] for idx, val in enumerate(self._image_list)]
            self._image_list += image_list_final
            self._flow_list += self._flow_list
            self._occ_list += self._occ_list

        assert len(self._image_list) == len(self._flow_list)
        assert len(self._image_list) == len(self._occ_list)

        # ----------------------------------------------------------
        # photometric_augmentations
        # ----------------------------------------------------------
        if photometric_augmentations:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> PIL
                vision_transforms.ToPILImage(),
                # PIL -> PIL : random hsv and contrast
                vision_transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
                # PIL -> FloatTensor
                vision_transforms.transforms.ToTensor(),
                transforms.RandomGamma(min_gamma=0.7, max_gamma=1.5, clip_image=True),
            ], from_numpy=True, to_numpy=False)

        else:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> FloatTensor
                vision_transforms.transforms.ToTensor(),
            ], from_numpy=True, to_numpy=False)

        self._size = len(self._image_list)

    def __getitem__(self, index):
        index = index % self._size

        im1_filename = self._image_list[index][0]
        im2_filename = self._image_list[index][1]
        flo_filename = self._flow_list[index]
        occ_filename = self._occ_list[index]

        # read float32 images and flow
        im1_np0 = common.read_image_as_byte(im1_filename)
        im2_np0 = common.read_image_as_byte(im2_filename)
        flo_np0 = common.read_flo_as_float32(flo_filename)
        occ_np0 = common.read_occ_image_as_float32(occ_filename)

        # possibly apply photometric transformations
        im1, im2 = self._photometric_transform(im1_np0, im2_np0)
        flo = common.numpy2torch(flo_np0)
        occ = common.numpy2torch(occ_np0)

        # e.g. "clean/alley_1/"
        basedir = os.path.splitext(os.path.dirname(im1_filename).replace(self._substract_base, "")[1:])[0]

        # example filename
        basename = os.path.splitext(os.path.basename(im1_filename))[0]

        example_dict = {
            "input1": im1,
            "input2": im2,
            "index": index,
            "basedir": basedir,
            "basename": basename,
            "target1": flo,
            "target_occ1": occ
        }

        return example_dict

    def __len__(self):
        return self._size


class _Sintel_test(data.Dataset):
    def __init__(self,
                 args,
                 dir_root=None,
                 photometric_augmentations=False,
                 imgtype=None):

        self._args = args
        images_root = os.path.join(dir_root, imgtype)
        if not os.path.isdir(images_root):
            raise ValueError("Image directory '%s' not found!")

        all_img_filenames = sorted(glob(os.path.join(images_root, "*/*.png")))

        # Remember base for substraction at runtime
        # e.g. subtract_base = "/home/user/.../MPI-Sintel-Complete/training/clean"
        self._substract_base = tools.cd_dotdot(images_root)

        # ------------------------------------------------------------------------
        # Get unique basenames
        # ------------------------------------------------------------------------
        # e.g. base_folders = [alley_1", "alley_2", "ambush_2", ...]
        substract_full_base = tools.cd_dotdot(all_img_filenames[0])
        base_folders = sorted(list(set([
            os.path.dirname(fn.replace(substract_full_base, ""))[1:] for fn in all_img_filenames
        ])))

        self._image_list = []

        for base_folder in base_folders:            
            img_filenames = [x for x in all_img_filenames if base_folder in x]

            for i in range(len(img_filenames) - 1):

                im1 = img_filenames[i]
                im2 = img_filenames[i + 1]
                self._image_list += [[im1, im2]]

                # Sanity check
                im1_base_filename = os.path.splitext(os.path.basename(im1))[0]
                im2_base_filename = os.path.splitext(os.path.basename(im2))[0]
                im1_frame, im1_no = im1_base_filename.split("_")
                im2_frame, im2_no = im2_base_filename.split("_")
                assert(im1_frame == im2_frame)
                assert(int(im1_no) == int(im2_no) - 1)                

        full_num_examples = len(self._image_list)
        list_of_indices = range(full_num_examples)

        # ----------------------------------------------------------
        # Save list of actual filenames for inputs and flows
        # ----------------------------------------------------------
        self._image_list = [self._image_list[i] for i in list_of_indices]
        
        # ----------------------------------------------------------
        # photometric_augmentations
        # ----------------------------------------------------------
        if photometric_augmentations:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> PIL
                vision_transforms.ToPILImage(),
                # PIL -> PIL : random hsv and contrast
                vision_transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
                # PIL -> FloatTensor
                vision_transforms.transforms.ToTensor(),
                transforms.RandomGamma(min_gamma=0.7, max_gamma=1.5, clip_image=True),
            ], from_numpy=True, to_numpy=False)

        else:
            self._photometric_transform = transforms.ConcatTransformSplitChainer([
                # uint8 -> FloatTensor
                vision_transforms.transforms.ToTensor(),
            ], from_numpy=True, to_numpy=False)

        self._size = len(self._image_list)

    def __getitem__(self, index):
        index = index % self._size

        im1_filename = self._image_list[index][0]
        im2_filename = self._image_list[index][1]

        # read float32 images and flow
        im1_np0 = common.read_image_as_byte(im1_filename)
        im2_np0 = common.read_image_as_byte(im2_filename)

        # possibly apply photometric transformations
        im1, im2 = self._photometric_transform(im1_np0, im2_np0)

        # e.g. "clean/alley_1/"
        basedir = os.path.splitext(os.path.dirname(im1_filename).replace(self._substract_base, "")[1:])[0]

        # example filename
        basename = os.path.splitext(os.path.basename(im1_filename))[0]

        example_dict = {
            "input1": im1,
            "input2": im2,
            "index": index,
            "basedir": basedir,
            "basename": basename
        }

        return example_dict

    def __len__(self):
        return self._size


class SintelTrainingCleanTrain(_Sintel):
    def __init__(self, args, root, photometric_augmentations=True):
        dir_root = os.path.join(root, "training")
        super(SintelTrainingCleanTrain, self).__init__(
            args,
            dir_root=dir_root,
            photometric_augmentations=photometric_augmentations,
            imgtype="clean",
            dstype="train")


class SintelTrainingCleanValid(_Sintel):
    def __init__(self, args, root, photometric_augmentations=False):
        dir_root = os.path.join(root, "training")
        super(SintelTrainingCleanValid, self).__init__(
            args,
            dir_root=dir_root,
            photometric_augmentations=photometric_augmentations,
            imgtype="clean",
            dstype="valid")


class SintelTrainingCleanFull(_Sintel):
    def __init__(self, args, root, photometric_augmentations=True):
        dir_root = os.path.join(root, "training")
        super(SintelTrainingCleanFull, self).__init__(
            args,
            dir_root=dir_root,
            photometric_augmentations=photometric_augmentations,
            imgtype="clean",
            dstype="full")


class SintelTrainingFinalTrain(_Sintel):
    def __init__(self, args, root, photometric_augmentations=True):
        dir_root = os.path.join(root, "training")
        super(SintelTrainingFinalTrain, self).__init__(
            args,
            dir_root=dir_root,
            photometric_augmentations=photometric_augmentations,
            imgtype="final",
            dstype="train")


class SintelTrainingFinalValid(_Sintel):
    def __init__(self, args, root, photometric_augmentations=False):
        dir_root = os.path.join(root, "training")
        super(SintelTrainingFinalValid, self).__init__(
            args,
            dir_root=dir_root,
            photometric_augmentations=photometric_augmentations,
            imgtype="final",
            dstype="valid")


class SintelTrainingFinalFull(_Sintel):
    def __init__(self, args, root, photometric_augmentations=True):
        dir_root = os.path.join(root, "training")
        super(SintelTrainingFinalFull, self).__init__(
            args,
            dir_root=dir_root,
            photometric_augmentations=photometric_augmentations,
            imgtype="final",
            dstype="full")


class SintelTrainingCombTrain(_Sintel):
    def __init__(self, args, root, photometric_augmentations=True):
        dir_root = os.path.join(root, "training")
        super(SintelTrainingCombTrain, self).__init__(
            args,
            dir_root=dir_root,
            photometric_augmentations=photometric_augmentations,
            imgtype="comb",
            dstype="train")


class SintelTrainingCombValid(_Sintel):
    def __init__(self, args, root, photometric_augmentations=False):
        dir_root = os.path.join(root, "training")
        super(SintelTrainingCombValid, self).__init__(
            args,
            dir_root=dir_root,
            photometric_augmentations=photometric_augmentations,
            imgtype="comb",
            dstype="valid")


class SintelTrainingCombFull(_Sintel):
    def __init__(self, args, root, photometric_augmentations=True):
        dir_root = os.path.join(root, "training")
        super(SintelTrainingCombFull, self).__init__(
            args,
            dir_root=dir_root,
            photometric_augmentations=photometric_augmentations,
            imgtype="comb",
            dstype="full")


class SintelTestClean(_Sintel_test):
    def __init__(self, args, root, photometric_augmentations=False):
        dir_root = os.path.join(root, "test")
        super(SintelTestClean, self).__init__(
            args,
            dir_root=dir_root,
            photometric_augmentations=photometric_augmentations,
            imgtype="clean")


class SintelTestFinal(_Sintel_test):
    def __init__(self, args, root, photometric_augmentations=False):
        dir_root = os.path.join(root, "test")
        super(SintelTestFinal, self).__init__(
            args,
            dir_root=dir_root,
            photometric_augmentations=photometric_augmentations,
            imgtype="final")


================================================
FILE: datasets/transforms.py
================================================
## Portions of Code from, copyright 2018 Jochen Gast

from __future__ import absolute_import, division, print_function

import numpy as np
import torch


def image_random_gamma(image, min_gamma=0.7, max_gamma=1.5, clip_image=False):
    gamma = np.random.uniform(min_gamma, max_gamma)
    adjusted = torch.pow(image, gamma)
    if clip_image:
        adjusted.clamp_(0.0, 1.0)
    return adjusted


class RandomGamma:
    def __init__(self, min_gamma=0.7, max_gamma=1.5, clip_image=False):
        self._min_gamma = min_gamma
        self._max_gamma = max_gamma
        self._clip_image = clip_image

    def __call__(self, image):
        return image_random_gamma(
            image,
            min_gamma=self._min_gamma,
            max_gamma=self._max_gamma,
            clip_image=self._clip_image)


# ------------------------------------------------------------------
# Allow transformation chains of the type:
#   im1, im2, .... = transform(im1, im2, ...)
# ------------------------------------------------------------------
class TransformChainer:
    def __init__(self, list_of_transforms):
        self._list_of_transforms = list_of_transforms

    def __call__(self, *args):
        list_of_args = list(args)
        for transform in self._list_of_transforms:
            list_of_args = [transform(arg) for arg in list_of_args]
        if len(args) == 1:
            return list_of_args[0]
        else:
            return list_of_args


# ------------------------------------------------------------------
# Allow transformation chains of the type:
#   im1, im2, .... = split( transform( concatenate(im1, im2, ...) ))
# ------------------------------------------------------------------
class ConcatTransformSplitChainer:
    def __init__(self, list_of_transforms, from_numpy=True, to_numpy=False):
        self._chainer = TransformChainer(list_of_transforms)
        self._from_numpy = from_numpy
        self._to_numpy = to_numpy

    def __call__(self, *args):
        num_splits = len(args)

        if self._from_numpy:
            concatenated = np.concatenate(args, axis=0)
        else:
            concatenated = torch.cat(args, dim=1)

        transformed = self._chainer(concatenated)

        if self._to_numpy:
            split = np.split(transformed, indices_or_sections=num_splits, axis=0)
        else:
            split = torch.chunk(transformed, num_splits, dim=1)

        return split


================================================
FILE: flyingchairsocc/README.md
================================================
# FlyingChairsOcc dataset

<img src=demo_img.png>

The FlyingChairsOcc dataset is an extended version of the <a href="https://lmb.informatik.uni-freiburg.de/resources/datasets/FlyingChairs.en.html" target="_blank">Flying Chairs Dataset</a>, including bi-directional optical flow ground truth and two occlusion maps for each image.
You may also find that another concurrent dataset, the <a href="https://lmb.informatik.uni-freiburg.de/resources/datasets/FlyingChairs.en.html" target="_blank">Flying Chairs 2 Dataset</a>, is useful.


## License agreement

This dataset is made freely available to academic and non-academic entities for non-commercial purposes such as academic research, teaching, scientific publications, or personal experimentation. Permission is granted to use the data given that you agree:

1. That the dataset comes “AS IS”, without express or implied warranty. Although every effort has been made to ensure accuracy, we (TU Darmstadt) do not accept any responsibility for errors or omissions.
2. That you include a reference to the FlyingChairsOcc Dataset in any work that makes use of the dataset. For research papers, cite our publication: J. Hur and S. Roth, “Iterative residual refinement for joint optical flow and occlusion estimation,” in Proc. of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), Long Beach, California, June 2019
3. That you do not distribute this dataset or modified versions. It is permissible to distribute derivative works in as far as they are abstract representations of this dataset (such as models trained on it or additional annotations that do not directly include any of our data) and do not allow to recover the dataset or something similar in character.
4. That you may not use the dataset or any derivative work for commercial purposes as, for example, licensing or selling the data, or using the data with a purpose to procure a commercial gain.
5. That all rights not expressly granted to you are reserved by us (TU Darmstadt).


## Download link

<a href="https://download.visinf.tu-darmstadt.de/data/flyingchairs_occ/FlyingChairsOcc.tar.gz" target="_blank"><b>Download</b></a>
(82GB)


## Reference

Please cite the paper below if you find the dataset and source codes are useful.  

    @inproceedings{Hur:2019:IRR,  
      Author = {Junhwa Hur and Stefan Roth},  
      Booktitle = {CVPR},  
      Title = {Iterative Residual Refinement for Joint Optical Flow and Occlusion Estimation},  
      Year = {2019}  
    }

Contact: junhwa.hur[at]visinf.tu-darmstadt.de


================================================
FILE: install.sh
================================================
#!/bin/bash
cd ./models/correlation_package
python setup.py install
cd ..


================================================
FILE: logger.py
================================================
## Portions of Code from, copyright 2018 Jochen Gast

from __future__ import absolute_import, division, print_function

import colorama
import logging
import os
import re
import tools
import sys


def get_default_logging_format(colorize=False, brackets=False):
    style = colorama.Style.DIM if colorize else ''
    # color = colorama.Fore.CYAN if colorize else ''
    color = colorama.Fore.WHITE if colorize else ''
    reset = colorama.Style.RESET_ALL if colorize else ''
    if brackets:
        result = "{}{}[%(asctime)s]{} %(message)s".format(style, color, reset)
    else:
        result = "{}{}%(asctime)s{} %(message)s".format(style, color, reset)
    return result


def get_default_logging_datefmt():
    return "%Y-%m-%d %H:%M:%S"


def log_module_info(module):
    lines = module.__str__().split("\n")
    for line in lines:
        logging.info(line)


class LogbookFormatter(logging.Formatter):
    def __init__(self, fmt=None, datefmt=None):
        super(LogbookFormatter, self).__init__(fmt=fmt, datefmt=datefmt)
        self._re = re.compile(r"\033\[[0-9]+m")

    def remove_colors_from_msg(self, msg):
        msg = re.sub(self._re, "", msg)
        return msg

    def format(self, record=None):
        record.msg = self.remove_colors_from_msg(record.msg)
        return super(LogbookFormatter, self).format(record)


class ConsoleFormatter(logging.Formatter):
    def __init__(self, fmt=None, datefmt=None):
        super(ConsoleFormatter, self).__init__(fmt=fmt, datefmt=datefmt)

    def format(self, record=None):
        indent = sys.modules[__name__].global_indent
        record.msg = " " * indent + record.msg
        return super(ConsoleFormatter, self).format(record)


class SkipLogbookFilter(logging.Filter):
    def filter(self, record):
        return record.levelno != logging.LOGBOOK


def configure_logging(filename=None):
    # set global indent level
    sys.modules[__name__].global_indent = 0

    # add custom tqdm logger
    tools.addLoggingLevel("LOGBOOK", 1000)

    # create logger
    root_logger = logging.getLogger("")
    root_logger.setLevel(logging.INFO)

    # create console handler and set level to debug
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    fmt = get_default_logging_format(colorize=True, brackets=False)
    datefmt = get_default_logging_datefmt()
    formatter = ConsoleFormatter(fmt=fmt, datefmt=datefmt)
    console.setFormatter(formatter)

    # Skip logging.tqdm requests for console outputs
    skip_logbook_filter = SkipLogbookFilter()
    console.addFilter(skip_logbook_filter)

    # add console to root_logger
    root_logger.addHandler(console)

    # add logbook
    if filename is not None:
        # ensure dir
        d = os.path.dirname(filename)
        if not os.path.exists(d):
            os.makedirs(d)

        # --------------------------------------------------------------------------------------
        # Configure handler that removes color codes from logbook
        # --------------------------------------------------------------------------------------
        logbook = logging.FileHandler(filename=filename, mode="a", encoding="utf-8")
        logbook.setLevel(logging.INFO)
        fmt = get_default_logging_format(colorize=False, brackets=True)
        logbook_formatter = LogbookFormatter(fmt=fmt, datefmt=datefmt)
        logbook.setFormatter(logbook_formatter)
        root_logger.addHandler(logbook)


class LoggingBlock:
    def __init__(self, title, emph=False):
        self._emph = emph
        bright = colorama.Style.BRIGHT
        cyan = colorama.Fore.CYAN
        reset = colorama.Style.RESET_ALL
        if emph:
            logging.info("%s==>%s %s%s%s" % (cyan, reset, bright, title, reset))
        else:
            logging.info(title)

    def __enter__(self):
        sys.modules[__name__].global_indent += 2
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        sys.modules[__name__].global_indent -= 2


================================================
FILE: losses.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn
import torch.nn.functional as tf


def _elementwise_epe(input_flow, target_flow):
    residual = target_flow - input_flow
    return torch.norm(residual, p=2, dim=1, keepdim=True)

def _elementwise_robust_epe_char(input_flow, target_flow):
    residual = target_flow - input_flow
    return torch.pow(torch.norm(residual, p=2, dim=1, keepdim=True) + 0.01, 0.4)

def _downsample2d_as(inputs, target_as):
    _, _, h, w = target_as.size()
    return tf.adaptive_avg_pool2d(inputs, [h, w])

def _upsample2d_as(inputs, target_as, mode="bilinear"):
    _, _, h, w = target_as.size()
    return tf.interpolate(inputs, [h, w], mode=mode, align_corners=True)

def f1_score(y_true, y_pred):
    return fbeta_score(y_true, y_pred, 1)

def fbeta_score(y_true, y_pred, beta, eps=1e-8):
    beta2 = beta ** 2

    y_pred = y_pred.float()
    y_true = y_true.float()

    true_positive = (y_pred * y_true).sum(dim=2).sum(dim=2)
    precision = true_positive / (y_pred.sum(dim=2).sum(dim=2) + eps)
    recall = true_positive / (y_true.sum(dim=2).sum(dim=2) + eps)

    return torch.mean(precision * recall / (precision * beta2 + recall + eps) * (1 + beta2))

def f1_score_bal_loss(y_pred, y_true):
    eps = 1e-8

    tp = -(y_true * torch.log(y_pred + eps)).sum(dim=2).sum(dim=2).sum(dim=1)
    fn = -((1 - y_true) * torch.log((1 - y_pred) + eps)).sum(dim=2).sum(dim=2).sum(dim=1)

    denom_tp = y_true.sum(dim=2).sum(dim=2).sum(dim=1) + y_pred.sum(dim=2).sum(dim=2).sum(dim=1) + eps
    denom_fn = (1 - y_true).sum(dim=2).sum(dim=2).sum(dim=1) + (1 - y_pred).sum(dim=2).sum(dim=2).sum(dim=1) + eps

    return ((tp / denom_tp).sum() + (fn / denom_fn).sum()) * y_pred.size(2) * y_pred.size(3) * 0.5


class MultiScaleEPE_FlowNet(nn.Module):
    def __init__(self,
                 args):

        super(MultiScaleEPE_FlowNet, self).__init__()
        self._args = args        
        self._batch_size = args.batch_size
        self._weights = [0.005, 0.01, 0.02, 0.08, 0.32]

    def forward(self, output_dict, target_dict):
        loss_dict = {}

        if self.training:
            outputs = [output_dict[key] for key in ["flow2", "flow3", "flow4", "flow5", "flow6"]]

            # div_flow trick
            target = self._args.model_div_flow * target_dict["target1"]

            total_loss = 0
            for i, output_i in enumerate(outputs):
                target_i = _downsample2d_as(target, output_i)
                epe_i = _elementwise_epe(output_i, target_i)
                total_loss = total_loss + self._weights[i] * epe_i.sum() / self._batch_size
                loss_dict["epe%i" % (i + 2)] = epe_i.mean()
            loss_dict["total_loss"] = total_loss
        else:
            output = output_dict["flow1"]
            target = target_dict["target1"]
            epe = _elementwise_epe(output, target)
            loss_dict["epe"] = epe.mean()

        return loss_dict

class MultiScaleEPE_FlowNet_IRR(nn.Module):
    def __init__(self,
                 args):

        super(MultiScaleEPE_FlowNet_IRR, self).__init__()
        self._args = args        
        self._batch_size = args.batch_size
        self._weights = [0.005, 0.01, 0.02, 0.08, 0.32]
        self._num_iters = args.num_iters

    def forward(self, output_dict, target_dict):
        loss_dict = {}

        if self.training:
            outputs_flo = [output_dict[key] for key in ["flow2", "flow3", "flow4", "flow5", "flow6"]]

            # div_flow trick
            target_f = self._args.model_div_flow * target_dict["target1"]

            total_loss = 0
            for ii, output_ii in enumerate(outputs_flo):
                target_f_ii = _downsample2d_as(target_f, output_ii[0])
                for jj, output_ii_jj in enumerate(output_ii):
                    epe_f_ii = _elementwise_epe(output_ii_jj, target_f_ii)
                    total_loss = total_loss + self._weights[ii] * epe_f_ii.sum()
                    loss_dict["epe%i" % (ii + 2)] = epe_f_ii.mean()
            loss_dict["total_loss"] = total_loss / self._batch_size / self._num_iters

        else:
            output = output_dict["flow1"]
            target_f = target_dict["target1"]
            epe_f = _elementwise_epe(target_f, output)
            loss_dict["epe"] = epe_f.mean()

        return loss_dict

class MultiScaleEPE_FlowNet_IRR_Bi(nn.Module):
    def __init__(self,
                 args):

        super(MultiScaleEPE_FlowNet_IRR_Bi, self).__init__()
        self._args = args        
        self._batch_size = args.batch_size
        self._weights = [0.005, 0.01, 0.02, 0.08, 0.32]
        self._num_iters = args.num_iters

    def forward(self, output_dict, target_dict):

        loss_dict = {}

        if self.training:
            outputs_flo = [output_dict[key] for key in ["flow2", "flow3", "flow4", "flow5", "flow6"]]

            # div_flow trick
            target_f = self._args.model_div_flow * target_dict["target1"]
            target_b = self._args.model_div_flow * target_dict["target2"]

            total_loss = 0
            for ii, output_ii in enumerate(outputs_flo):
                target_f_ii = _downsample2d_as(target_f, output_ii[0][0])
                target_b_ii = _downsample2d_as(target_b, output_ii[0][1])
                for jj, output_ii_jj in enumerate(output_ii):
                    epe_f_ii = _elementwise_epe(output_ii_jj[0], target_f_ii)
                    epe_b_ii = _elementwise_epe(output_ii_jj[1], target_b_ii)
                    total_loss = total_loss + self._weights[ii] * (epe_f_ii.sum() + epe_b_ii.sum())
                    loss_dict["epe%i" % (ii + 2)] = (epe_f_ii.mean() + epe_b_ii.mean()) / 2
            loss_dict["total_loss"] = total_loss / self._batch_size / self._num_iters / 2
        else:
            epe_f = _elementwise_epe(output_dict["flow1"], target_dict["target1"])
            loss_dict["epe"] = epe_f.mean()

        return loss_dict

class MultiScaleEPE_FlowNet_IRR_Occ(nn.Module):
    def __init__(self,
                 args):

        super(MultiScaleEPE_FlowNet_IRR_Occ, self).__init__()
        self._args = args        
        self._batch_size = args.batch_size
        self._weights = [0.005, 0.01, 0.02, 0.08, 0.32]
        self._num_iters = args.num_iters

        self.f1_score_bal_loss = f1_score_bal_loss
        self.occ_activ = nn.Sigmoid()
        
    def forward(self, output_dict, target_dict):
        loss_dict = {}

        if self.training:
            outputs_flo = [output_dict[key] for key in ["flow2", "flow3", "flow4", "flow5", "flow6"]]
            outputs_occ = [output_dict[key] for key in ["occ2", "occ3", "occ4", "occ5", "occ6"]]

            # div_flow trick
            target = self._args.model_div_flow * target_dict["target1"]
            target_occ = target_dict["target_occ1"]

            flow_loss = 0
            occ_loss = 0

            for ii, output_ii in enumerate(outputs_flo):
                target_ii = _downsample2d_as(target, output_ii[0])
                for jj, output_ii_jj in enumerate(output_ii):
                    flow_loss = flow_loss + self._weights[ii] * _elementwise_epe(output_ii_jj, target_ii).sum()

            for ii, output_ii in enumerate(outputs_occ):
                target_occ_f = _downsample2d_as(target_occ, output_ii[0])
                for jj, output_ii_jj in enumerate(output_ii):
                    occ_loss = occ_loss + self._weights[ii] * self.f1_score_bal_loss(self.occ_activ(output_ii_jj), target_occ_f)

            f_loss = flow_loss.detach()
            o_loss = occ_loss.detach()
            if f_loss > o_loss:
                f_l_w = 1
                o_l_w = f_loss / o_loss
            else:
                f_l_w = o_loss / f_loss
                o_l_w = 1

            loss_dict["flow_loss"] = flow_loss / self._batch_size / self._num_iters
            loss_dict["occ_loss"] = occ_loss / self._batch_size / self._num_iters
            loss_dict["total_loss"] = (flow_loss * f_l_w + occ_loss * o_l_w) / self._batch_size / self._num_iters

        else:
            loss_dict["epe"] = _elementwise_epe(output_dict["flow1"], target_dict["target1"]).mean()
            loss_dict["F1"] = f1_score(target_dict["target_occ1"], torch.round(self.occ_activ(output_dict["occ1"])))

        return loss_dict

class MultiScaleEPE_FlowNet_IRR_Bi_Occ(nn.Module):
    def __init__(self,
                 args):

        super(MultiScaleEPE_FlowNet_IRR_Bi_Occ, self).__init__()
        self._args = args        
        self._batch_size = args.batch_size
        self._weights = [0.005, 0.01, 0.02, 0.08, 0.32]
        self._num_iters = args.num_iters

        self.f1_score_bal_loss = f1_score_bal_loss
        self.occ_activ = nn.Sigmoid()

    def forward(self, output_dict, target_dict):
        loss_dict = {}

        if self.training:
            outputs_flo = [output_dict[key] for key in ["flow2", "flow3", "flow4", "flow5", "flow6"]]
            outputs_occ = [output_dict[key] for key in ["occ2", "occ3", "occ4", "occ5", "occ6"]]

            # div_flow trick
            target_f = self._args.model_div_flow * target_dict["target1"]
            target_b = self._args.model_div_flow * target_dict["target2"]
            target_occ_f = target_dict["target_occ1"]
            target_occ_b = target_dict["target_occ2"]

            flow_loss = 0
            occ_loss = 0

            for ii, output_ii in enumerate(outputs_flo):
                target_f_ii = _downsample2d_as(target_f, output_ii[0][0])
                target_b_ii = _downsample2d_as(target_b, output_ii[0][1])
                for jj, output_ii_jj in enumerate(output_ii):
                    epe_f_ii = _elementwise_epe(output_ii_jj[0], target_f_ii)
                    epe_b_ii = _elementwise_epe(output_ii_jj[1], target_b_ii)
                    flow_loss = flow_loss + self._weights[ii] * (epe_f_ii.sum() + epe_b_ii.sum()) * 0.5

            for ii, output_ii in enumerate(outputs_occ):
                target_occ_f = _downsample2d_as(target_occ_f, output_ii[0][0])
                target_occ_b = _downsample2d_as(target_occ_b, output_ii[0][1])
                for jj, output_ii_jj in enumerate(output_ii):
                    output_occ_f = self.occ_activ(output_ii_jj[0])
                    output_occ_b = self.occ_activ(output_ii_jj[1])
                    bce_f_ii = self.f1_score_bal_loss(output_occ_f, target_occ_f)
                    bce_b_ii = self.f1_score_bal_loss(output_occ_b, target_occ_b)
                    occ_loss = occ_loss + self._weights[ii] * (bce_f_ii + bce_b_ii) * 0.5

            f_loss = flow_loss.detach()
            o_loss = occ_loss.detach()
            if f_loss > o_loss:
                f_l_w = 1
                o_l_w = f_loss / o_loss
            else:
                f_l_w = o_loss / f_loss
                o_l_w = 1

            loss_dict["flow_loss"] = flow_loss / self._batch_size / self._num_iters
            loss_dict["occ_loss"] = occ_loss / self._batch_size / self._num_iters
            loss_dict["total_loss"] = (flow_loss * f_l_w + occ_loss * o_l_w) / self._batch_size / self._num_iters
        else:
            loss_dict["epe"] = _elementwise_epe(output_dict["flow1"], target_dict["target1"]).mean()
            loss_dict["F1"] = f1_score(target_dict["target_occ1"], torch.round(self.occ_activ(output_dict["occ1"])))

        return loss_dict

class MultiScaleEPE_FlowNet_IRR_Bi_Occ_upsample(nn.Module):
    def __init__(self,
                 args):
        super(MultiScaleEPE_FlowNet_IRR_Bi_Occ_upsample, self).__init__()
        self._args = args
        self._batch_size = args.batch_size        
        self._weights = [0.0003125, 0.00125, 0.005, 0.01, 0.02, 0.08, 0.32]
        
        self.occ_activ = nn.Sigmoid()
        self.f1_score_bal_loss = f1_score_bal_loss

    def forward(self, output_dict, target_dict):
        loss_dict = {}

        if self.training:
            outputs_flo = [output_dict[key] for key in ["flow", "flow1", "flow2", "flow3", "flow4", "flow5", "flow6"]]
            outputs_occ = [output_dict[key] for key in ["occ", "occ1", "occ2", "occ3", "occ4", "occ5", "occ6"]]

            # div_flow trick
            target_f = self._args.model_div_flow * target_dict["target1"]
            target_b = self._args.model_div_flow * target_dict["target2"]
            target_occ_f = target_dict["target_occ1"]
            target_occ_b = target_dict["target_occ2"]

            num_iters = len(outputs_flo[0])
            flow_loss = 0
            occ_loss = 0

            for ii, output_ii in enumerate(outputs_flo):
                target_f_ii = _downsample2d_as(target_f, output_ii[0][0])
                target_b_ii = _downsample2d_as(target_b, output_ii[0][1])
                for jj, output_ii_jj in enumerate(output_ii):
                    epe_f_ii = _elementwise_epe(output_ii_jj[0], target_f_ii)
                    epe_b_ii = _elementwise_epe(output_ii_jj[1], target_b_ii)
                    flow_loss = flow_loss + self._weights[ii] * (epe_f_ii.sum() + epe_b_ii.sum()) * 0.5

            for ii, output_ii in enumerate(outputs_occ):
                target_occ_f = _downsample2d_as(target_occ_f, output_ii[0][0])
                target_occ_b = _downsample2d_as(target_occ_b, output_ii[0][1])
                for jj, output_ii_jj in enumerate(output_ii):
                    output_occ_f = self.occ_activ(output_ii_jj[0])
                    output_occ_b = self.occ_activ(output_ii_jj[1])
                    bce_f_ii = self.f1_score_bal_loss(output_occ_f, target_occ_f)
                    bce_b_ii = self.f1_score_bal_loss(output_occ_b, target_occ_b)
                    occ_loss = occ_loss + self._weights[ii] * (bce_f_ii + bce_b_ii) * 0.5

            f_loss = flow_loss.detach()
            o_loss = occ_loss.detach()
            if f_loss > o_loss:
                f_l_w = 1
                o_l_w = f_loss / o_loss
            else:
                f_l_w = o_loss / f_loss
                o_l_w = 1

            loss_dict["flow_loss"] = flow_loss / self._batch_size / num_iters
            loss_dict["occ_loss"] = occ_loss / self._batch_size / num_iters
            loss_dict["total_loss"] = (flow_loss * f_l_w + occ_loss * o_l_w) / self._batch_size / num_iters
        else:
            loss_dict["epe"] = _elementwise_epe(output_dict["flow"], target_dict["target1"]).mean()
            loss_dict["F1"] = f1_score(target_dict["target_occ1"], torch.round(self.occ_activ(output_dict["occ"])))

        return loss_dict


class MultiScaleEPE_PWC(nn.Module):
    def __init__(self,
                 args):

        super(MultiScaleEPE_PWC, self).__init__()
        self._args = args
        self._batch_size = args.batch_size
        self._weights = [0.32, 0.08, 0.02, 0.01, 0.005]

    def forward(self, output_dict, target_dict):
        loss_dict = {}

        if self.training:
            outputs = output_dict['flow']

            # div_flow trick
            target = self._args.model_div_flow * target_dict["target1"]

            total_loss = 0
            for ii, output_ii in enumerate(outputs):
                loss_ii = _elementwise_epe(output_ii, _downsample2d_as(target, output_ii)).sum()
                total_loss = total_loss + self._weights[ii] * loss_ii
            loss_dict["total_loss"] = total_loss / self._batch_size

        else:
            epe = _elementwise_epe(output_dict["flow"], target_dict["target1"])
            loss_dict["epe"] = epe.mean()

        return loss_dict

class MultiScaleEPE_PWC_Bi(nn.Module):
    def __init__(self,
                 args):

        super(MultiScaleEPE_PWC_Bi, self).__init__()
        self._args = args
        self._batch_size = args.batch_size
        self._weights = [0.32, 0.08, 0.02, 0.01, 0.005]

    def forward(self, output_dict, target_dict):
        loss_dict = {}

        if self.training:
            outputs = output_dict['flow']

            # div_flow trick
            target_f = self._args.model_div_flow * target_dict["target1"]
            target_b = self._args.model_div_flow * target_dict["target2"]

            total_loss = 0
            for i, output_i in enumerate(outputs):
                epe_i_f = _elementwise_epe(output_i[0], _downsample2d_as(target_f, output_i[0]))
                epe_i_b = _elementwise_epe(output_i[1], _downsample2d_as(target_b, output_i[1]))
                total_loss = total_loss + self._weights[i] * (epe_i_f.sum() + epe_i_b.sum())
            loss_dict["total_loss"] = total_loss / (2 * self._batch_size)
        else:
            epe = _elementwise_epe(output_dict["flow"], target_dict["target1"])
            loss_dict["epe"] = epe.mean()

        return loss_dict

class MultiScaleEPE_PWC_Occ(nn.Module):
    def __init__(self,
                 args):

        super(MultiScaleEPE_PWC_Occ, self).__init__()
        self._args = args
        self._batch_size = args.batch_size
        self._weights = [0.32, 0.08, 0.02, 0.01, 0.005]

        self.occ_activ = nn.Sigmoid()
        self.f1_score_bal_loss = f1_score_bal_loss

    def forward(self, output_dict, target_dict):
        loss_dict = {}

        if self.training:
            output_flo = output_dict['flow']
            output_occ = output_dict['occ']

            # div_flow trick
            target_flo = self._args.model_div_flow * target_dict["target1"]
            target_occ = target_dict["target_occ1"]

            flow_loss = 0
            occ_loss = 0

            for i, output_i in enumerate(output_flo):
                flow_loss = flow_loss + self._weights[i] * _elementwise_epe(output_i, _downsample2d_as(target_flo, output_i)).sum()

            for i, output_i in enumerate(output_occ):
                output_occ = self.occ_activ(output_i)
                occ_loss = occ_loss + self._weights[i] * self.f1_score_bal_loss(output_occ, _downsample2d_as(target_occ, output_occ))

            f_loss = flow_loss.detach()
            o_loss = occ_loss.detach()
            if f_loss > o_loss:
                f_l_w = 1
                o_l_w = f_loss / o_loss
            else:
                f_l_w = o_loss / f_loss
                o_l_w = 1

            loss_dict["flow_loss"] = flow_loss / self._batch_size
            loss_dict["occ_loss"] = occ_loss / self._batch_size
            loss_dict["total_loss"] = (flow_loss * f_l_w + occ_loss * o_l_w) / self._batch_size

        else:
            loss_dict["epe"] = _elementwise_epe(output_dict["flow"], target_dict["target1"]).mean()
            loss_dict["F1"] = f1_score(target_dict["target_occ1"], torch.round(self.occ_activ(output_dict["occ"])))

        return loss_dict

class MultiScaleEPE_PWC_Bi_Occ(nn.Module):
    def __init__(self,
                 args):

        super(MultiScaleEPE_PWC_Bi_Occ, self).__init__()
        self._args = args
        self._batch_size = args.batch_size        
        self._weights = [0.32, 0.08, 0.02, 0.01, 0.005]

        self.occ_activ = nn.Sigmoid()
        self.f1_score_bal_loss = f1_score_bal_loss

    def forward(self, output_dict, target_dict):
        loss_dict = {}

        if self.training:
            output_flo = output_dict['flow']
            output_occ = output_dict['occ']

            # div_flow trick
            target_flo_f = self._args.model_div_flow * target_dict["target1"]
            target_flo_b = self._args.model_div_flow * target_dict["target2"]
            target_occ_f = target_dict["target_occ1"]
            target_occ_b = target_dict["target_occ2"]

            # bchw
            flow_loss = 0
            occ_loss = 0

            for i, output_i in enumerate(output_flo):
                flow_loss = flow_loss + self._weights[i] * _elementwise_epe(output_i[0], _downsample2d_as(target_flo_f, output_i[0])).sum()
                flow_loss = flow_loss + self._weights[i] * _elementwise_epe(output_i[1], _downsample2d_as(target_flo_b, output_i[1])).sum()

            for i, output_i in enumerate(output_occ):
                output_occ_f = self.occ_activ(output_i[0])
                output_occ_b = self.occ_activ(output_i[1])
                occ_loss = occ_loss + self._weights[i] * self.f1_score_bal_loss(output_occ_f, _downsample2d_as(target_occ_f, output_occ_f))
                occ_loss = occ_loss + self._weights[i] * self.f1_score_bal_loss(output_occ_b, _downsample2d_as(target_occ_b, output_occ_b))

            f_loss = flow_loss.detach()
            o_loss = occ_loss.detach()
            if f_loss > o_loss:
                f_l_w = 1
                o_l_w = f_loss / o_loss
            else:
                f_l_w = o_loss / f_loss
                o_l_w = 1

            loss_dict["flow_loss"] = flow_loss / (2 * self._batch_size)
            loss_dict["occ_loss"] = occ_loss / (2 * self._batch_size) 
            loss_dict["total_loss"] = (flow_loss * f_l_w + occ_loss * o_l_w) / (2 * self._batch_size)

        else:
            loss_dict["epe"] = _elementwise_epe(output_dict["flow"], target_dict["target1"]).mean()
            loss_dict["F1"] = f1_score(target_dict["target_occ1"], torch.round(self.occ_activ(output_dict["occ"])))

        return loss_dict

class MultiScaleEPE_PWC_Bi_Occ_upsample(nn.Module):
    def __init__(self,
                 args):

        super(MultiScaleEPE_PWC_Bi_Occ_upsample, self).__init__()
        self._args = args
        self._batch_size = args.batch_size
        self._weights = [0.32, 0.08, 0.02, 0.01, 0.005, 0.00125, 0.0003125]

        self.occ_activ = nn.Sigmoid()
        self.f1_score_bal_loss = f1_score_bal_loss

    def forward(self, output_dict, target_dict):
        loss_dict = {}

        if self.training:
            output_flo = output_dict['flow']
            output_occ = output_dict['occ']

            # div_flow trick
            target_flo_f = self._args.model_div_flow * target_dict["target1"]
            target_flo_b = self._args.model_div_flow * target_dict["target2"]
            target_occ_f = target_dict["target_occ1"]
            target_occ_b = target_dict["target_occ2"]

            # bchw
            flow_loss = 0
            occ_loss = 0

            for ii, output_ii in enumerate(output_flo):
                loss_ii = 0
                for jj in range(0, len(output_ii) // 2):
                    loss_ii = loss_ii + _elementwise_epe(output_ii[2 * jj], _downsample2d_as(target_flo_f, output_ii[2 * jj])).sum()
                    loss_ii = loss_ii + _elementwise_epe(output_ii[2 * jj + 1], _downsample2d_as(target_flo_b, output_ii[2 * jj + 1])).sum()
                flow_loss = flow_loss + self._weights[ii] * loss_ii / len(output_ii)

            for ii, output_ii in enumerate(output_occ):
                loss_ii = 0
                for jj in range(0, len(output_ii) // 2):
                    output_occ_f = self.occ_activ(output_ii[2 * jj])
                    output_occ_b = self.occ_activ(output_ii[2 * jj + 1])
                    loss_ii = loss_ii + self.f1_score_bal_loss(output_occ_f, _downsample2d_as(target_occ_f, output_occ_f))
                    loss_ii = loss_ii + self.f1_score_bal_loss(output_occ_b, _downsample2d_as(target_occ_b, output_occ_b))
                occ_loss = occ_loss + self._weights[ii] * loss_ii / len(output_ii)

            f_loss = flow_loss.detach()
            o_loss = occ_loss.detach()
            if f_loss > o_loss:
                f_l_w = 1
                o_l_w = f_loss / o_loss
            else:
                f_l_w = o_loss / f_loss
                o_l_w = 1

            loss_dict["flow_loss"] = flow_loss / self._batch_size
            loss_dict["occ_loss"] = occ_loss / self._batch_size
            loss_dict["total_loss"] = (flow_loss * f_l_w + occ_loss * o_l_w) / self._batch_size

        else:
            loss_dict["epe"] = _elementwise_epe(output_dict["flow"], target_dict["target1"]).mean()
            loss_dict["F1"] = f1_score(target_dict["target_occ1"], torch.round(self.occ_activ(output_dict["occ"])))

        return loss_dict

class MultiScaleEPE_PWC_Bi_Occ_upsample_Sintel(nn.Module):
    def __init__(self,
                 args):

        super(MultiScaleEPE_PWC_Bi_Occ_upsample_Sintel, self).__init__()
        self._args = args
        self._batch_size = args.batch_size        
        self._weights = [0.32, 0.08, 0.02, 0.01, 0.005, 0.00125, 0.0003125]

        self.occ_activ = nn.Sigmoid()
        self.occ_loss_bce = nn.BCELoss(reduction='sum')

    def forward(self, output_dict, target_dict):
        loss_dict = {}

        if self.training:
            output_flo = output_dict['flow']
            output_occ = output_dict['occ']

            # div_flow trick
            target_flo_f = self._args.model_div_flow * target_dict["target1"]
            target_occ_f = target_dict["target_occ1"]

            # bchw
            flow_loss = 0
            occ_loss = 0

            for ii, output_ii in enumerate(output_flo):
                loss_ii = 0
                for jj in range(0, len(output_ii) // 2):
                    loss_ii = loss_ii + _elementwise_robust_epe_char(output_ii[2 * jj], _downsample2d_as(target_flo_f, output_ii[2 * jj])).sum()
                    output_ii[2 * jj + 1] = output_ii[2 * jj + 1].detach()
                flow_loss = flow_loss + self._weights[ii] * loss_ii / len(output_ii) * 2

            for ii, output_ii in enumerate(output_occ):
                loss_ii = 0
                for jj in range(0, len(output_ii) // 2):
                    output_occ_f = self.occ_activ(output_ii[2 * jj])
                    output_ii[2 * jj + 1] = output_ii[2 * jj + 1].detach()
                    loss_ii = loss_ii + self.occ_loss_bce(output_occ_f, _downsample2d_as(target_occ_f, output_occ_f))
                occ_loss = occ_loss + self._weights[ii] * loss_ii / len(output_ii) * 2

            f_loss = flow_loss.detach()
            o_loss = occ_loss.detach()
            if f_loss > o_loss:
                f_l_w = 1
                o_l_w = f_loss / o_loss
            else:
                f_l_w = o_loss / f_loss
                o_l_w = 1

            loss_dict["flow_loss"] = flow_loss / self._batch_size
            loss_dict["occ_loss"] = occ_loss / self._batch_size
            loss_dict["total_loss"] = (flow_loss * f_l_w + occ_loss * o_l_w) / self._batch_size

        else:
            loss_dict["epe"] = _elementwise_epe(output_dict["flow"], target_dict["target1"]).mean()
            loss_dict["F1"] = f1_score(target_dict["target_occ1"], torch.round(self.occ_activ(output_dict["occ"])))

        return loss_dict

class MultiScaleEPE_PWC_Bi_Occ_upsample_KITTI(nn.Module):
    def __init__(self,
                 args):

        super(MultiScaleEPE_PWC_Bi_Occ_upsample_KITTI, self).__init__()
        self._args = args
        self._batch_size = args.batch_size
        self._weights = [0.001, 0.001, 0.001, 0.002, 0.004, 0.004, 0.004]

        self.occ_activ = nn.Sigmoid()
        
    def forward(self, output_dict, target_dict):
        loss_dict = {}

        valid_mask = target_dict["input_valid"]
        b, _, h, w = target_dict["target1"].size()

        if self.training:
            output_flo = output_dict['flow']
            output_occ = output_dict['occ']

            # div_flow trick
            target_flo_f = self._args.model_div_flow * target_dict["target1"]

            # bchw
            flow_loss = 0

            for ii, output_ii in enumerate(output_flo):
                loss_ii = 0
                for jj in range(0, len(output_ii) // 2):
                    valid_epe = _elementwise_robust_epe_char(_upsample2d_as(output_ii[2 * jj], target_flo_f), target_flo_f) * valid_mask

                    for bb in range(0, b):
                        valid_epe[bb, ...][valid_mask[bb, ...] == 0] = valid_epe[bb, ...][valid_mask[bb, ...] == 0].detach()
                        norm_const = h * w / (valid_mask[bb, ...].sum())
                        loss_ii = loss_ii + valid_epe[bb, ...][valid_mask[bb, ...] != 0].sum() * norm_const

                    output_ii[2 * jj + 1] = output_ii[2 * jj + 1].detach()
                flow_loss = flow_loss + self._weights[ii] * loss_ii / len(output_ii) * 2

            for ii, output_ii in enumerate(output_occ):
                for jj in range(0, len(output_ii) // 2):
                    output_ii[2 * jj] = output_ii[2 * jj].detach()
                    output_ii[2 * jj + 1] = output_ii[2 * jj + 1].detach()

            loss_dict["flow_loss"] = flow_loss / self._batch_size
            loss_dict["total_loss"] = flow_loss / self._batch_size

        else:
            flow_gt_mag = torch.norm(target_dict["target1"], p=2, dim=1, keepdim=True) + 1e-8
            flow_epe = _elementwise_epe(output_dict["flow"], target_dict["target1"]) * valid_mask

            epe_per_image = (flow_epe.view(b, -1).sum(1)) / (valid_mask.view(b, -1).sum(1))
            loss_dict["epe"] = epe_per_image.mean()

            outlier_epe = (flow_epe > 3).float() * ((flow_epe / flow_gt_mag) > 0.05).float() * valid_mask
            outlier_per_image = (outlier_epe.view(b, -1).sum(1)) / (valid_mask.view(b, -1).sum(1))
            loss_dict["outlier"] = outlier_per_image.mean()

        return loss_dict


================================================
FILE: main.py
================================================
from __future__ import absolute_import, division, print_function

import os
import subprocess
import commandline
import configuration as config
import runtime
import logger
import logging
import tools
import torch


def main():

    # Change working directory    
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # Parse commandline arguments    
    args = commandline.setup_logging_and_parse_arguments(blocktitle="Commandline Arguments")

    # Set random seed, possibly on Cuda    
    config.configure_random_seed(args)    

    # DataLoader
    train_loader, validation_loader, inference_loader = config.configure_data_loaders(args)
    success = any(loader is not None for loader in [train_loader, validation_loader, inference_loader])
    if not success:
        logging.info("No dataset could be loaded successfully. Please check dataset paths!")
        quit()

    # Configure data augmentation
    training_augmentation, validation_augmentation = config.configure_runtime_augmentations(args)

    # Configure model and loss    
    model_and_loss = config.configure_model_and_loss(args)

    # Resume from checkpoint if available    
    checkpoint_saver, checkpoint_stats = config.configure_checkpoint_saver(args, model_and_loss)

    # Checkpoint and save directory    
    with logger.LoggingBlock("Save Directory", emph=True):
        logging.info("Save directory: %s" % args.save)
        if not os.path.exists(args.save):
            os.makedirs(args.save)

    # # Multi-GPU automation    
    # with logger.LoggingBlock("Multi GPU", emph=True):
    #     if torch.cuda.device_count() > 1:
    #         logging.info("Let's use %d GPUs!" % torch.cuda.device_count())
    #         model_and_loss._model = torch.nn.DataParallel(model_and_loss._model)
    #     else:
    #         logging.info("Let's use %d GPU!" % torch.cuda.device_count())

    
    # Configure optimizer    
    optimizer = config.configure_optimizer(args, model_and_loss)
    
    # Configure learning rate    
    lr_scheduler = config.configure_lr_scheduler(args, optimizer)

    # If this is just an evaluation: overwrite savers and epochs
    if args.evaluation:
        args.start_epoch = 1
        args.total_epochs = 1
        train_loader = None
        checkpoint_saver = None
        optimizer = None
        lr_scheduler = None

    # Cuda optimization    
    if args.cuda:
        torch.backends.cudnn.benchmark = True

    # Kickoff training, validation and/or testing    
    return runtime.exec_runtime(
        args,
        checkpoint_saver=checkpoint_saver,
        model_and_loss=model_and_loss,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        train_loader=train_loader,
        validation_loader=validation_loader,
        inference_loader=inference_loader,
        training_augmentation=training_augmentation,
        validation_augmentation=validation_augmentation)

if __name__ == "__main__":
    main()


================================================
FILE: models/IRR_FlowNet.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn
from .flownet_modules import conv, deconv
from .flownet_modules import concatenate_as, upsample2d_as
from .flownet_modules import initialize_msra
from .flownet_modules import WarpingLayer
from .irr_modules import OccUpsampleNetwork, RefineFlow, RefineOcc

class FlowNetS(nn.Module):
    def __init__(self, args):
        super(FlowNetS, self).__init__()

        def make_conv(in_planes, out_planes, kernel_size, stride):
            pad = kernel_size // 2
            return conv(in_planes, out_planes, kernel_size=kernel_size,
                        stride=stride, pad=pad, nonlinear=True, bias=True)

        self._conv3_1 = make_conv( 256,  256, kernel_size=3, stride=1)
        self._conv4   = make_conv( 256,  512, kernel_size=3, stride=2)
        self._conv4_1 = make_conv( 512,  512, kernel_size=3, stride=1)
        self._conv5   = make_conv( 512,  512, kernel_size=3, stride=2)
        self._conv5_1 = make_conv( 512,  512, kernel_size=3, stride=1)
        self._conv6   = make_conv( 512, 1024, kernel_size=3, stride=2)
        self._conv6_1 = make_conv(1024, 1024, kernel_size=3, stride=1)

        def make_deconv(in_planes, out_planes):
            return deconv(in_planes, out_planes, kernel_size=4, stride=2, pad=1,
                          nonlinear=True, bias=False)

        self._deconv5 = make_deconv(1024    , 512)
        self._deconv4 = make_deconv(1024 + 2, 256)
        self._deconv3 = make_deconv( 768 + 2, 128)
        self._deconv2 = make_deconv( 384 + 2,  64)

        self._deconv_occ5 = make_deconv(1024    , 512)
        self._deconv_occ4 = make_deconv(1024 + 1, 256)
        self._deconv_occ3 = make_deconv( 768 + 1, 128)
        self._deconv_occ2 = make_deconv( 384 + 1,  64)

        def make_predict(in_planes, out_planes):
            return conv(in_planes, out_planes, kernel_size=3, stride=1, pad=1,
                        nonlinear=False, bias=True)

        self._predict_flow6 = make_predict(1024    , 2)
        self._predict_flow5 = make_predict(1024 + 2, 2)
        self._predict_flow4 = make_predict( 768 + 2, 2)
        self._predict_flow3 = make_predict( 384 + 2, 2)
        self._predict_flow2 = make_predict( 128 + 2, 2)

        self._predict_occ6 = make_predict(1024    , 1)
        self._predict_occ5 = make_predict(1024 + 1, 1)
        self._predict_occ4 = make_predict( 768 + 1, 1)
        self._predict_occ3 = make_predict( 384 + 1, 1)
        self._predict_occ2 = make_predict( 128 + 1, 1)

        def make_upsample(in_planes, out_planes):
            return deconv(in_planes, out_planes, kernel_size=4, stride=2, pad=1,
                          nonlinear=False, bias=False)

        self._upsample_flow6_to_5 = make_upsample(2, 2)
        self._upsample_flow5_to_4 = make_upsample(2, 2)
        self._upsample_flow4_to_3 = make_upsample(2, 2)
        self._upsample_flow3_to_2 = make_upsample(2, 2)

        self._upsample_occ6_to_5 = make_upsample(1, 1)
        self._upsample_occ5_to_4 = make_upsample(1, 1)
        self._upsample_occ4_to_3 = make_upsample(1, 1)
        self._upsample_occ3_to_2 = make_upsample(1, 1)

    def forward(self, conv2_im1, conv3_im1, conv3_im2):

        conv_concat3 = torch.cat((conv3_im1, conv3_im2), dim=1)

        conv3_1 = self._conv3_1(conv_concat3)
        conv4_1 = self._conv4_1(self._conv4(conv3_1))
        conv5_1 = self._conv5_1(self._conv5(conv4_1))
        conv6_1 = self._conv6_1(self._conv6(conv5_1))

        # Flow Decoder
        predict_flow6        = self._predict_flow6(conv6_1)

        upsampled_flow6_to_5 = self._upsample_flow6_to_5(predict_flow6)
        deconv5              = self._deconv5(conv6_1)
        concat5              = concatenate_as((conv5_1, deconv5, upsampled_flow6_to_5), conv5_1, dim=1)
        predict_flow5        = self._predict_flow5(concat5)

        upsampled_flow5_to_4 = self._upsample_flow5_to_4(predict_flow5)
        deconv4              = self._deconv4(concat5)
        concat4              = concatenate_as((conv4_1, deconv4, upsampled_flow5_to_4), conv4_1, dim=1)
        predict_flow4        = self._predict_flow4(concat4)

        upsampled_flow4_to_3 = self._upsample_flow4_to_3(predict_flow4)
        deconv3              = self._deconv3(concat4)
        concat3              = concatenate_as((conv3_1, deconv3, upsampled_flow4_to_3), conv3_1, dim=1)
        predict_flow3        = self._predict_flow3(concat3)

        upsampled_flow3_to_2 = self._upsample_flow3_to_2(predict_flow3)
        deconv2              = self._deconv2(concat3)
        concat2              = concatenate_as((conv2_im1, deconv2, upsampled_flow3_to_2), conv2_im1, dim=1)
        predict_flow2        = self._predict_flow2(concat2)

        # Occ Decoder
        predict_occ6 = self._predict_occ6(conv6_1)

        upsampled_occ6_to_5 = self._upsample_occ6_to_5(predict_occ6)
        deconv_occ5         = self._deconv_occ5(conv6_1)
        concat_occ5         = concatenate_as((conv5_1, deconv_occ5, upsampled_occ6_to_5), conv5_1, dim=1)
        predict_occ5        = self._predict_occ5(concat_occ5)

        upsampled_occ5_to_4 = self._upsample_occ5_to_4(predict_occ5)
        deconv_occ4         = self._deconv_occ4(concat_occ5)
        concat_occ4         = concatenate_as((conv4_1, deconv_occ4, upsampled_occ5_to_4), conv4_1, dim=1)
        predict_occ4        = self._predict_occ4(concat_occ4)

        upsampled_occ4_to_3 = self._upsample_occ4_to_3(predict_occ4)
        deconv_occ3         = self._deconv_occ3(concat_occ4)
        concat_occ3         = concatenate_as((conv3_1, deconv_occ3, upsampled_occ4_to_3), conv3_1, dim=1)
        predict_occ3        = self._predict_occ3(concat_occ3)

        upsampled_occ3_to_2 = self._upsample_occ3_to_2(predict_occ3)
        deconv_occ2         = self._deconv_occ2(concat_occ3)
        concat_occ2         = concatenate_as((conv2_im1, deconv_occ2, upsampled_occ3_to_2), conv2_im1, dim=1)
        predict_occ2        = self._predict_occ2(concat_occ2)

        return predict_flow2, predict_flow3, predict_flow4, predict_flow5, predict_flow6, predict_occ2, predict_occ3, predict_occ4, predict_occ5, predict_occ6


class FlowNet1S(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(FlowNet1S, self).__init__()
        self._flownets = FlowNetS(args)
        self._warping_layer = WarpingLayer()
        self._div_flow = div_flow
        self._num_iters = args.num_iters

        def make_conv(in_planes, out_planes, kernel_size, stride):
            pad = kernel_size // 2
            return conv(in_planes, out_planes, kernel_size=kernel_size,
                        stride=stride, pad=pad, nonlinear=True, bias=True)

        self._conv1   = make_conv(   3,   32, kernel_size=7, stride=2)
        self._conv2   = make_conv(  32,   64, kernel_size=5, stride=2)
        self._conv3   = make_conv(  64,  128, kernel_size=5, stride=2)

        self.occ_shuffle_upsample = OccUpsampleNetwork(11, 1)
        self.refine_flow = RefineFlow(2 + 1 + 64)
        self.refine_occ = RefineOcc(1 + 64 + 64)

        initialize_msra(self.modules())

    def forward(self, input_dict):
        im1 = input_dict['input1']
        im2 = input_dict['input2']

        conv1_im1 = self._conv1(im1)
        conv2_im1 = self._conv2(conv1_im1)
        conv3_im1 = self._conv3(conv2_im1)
        conv3_im1_wp = conv3_im1

        conv1_im2 = self._conv1(im2)
        conv2_im2 = self._conv2(conv1_im2)
        conv3_im2 = self._conv3(conv2_im2)
        conv3_im2_wp = conv3_im2

        out_dict = {}
        out_dict['flow'] = []
        out_dict['flow1'] = []
        out_dict['flow2'] = []
        out_dict['flow3'] = []
        out_dict['flow4'] = []
        out_dict['flow5'] = []
        out_dict['flow6'] = []
        out_dict['occ'] = []
        out_dict['occ1'] = []
        out_dict['occ2'] = []
        out_dict['occ3'] = []
        out_dict['occ4'] = []
        out_dict['occ5'] = []
        out_dict['occ6'] = []

        # warping:
        _, _, height_im, width_im = im1.size()

        # for iterative
        for ii in range(0, self._num_iters):
            flo2_f, flo3_f, flo4_f, flo5_f, flo6_f, occ2_f, occ3_f, occ4_f, occ5_f, occ6_f = self._flownets(conv2_im1,
                                                                                                            conv3_im1,
                                                                                                            conv3_im2_wp)
            flo2_b, flo3_b, flo4_b, flo5_b, flo6_b, occ2_b, occ3_b, occ4_b, occ5_b, occ6_b = self._flownets(conv2_im2,
                                                                                                            conv3_im2,
                                                                                                            conv3_im1_wp)

            if ii == 0:
                out_dict['flow2'].append([flo2_f, flo2_b])
                out_dict['flow3'].append([flo3_f, flo3_b])
                out_dict['flow4'].append([flo4_f, flo4_b])
                out_dict['flow5'].append([flo5_f, flo5_b])
                out_dict['flow6'].append([flo6_f, flo6_b])
                out_dict['occ2'].append([occ2_f, occ2_b])
                out_dict['occ3'].append([occ3_f, occ3_b])
                out_dict['occ4'].append([occ4_f, occ4_b])
                out_dict['occ5'].append([occ5_f, occ5_b])
                out_dict['occ6'].append([occ6_f, occ6_b])
                flo2_f_out = flo2_f
                flo2_b_out = flo2_b
                occ2_f_out = occ2_f
                occ2_b_out = occ2_b
            else:
                out_dict['flow2'].append([flo2_f + out_dict['flow2'][ii - 1][0], flo2_b + out_dict['flow2'][ii - 1][1]])
                out_dict['flow3'].append([flo3_f + out_dict['flow3'][ii - 1][0], flo3_b + out_dict['flow3'][ii - 1][1]])
                out_dict['flow4'].append([flo4_f + out_dict['flow4'][ii - 1][0], flo4_b + out_dict['flow4'][ii - 1][1]])
                out_dict['flow5'].append([flo5_f + out_dict['flow5'][ii - 1][0], flo5_b + out_dict['flow5'][ii - 1][1]])
                out_dict['flow6'].append([flo6_f + out_dict['flow6'][ii - 1][0], flo6_b + out_dict['flow6'][ii - 1][1]])
                out_dict['occ2'].append([occ2_f + out_dict['occ2'][ii - 1][0], occ2_b + out_dict['occ2'][ii - 1][1]])
                out_dict['occ3'].append([occ3_f + out_dict['occ3'][ii - 1][0], occ3_b + out_dict['occ3'][ii - 1][1]])
                out_dict['occ4'].append([occ4_f + out_dict['occ4'][ii - 1][0], occ4_b + out_dict['occ4'][ii - 1][1]])
                out_dict['occ5'].append([occ5_f + out_dict['occ5'][ii - 1][0], occ5_b + out_dict['occ5'][ii - 1][1]])
                out_dict['occ6'].append([occ6_f + out_dict['occ6'][ii - 1][0], occ6_b + out_dict['occ6'][ii - 1][1]])
                flo2_f_out = flo2_f + upsample2d_as(out_dict['flow1'][ii - 1][0], flo2_f, mode="bilinear")
                flo2_b_out = flo2_b + upsample2d_as(out_dict['flow1'][ii - 1][1], flo2_b, mode="bilinear")
                occ2_f_out = occ2_f + upsample2d_as(out_dict['occ1'][ii - 1][0], occ2_f, mode="bilinear")
                occ2_b_out = occ2_b + upsample2d_as(out_dict['occ1'][ii - 1][1], occ2_b, mode="bilinear")

            ## refine layer
            flo2_f_out = upsample2d_as(flo2_f_out, conv2_im1, mode="bilinear")  
            flo2_b_out = upsample2d_as(flo2_b_out, conv2_im2, mode="bilinear") 
            occ2_f_out = upsample2d_as(occ2_f_out, conv2_im1, mode="bilinear")
            occ2_b_out = upsample2d_as(occ2_b_out, conv2_im2, mode="bilinear")

            img1_resize = upsample2d_as(im1, flo2_f_out, mode="bilinear")
            img2_resize = upsample2d_as(im2, flo2_b_out, mode="bilinear")
            img2_warp = self._warping_layer(img2_resize, flo2_f_out, height_im, width_im, self._div_flow)
            img1_warp = self._warping_layer(img1_resize, flo2_b_out, height_im, width_im, self._div_flow)

            # flow refine
            flow_f = self.refine_flow(flo2_f_out.detach(), img1_resize - img2_warp, conv2_im1)
            flow_b = self.refine_flow(flo2_b_out.detach(), img2_resize - img1_warp, conv2_im2)

            # occ refine
            conv2_im2_warp = self._warping_layer(conv2_im2, flow_f, height_im, width_im, self._div_flow)
            conv2_im1_warp = self._warping_layer(conv2_im1, flow_b, height_im, width_im, self._div_flow)

            occ_f = self.refine_occ(occ2_f_out.detach(), conv2_im1, conv2_im1 - conv2_im2_warp)
            occ_b = self.refine_occ(occ2_b_out.detach(), conv2_im2, conv2_im2 - conv2_im1_warp)
            out_dict['flow1'].append([flow_f, flow_b])
            out_dict['occ1'].append([occ_f, occ_b])

            ## upsample layer
            flow_f = upsample2d_as(flow_f, im1, mode="bilinear")
            flow_b = upsample2d_as(flow_b, im2, mode="bilinear")
            out_dict['flow'].append([flow_f, flow_b])

            im2_warp = self._warping_layer(im2, flow_f, height_im, width_im, self._div_flow)
            im1_warp = self._warping_layer(im1, flow_b, height_im, width_im, self._div_flow)
            flow_b_warp = self._warping_layer(flow_b, flow_f, height_im, width_im, self._div_flow)
            flow_f_warp = self._warping_layer(flow_f, flow_b, height_im, width_im, self._div_flow)

            occ_f = self.occ_shuffle_upsample(occ_f, torch.cat([im1, im2_warp, flow_f, flow_b_warp], dim=1))
            occ_b = self.occ_shuffle_upsample(occ_b, torch.cat([im2, im1_warp, flow_b, flow_f_warp], dim=1))

            out_dict['occ'].append([occ_f, occ_b])

            if ii < (self._num_iters - 1):
                flow_f_resized = upsample2d_as(flow_f, conv3_im2, mode="bilinear")
                flow_b_resized = upsample2d_as(flow_b, conv3_im1, mode="bilinear")
                conv3_im2_wp = self._warping_layer(conv3_im2, flow_f_resized, height_im, width_im, self._div_flow)
                conv3_im1_wp = self._warping_layer(conv3_im1, flow_b_resized, height_im, width_im, self._div_flow)
                
        if self.training:
            return out_dict
        else:
            out_dict_eval = {}
            out_dict_eval['flow'] = upsample2d_as(out_dict['flow'][self._num_iters - 1][0], im1, mode="bilinear") / self._div_flow
            out_dict_eval['occ'] = upsample2d_as(out_dict['occ'][self._num_iters - 1][0], im1, mode="bilinear")
            return out_dict_eval


================================================
FILE: models/IRR_PWC.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn

from .pwc_modules import conv, upsample2d_as, rescale_flow, initialize_msra, compute_cost_volume
from .pwc_modules import WarpingLayer, FeatureExtractor, ContextNetwork, FlowEstimatorDense, OccContextNetwork, OccEstimatorDense
from .irr_modules import OccUpsampleNetwork, RefineFlow, RefineOcc
import copy


class PWCNet(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(PWCNet, self).__init__()
        self.args = args
        self._div_flow = div_flow
        self.search_range = 4
        self.num_chs = [3, 16, 32, 64, 96, 128, 196]
        self.output_level = 4
        self.num_levels = 7
        self.leakyRELU = nn.LeakyReLU(0.1, inplace=True)

        self.feature_pyramid_extractor = FeatureExtractor(self.num_chs)
        self.warping_layer = WarpingLayer()

        self.dim_corr = (self.search_range * 2 + 1) ** 2
        self.num_ch_in_flo = self.dim_corr + 32 + 2
        self.num_ch_in_occ = self.dim_corr + 32 + 1

        self.flow_estimators = FlowEstimatorDense(self.num_ch_in_flo)
        self.context_networks = ContextNetwork(self.num_ch_in_flo + 448 + 2)
        self.occ_estimators = OccEstimatorDense(self.num_ch_in_occ)
        self.occ_context_networks = OccContextNetwork(self.num_ch_in_occ + 448 + 1)
        self.occ_shuffle_upsample = OccUpsampleNetwork(11, 1)

        self.conv_1x1 = nn.ModuleList([conv(196, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(128, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(96, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(64, 32, kernel_size=1, stride=1, dilation=1)])

        self.conv_1x1_1 = conv(16, 3, kernel_size=1, stride=1, dilation=1)

        self.refine_flow = RefineFlow(2 + 1 + 32)
        self.refine_occ = RefineOcc(1 + 32 + 32)
        self.corr_params = {"pad_size": self.search_range, "kernel_size": 1, "max_disp": self.search_range, "stride1": 1, "stride2": 1, "corr_multiply": 1}

        initialize_msra(self.modules())

    def forward(self, input_dict):

        x1_raw = input_dict['input1']
        x2_raw = input_dict['input2']
        batch_size, _, height_im, width_im = x1_raw.size()

        # on the bottom level are original images
        x1_pyramid = self.feature_pyramid_extractor(x1_raw) + [x1_raw]
        x2_pyramid = self.feature_pyramid_extractor(x2_raw) + [x2_raw]

        # outputs
        output_dict = {}
        output_dict_eval = {}
        flows = []
        occs = []

        _, _, h_x1, w_x1, = x1_pyramid[0].size()
        flow_f = torch.zeros(batch_size, 2, h_x1, w_x1).float().cuda()
        flow_b = torch.zeros(batch_size, 2, h_x1, w_x1).float().cuda()
        occ_f = torch.zeros(batch_size, 1, h_x1, w_x1).float().cuda()
        occ_b = torch.zeros(batch_size, 1, h_x1, w_x1).float().cuda()

        for l, (x1, x2) in enumerate(zip(x1_pyramid, x2_pyramid)):

            if l <= self.output_level:

                # warping
                if l == 0:
                    x2_warp = x2
                    x1_warp = x1
                else:
                    flow_f = upsample2d_as(flow_f, x1, mode="bilinear")
                    flow_b = upsample2d_as(flow_b, x2, mode="bilinear")
                    occ_f = upsample2d_as(occ_f, x1, mode="bilinear")
                    occ_b = upsample2d_as(occ_b, x2, mode="bilinear")
                    x2_warp = self.warping_layer(x2, flow_f, height_im, width_im, self._div_flow)
                    x1_warp = self.warping_layer(x1, flow_b, height_im, width_im, self._div_flow)

                # correlation
                out_corr_f = compute_cost_volume(x1, x2_warp, self.corr_params)
                out_corr_b = compute_cost_volume(x2, x1_warp, self.corr_params)


                out_corr_relu_f = self.leakyRELU(out_corr_f)
                out_corr_relu_b = self.leakyRELU(out_corr_b)

                if l != self.output_level:
                    x1_1by1 = self.conv_1x1[l](x1)
                    x2_1by1 = self.conv_1x1[l](x2)
                else:
                    x1_1by1 = x1
                    x2_1by1 = x2

                # concat and estimate flow
                flow_f = rescale_flow(flow_f, self._div_flow, width_im, height_im, to_local=True)
                flow_b = rescale_flow(flow_b, self._div_flow, width_im, height_im, to_local=True)

                x_intm_f, flow_res_f = self.flow_estimators(torch.cat([out_corr_relu_f, x1_1by1, flow_f], dim=1))
                x_intm_b, flow_res_b = self.flow_estimators(torch.cat([out_corr_relu_b, x2_1by1, flow_b], dim=1))
                flow_est_f = flow_f + flow_res_f
                flow_est_b = flow_b + flow_res_b

                flow_cont_f = flow_est_f + self.context_networks(torch.cat([x_intm_f, flow_est_f], dim=1))
                flow_cont_b = flow_est_b + self.context_networks(torch.cat([x_intm_b, flow_est_b], dim=1))

                # occ estimation
                x_intm_occ_f, occ_res_f = self.occ_estimators(torch.cat([out_corr_relu_f, x1_1by1, occ_f], dim=1))
                x_intm_occ_b, occ_res_b = self.occ_estimators(torch.cat([out_corr_relu_b, x2_1by1, occ_b], dim=1))
                occ_est_f = occ_f + occ_res_f
                occ_est_b = occ_b + occ_res_b

                occ_cont_f = occ_est_f + self.occ_context_networks(torch.cat([x_intm_occ_f, occ_est_f], dim=1))
                occ_cont_b = occ_est_b + self.occ_context_networks(torch.cat([x_intm_occ_b, occ_est_b], dim=1))

                # refinement
                img1_resize = upsample2d_as(x1_raw, flow_f, mode="bilinear")
                img2_resize = upsample2d_as(x2_raw, flow_b, mode="bilinear")
                img2_warp = self.warping_layer(img2_resize, rescale_flow(flow_cont_f, self._div_flow, width_im, height_im, to_local=False), height_im, width_im, self._div_flow)
                img1_warp = self.warping_layer(img1_resize, rescale_flow(flow_cont_b, self._div_flow, width_im, height_im, to_local=False), height_im, width_im, self._div_flow)

                # flow refine
                flow_f = self.refine_flow(flow_cont_f.detach(), img1_resize - img2_warp, x1_1by1)
                flow_b = self.refine_flow(flow_cont_b.detach(), img2_resize - img1_warp, x2_1by1)

                flow_cont_f = rescale_flow(flow_cont_f, self._div_flow, width_im, height_im, to_local=False)
                flow_cont_b = rescale_flow(flow_cont_b, self._div_flow, width_im, height_im, to_local=False)
                flow_f = rescale_flow(flow_f, self._div_flow, width_im, height_im, to_local=False)
                flow_b = rescale_flow(flow_b, self._div_flow, width_im, height_im, to_local=False)

                # occ refine
                x2_1by1_warp = self.warping_layer(x2_1by1, flow_f, height_im, width_im, self._div_flow)
                x1_1by1_warp = self.warping_layer(x1_1by1, flow_b, height_im, width_im, self._div_flow)

                occ_f = self.refine_occ(occ_cont_f.detach(), x1_1by1, x1_1by1 - x2_1by1_warp)
                occ_b = self.refine_occ(occ_cont_b.detach(), x2_1by1, x2_1by1 - x1_1by1_warp)

                flows.append([flow_cont_f, flow_cont_b, flow_f, flow_b])
                occs.append([occ_cont_f, occ_cont_b, occ_f, occ_b])

            else:
                flow_f = upsample2d_as(flow_f, x1, mode="bilinear")
                flow_b = upsample2d_as(flow_b, x2, mode="bilinear")
                flows.append([flow_f, flow_b])

                x2_warp = self.warping_layer(x2, flow_f, height_im, width_im, self._div_flow)
                x1_warp = self.warping_layer(x1, flow_b, height_im, width_im, self._div_flow)
                flow_b_warp = self.warping_layer(flow_b, flow_f, height_im, width_im, self._div_flow)
                flow_f_warp = self.warping_layer(flow_f, flow_b, height_im, width_im, self._div_flow)

                if l != self.num_levels-1:
                    x1_in = self.conv_1x1_1(x1)
                    x2_in = self.conv_1x1_1(x2)
                    x1_w_in = self.conv_1x1_1(x1_warp)
                    x2_w_in = self.conv_1x1_1(x2_warp)
                else:
                    x1_in = x1
                    x2_in = x2
                    x1_w_in = x1_warp
                    x2_w_in = x2_warp

                occ_f = self.occ_shuffle_upsample(occ_f, torch.cat([x1_in, x2_w_in, flow_f, flow_b_warp], dim=1))
                occ_b = self.occ_shuffle_upsample(occ_b, torch.cat([x2_in, x1_w_in, flow_b, flow_f_warp], dim=1))

                occs.append([occ_f, occ_b])

        output_dict_eval['flow'] = upsample2d_as(flow_f, x1_raw, mode="bilinear") * (1.0 / self._div_flow)
        output_dict_eval['occ'] = upsample2d_as(occ_f, x1_raw, mode="bilinear")
        output_dict['flow'] = flows
        output_dict['occ'] = occs

        if self.training:
            return output_dict
        else:
            return output_dict_eval


================================================
FILE: models/__init__.py
================================================
from . import flownet1s
from . import flownet1s_irr
from . import flownet1s_irr_bi
from . import flownet1s_irr_occ
from . import flownet1s_irr_occ_bi
from . import IRR_FlowNet

from . import pwcnet
from . import pwcnet_bi
from . import pwcnet_occ
from . import pwcnet_occ_bi
from . import pwcnet_irr
from . import pwcnet_irr_bi
from . import pwcnet_irr_occ
from . import pwcnet_irr_occ_bi
from . import IRR_PWC


FlowNet1S            = flownet1s.FlowNet1S
FlowNet1S_irr        = flownet1s_irr.FlowNet1S
FlowNet1S_irr_bi     = flownet1s_irr_bi.FlowNet1S
FlowNet1S_irr_occ    = flownet1s_irr_occ.FlowNet1S
FlowNet1S_irr_occ_bi = flownet1s_irr_occ_bi.FlowNet1S

PWCNet               = pwcnet.PWCNet
PWCNet_bi            = pwcnet_bi.PWCNet
PWCNet_occ           = pwcnet_occ.PWCNet
PWCNet_occ_bi        = pwcnet_occ_bi.PWCNet
PWCNet_irr           = pwcnet_irr.PWCNet
PWCNet_irr_bi        = pwcnet_irr_bi.PWCNet
PWCNet_irr_occ       = pwcnet_irr_occ.PWCNet
PWCNet_irr_occ_bi    = pwcnet_irr_occ_bi.PWCNet

IRR_FlowNet          = IRR_FlowNet.FlowNet1S
IRR_PWC              = IRR_PWC.PWCNet


================================================
FILE: models/correlation_package/__init__.py
================================================


================================================
FILE: models/correlation_package/correlation.py
================================================
import torch
from torch.nn.modules.module import Module
from torch.autograd import Function
import correlation_cuda

class CorrelationFunction(Function):

    def __init__(self, pad_size=3, kernel_size=3, max_displacement=20, stride1=1, stride2=2, corr_multiply=1):
        super(CorrelationFunction, self).__init__()
        self.pad_size = pad_size
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride1 = stride1
        self.stride2 = stride2
        self.corr_multiply = corr_multiply
        # self.out_channel = ((max_displacement/stride2)*2 + 1) * ((max_displacement/stride2)*2 + 1)

    def forward(self, input1, input2):
        self.save_for_backward(input1, input2)

        with torch.cuda.device_of(input1):
            rbot1 = input1.new()
            rbot2 = input2.new()
            output = input1.new()

            correlation_cuda.forward(input1, input2, rbot1, rbot2, output, 
                self.pad_size, self.kernel_size, self.max_displacement,self.stride1, self.stride2, self.corr_multiply)

        return output

    def backward(self, grad_output):
        input1, input2 = self.saved_tensors

        with torch.cuda.device_of(input1):
            rbot1 = input1.new()
            rbot2 = input2.new()

            grad_input1 = input1.new()
            grad_input2 = input2.new()

            correlation_cuda.backward(input1, input2, rbot1, rbot2, grad_output, grad_input1, grad_input2,
                self.pad_size, self.kernel_size, self.max_displacement,self.stride1, self.stride2, self.corr_multiply)

        return grad_input1, grad_input2


class Correlation(Module):
    def __init__(self, pad_size=0, kernel_size=0, max_displacement=0, stride1=1, stride2=2, corr_multiply=1):
        super(Correlation, self).__init__()
        self.pad_size = pad_size
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride1 = stride1
        self.stride2 = stride2
        self.corr_multiply = corr_multiply

    def forward(self, input1, input2):

        result = CorrelationFunction(self.pad_size, self.kernel_size, self.max_displacement, self.stride1, self.stride2, self.corr_multiply)(input1, input2)

        return result


================================================
FILE: models/correlation_package/correlation_cuda.cc
================================================
#include <torch/torch.h>
#include <ATen/ATen.h>
#include <stdio.h>
#include <iostream>

#include "correlation_cuda_kernel.cuh"

int correlation_forward_cuda(at::Tensor& input1, at::Tensor& input2, at::Tensor& rInput1, at::Tensor& rInput2, at::Tensor& output,
                       int pad_size,
                       int kernel_size,
                       int max_displacement,
                       int stride1,
                       int stride2,
                       int corr_type_multiply)
{

  int batchSize = input1.size(0);

  int nInputChannels = input1.size(1);
  int inputHeight = input1.size(2);
  int inputWidth = input1.size(3);

  int kernel_radius = (kernel_size - 1) / 2;
  int border_radius = kernel_radius + max_displacement;

  int paddedInputHeight = inputHeight + 2 * pad_size;
  int paddedInputWidth = inputWidth + 2 * pad_size;

  int nOutputChannels = ((max_displacement/stride2)*2 + 1) * ((max_displacement/stride2)*2 + 1);

  int outputHeight = ceil(static_cast<float>(paddedInputHeight - 2 * border_radius) / static_cast<float>(stride1));
  int outputwidth = ceil(static_cast<float>(paddedInputWidth - 2 * border_radius) / static_cast<float>(stride1));

  rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  output.resize_({batchSize, nOutputChannels, outputHeight, outputwidth});

  rInput1.fill_(0);
  rInput2.fill_(0);
  output.fill_(0);

  int success = correlation_forward_cuda_kernel(
    output,
    output.size(0), 
    output.size(1),
    output.size(2),
    output.size(3),
    output.stride(0),
    output.stride(1),
    output.stride(2),
    output.stride(3),
    input1,
    input1.size(1),
    input1.size(2),
    input1.size(3),
    input1.stride(0),
    input1.stride(1),
    input1.stride(2),
    input1.stride(3),
    input2,
    input2.size(1),
    input2.stride(0),
    input2.stride(1),
    input2.stride(2),
    input2.stride(3),
    rInput1,
    rInput2,
    pad_size,     
    kernel_size,
    max_displacement,
    stride1,
    stride2,
    corr_type_multiply,
    at::globalContext().getCurrentCUDAStream()
  );

  //check for errors
  if (!success) {
    AT_ERROR("CUDA call failed");
  }

  return 1;

}

int correlation_backward_cuda(at::Tensor& input1, at::Tensor& input2, at::Tensor& rInput1, at::Tensor& rInput2, at::Tensor& gradOutput, 
                       at::Tensor& gradInput1, at::Tensor& gradInput2,
                       int pad_size,
                       int kernel_size,
                       int max_displacement,
                       int stride1,
                       int stride2,
                       int corr_type_multiply)
{

  int batchSize = input1.size(0);
  int nInputChannels = input1.size(1);
  int paddedInputHeight = input1.size(2)+ 2 * pad_size;
  int paddedInputWidth = input1.size(3)+ 2 * pad_size;

  int height = input1.size(2);
  int width = input1.size(3);

  rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  gradInput1.resize_({batchSize, nInputChannels, height, width});
  gradInput2.resize_({batchSize, nInputChannels, height, width});

  rInput1.fill_(0);
  rInput2.fill_(0);
  gradInput1.fill_(0);
  gradInput2.fill_(0);

  int success = correlation_backward_cuda_kernel(gradOutput,
                                                gradOutput.size(0),
                                                gradOutput.size(1),
                                                gradOutput.size(2),
                                                gradOutput.size(3),
                                                gradOutput.stride(0),
                                                gradOutput.stride(1),
                                                gradOutput.stride(2),
                                                gradOutput.stride(3),
                                                input1,
                                                input1.size(1),
                                                input1.size(2),
                                                input1.size(3),
                                                input1.stride(0),
                                                input1.stride(1),
                                                input1.stride(2),
                                                input1.stride(3),
                                                input2,  
                                                input2.stride(0),
                                                input2.stride(1),
                                                input2.stride(2),
                                                input2.stride(3),
                                                gradInput1,
                                                gradInput1.stride(0),
                                                gradInput1.stride(1),
                                                gradInput1.stride(2),
                                                gradInput1.stride(3),
                                                gradInput2,
                                                gradInput2.size(1),
                                                gradInput2.stride(0),
                                                gradInput2.stride(1),
                                                gradInput2.stride(2),
                                                gradInput2.stride(3),
                                                rInput1,
                                                rInput2,
                                                pad_size,
                                                kernel_size,
                                                max_displacement,
                                                stride1, 
                                                stride2,
                                                corr_type_multiply,
                                                at::globalContext().getCurrentCUDAStream()
                                               );

  if (!success) {
    AT_ERROR("CUDA call failed");
  }

  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &correlation_forward_cuda, "Correlation forward (CUDA)");
  m.def("backward", &correlation_backward_cuda, "Correlation backward (CUDA)");
}


================================================
FILE: models/correlation_package/correlation_cuda_kernel.cu
================================================
#include <stdio.h>

#include "correlation_cuda_kernel.cuh"

#define CUDA_NUM_THREADS 1024
#define THREADS_PER_BLOCK 32

#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>

using at::Half;

template <typename scalar_t>
__global__ void channels_first(const scalar_t* __restrict__ input, scalar_t* rinput, int channels, int height, int width, int pad_size)
{

	// n (batch size), c (num of channels), y (height), x (width)
	int n = blockIdx.x;
	int y = blockIdx.y;
	int x = blockIdx.z;

	int ch_off = threadIdx.x;
	scalar_t value;

	int dimcyx = channels * height * width;
	int dimyx = height * width;

	int p_dimx = (width + 2 * pad_size);
	int p_dimy = (height + 2 * pad_size);
	int p_dimyxc = channels * p_dimy * p_dimx;
	int p_dimxc = p_dimx * channels;

	for (int c = ch_off; c < channels; c += THREADS_PER_BLOCK) {
		value = input[n * dimcyx + c * dimyx + y * width + x];
		rinput[n * p_dimyxc + (y + pad_size) * p_dimxc + (x + pad_size) * channels + c] = value;
	}
}

template <typename scalar_t>
__global__ void correlation_forward(scalar_t*  output, int nOutputChannels, int outputHeight, int outputWidth,
	const scalar_t* __restrict__ rInput1, int nInputChannels, int inputHeight, int inputWidth,
	const scalar_t* __restrict__ rInput2,
	int pad_size,
	int kernel_size,
	int max_displacement,
	int stride1,
	int stride2)
{
	// n (batch size), c (num of channels), y (height), x (width)

	int pInputWidth = inputWidth + 2 * pad_size;
	int pInputHeight = inputHeight + 2 * pad_size;

	int kernel_rad = (kernel_size - 1) / 2;
	int displacement_rad = max_displacement / stride2;
	int displacement_size = 2 * displacement_rad + 1;

	int n = blockIdx.x;
	int y1 = blockIdx.y * stride1 + max_displacement;
	int x1 = blockIdx.z * stride1 + max_displacement;
	int c = threadIdx.x;

	int pdimyxc = pInputHeight * pInputWidth * nInputChannels;
	int pdimxc = pInputWidth * nInputChannels;
	int pdimc = nInputChannels;

	int tdimcyx = nOutputChannels * outputHeight * outputWidth;
	int tdimyx = outputHeight * outputWidth;
	int tdimx = outputWidth;

	scalar_t nelems = kernel_size * kernel_size * pdimc;

	__shared__ scalar_t prod_sum[THREADS_PER_BLOCK];

	// no significant speed-up in using chip memory for input1 sub-data, 
	// not enough chip memory size to accomodate memory per block for input2 sub-data
	// instead i've used device memory for both 

	// element-wise product along channel axis
	for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) {
		for (int ti = -displacement_rad; ti <= displacement_rad; ++ti) {
			prod_sum[c] = 0;
			int x2 = x1 + ti*stride2;
			int y2 = y1 + tj*stride2;

			for (int j = -kernel_rad; j <= kernel_rad; ++j) {
				for (int i = -kernel_rad; i <= kernel_rad; ++i) {
					for (int ch = c; ch < pdimc; ch += THREADS_PER_BLOCK) {
						int indx1 = n * pdimyxc + (y1 + j) * pdimxc + (x1 + i) * pdimc + ch;
						int indx2 = n * pdimyxc + (y2 + j) * pdimxc + (x2 + i) * pdimc + ch;

						prod_sum[c] += rInput1[indx1] * rInput2[indx2];
					}
				}
			}

			// accumulate 
			__syncthreads();
			if (c == 0) {
				scalar_t reduce_sum = 0;
				for (int index = 0; index < THREADS_PER_BLOCK; ++index) {
					reduce_sum += prod_sum[index];
				}
				int tc = (tj + displacement_rad) * displacement_size + (ti + displacement_rad);
				const int tindx = n * tdimcyx + tc * tdimyx + blockIdx.y * tdimx + blockIdx.z;
				output[tindx] = reduce_sum / nelems;
			}

		}
	}

}

template <typename scalar_t>
__global__ void correlation_backward_input1(int item, scalar_t* gradInput1, int nInputChannels, int inputHeight, int inputWidth,
	const scalar_t* __restrict__ gradOutput, int nOutputChannels, int outputHeight, int outputWidth,
	const scalar_t* __restrict__ rInput2,
	int pad_size,
	int kernel_size,
	int max_displacement,
	int stride1,
	int stride2)
{
	// n (batch size), c (num of channels), y (height), x (width)

	int n = item;
	int y = blockIdx.x * stride1 + pad_size;
	int x = blockIdx.y * stride1 + pad_size;
	int c = blockIdx.z;
	int tch_off = threadIdx.x;

	int kernel_rad = (kernel_size - 1) / 2;
	int displacement_rad = max_displacement / stride2;
	int displacement_size = 2 * displacement_rad + 1;

	int xmin = (x - kernel_rad - max_displacement) / stride1;
	int ymin = (y - kernel_rad - max_displacement) / stride1;

	int xmax = (x + kernel_rad - max_displacement) / stride1;
	int ymax = (y + kernel_rad - max_displacement) / stride1;

	if (xmax < 0 || ymax < 0 || xmin >= outputWidth || ymin >= outputHeight) {
		// assumes gradInput1 is pre-allocated and zero filled
		return;
	}

	if (xmin > xmax || ymin > ymax) {
		// assumes gradInput1 is pre-allocated and zero filled
		return;
	}

	xmin = max(0, xmin);
	xmax = min(outputWidth - 1, xmax);

	ymin = max(0, ymin);
	ymax = min(outputHeight - 1, ymax);

	int pInputWidth = inputWidth + 2 * pad_size;
	int pInputHeight = inputHeight + 2 * pad_size;

	int pdimyxc = pInputHeight * pInputWidth * nInputChannels;
	int pdimxc = pInputWidth * nInputChannels;
	int pdimc = nInputChannels;

	int tdimcyx = nOutputChannels * outputHeight * outputWidth;
	int tdimyx = outputHeight * outputWidth;
	int tdimx = outputWidth;

	int odimcyx = nInputChannels * inputHeight* inputWidth;
	int odimyx = inputHeight * inputWidth;
	int odimx = inputWidth;

	scalar_t nelems = kernel_size * kernel_size * nInputChannels;

	__shared__ scalar_t prod_sum[THREADS_PER_BLOCK];
	prod_sum[tch_off] = 0;

	for (int tc = tch_off; tc < nOutputChannels; tc += THREADS_PER_BLOCK) {

		int i2 = (tc % displacement_size - displacement_rad) * stride2;
		int j2 = (tc / displacement_size - displacement_rad) * stride2;

		int indx2 = n * pdimyxc + (y + j2)* pdimxc + (x + i2) * pdimc + c;

		scalar_t val2 = rInput2[indx2];

		for (int j = ymin; j <= ymax; ++j) {
			for (int i = xmin; i <= xmax; ++i) {
				int tindx = n * tdimcyx + tc * tdimyx + j * tdimx + i;
				prod_sum[tch_off] += gradOutput[tindx] * val2;
			}
		}
	}
	__syncthreads();

	if (tch_off == 0) {
		scalar_t reduce_sum = 0;
		for (int idx = 0; idx < THREADS_PER_BLOCK; idx++) {
			reduce_sum += prod_sum[idx];
		}
		const int indx1 = n * odimcyx + c * odimyx + (y - pad_size) * odimx + (x - pad_size);
		gradInput1[indx1] = reduce_sum / nelems;
	}

}

template <typename scalar_t>
__global__ void correlation_backward_input2(int item, scalar_t*  gradInput2, int nInputChannels, int inputHeight, int inputWidth,
	const scalar_t* __restrict__ gradOutput, int nOutputChannels, int outputHeight, int outputWidth,
	const scalar_t* __restrict__ rInput1,
	int pad_size,
	int kernel_size,
	int max_displacement,
	int stride1,
	int stride2)
{
	// n (batch size), c (num of channels), y (height), x (width)

	int n = item;
	int y = blockIdx.x * stride1 + pad_size;
	int x = blockIdx.y * stride1 + pad_size;
	int c = blockIdx.z;

	int tch_off = threadIdx.x;

	int kernel_rad = (kernel_size - 1) / 2;
	int displacement_rad = max_displacement / stride2;
	int displacement_size = 2 * displacement_rad + 1;

	int pInputWidth = inputWidth + 2 * pad_size;
	int pInputHeight = inputHeight + 2 * pad_size;

	int pdimyxc = pInputHeight * pInputWidth * nInputChannels;
	int pdimxc = pInputWidth * nInputChannels;
	int pdimc = nInputChannels;

	int tdimcyx = nOutputChannels * outputHeight * outputWidth;
	int tdimyx = outputHeight * outputWidth;
	int tdimx = outputWidth;

	int odimcyx = nInputChannels * inputHeight* inputWidth;
	int odimyx = inputHeight * inputWidth;
	int odimx = inputWidth;

	scalar_t nelems = kernel_size * kernel_size * nInputChannels;

	__shared__ scalar_t prod_sum[THREADS_PER_BLOCK];
	prod_sum[tch_off] = 0;

	for (int tc = tch_off; tc < nOutputChannels; tc += THREADS_PER_BLOCK) {
		int i2 = (tc % displacement_size - displacement_rad) * stride2;
		int j2 = (tc / displacement_size - displacement_rad) * stride2;

		int xmin = (x - kernel_rad - max_displacement - i2) / stride1;
		int ymin = (y - kernel_rad - max_displacement - j2) / stride1;

		int xmax = (x + kernel_rad - max_displacement - i2) / stride1;
		int ymax = (y + kernel_rad - max_displacement - j2) / stride1;

		if (xmax < 0 || ymax < 0 || xmin >= outputWidth || ymin >= outputHeight) {
			// assumes gradInput2 is pre-allocated and zero filled
			continue;
		}

		if (xmin > xmax || ymin > ymax) {
			// assumes gradInput2 is pre-allocated and zero filled
			continue;
		}

		xmin = max(0, xmin);
		xmax = min(outputWidth - 1, xmax);

		ymin = max(0, ymin);
		ymax = min(outputHeight - 1, ymax);

		int indx1 = n * pdimyxc + (y - j2)* pdimxc + (x - i2) * pdimc + c;
		scalar_t val1 = rInput1[indx1];

		for (int j = ymin; j <= ymax; ++j) {
			for (int i = xmin; i <= xmax; ++i) {
				int tindx = n * tdimcyx + tc * tdimyx + j * tdimx + i;
				prod_sum[tch_off] += gradOutput[tindx] * val1;
			}
		}
	}

	__syncthreads();

	if (tch_off == 0) {
		scalar_t reduce_sum = 0;
		for (int idx = 0; idx < THREADS_PER_BLOCK; idx++) {
			reduce_sum += prod_sum[idx];
		}
		const int indx2 = n * odimcyx + c * odimyx + (y - pad_size) * odimx + (x - pad_size);
		gradInput2[indx2] = reduce_sum / nelems;
	}

}

int correlation_forward_cuda_kernel(at::Tensor& output,
	int ob,
	int oc,
	int oh,
	int ow,
	int osb,
	int osc,
	int osh,
	int osw,

	at::Tensor& input1,
	int ic,
	int ih,
	int iw,
	int isb,
	int isc,
	int ish,
	int isw,

	at::Tensor& input2,
	int gc,
	int gsb,
	int gsc,
	int gsh,
	int gsw,

	at::Tensor& rInput1,
	at::Tensor& rInput2,
	int pad_size,
	int kernel_size,
	int max_displacement,
	int stride1,
	int stride2,
	int corr_type_multiply,
	cudaStream_t stream)
{

	int batchSize = ob;

	int nInputChannels = ic;
	int inputWidth = iw;
	int inputHeight = ih;

	int nOutputChannels = oc;
	int outputWidth = ow;
	int outputHeight = oh;

	dim3 blocks_grid(batchSize, inputHeight, inputWidth);
	dim3 threads_block(THREADS_PER_BLOCK);

	AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "channels_first_fwd_1", ([&] {

		channels_first<scalar_t> << <blocks_grid, threads_block, 0, stream >> >(
			input1.data<scalar_t>(), rInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth, pad_size);

	}));

	AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "channels_first_fwd_2", ([&] {

		channels_first<scalar_t> << <blocks_grid, threads_block, 0, stream >> > (
			input2.data<scalar_t>(), rInput2.data<scalar_t>(), nInputChannels, inputHeight, inputWidth, pad_size);

	}));

	dim3 threadsPerBlock(THREADS_PER_BLOCK);
	dim3 totalBlocksCorr(batchSize, outputHeight, outputWidth);

	AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "correlation_forward", ([&] {

		correlation_forward<scalar_t> << <totalBlocksCorr, threadsPerBlock, 0, stream >> >
			(output.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,
			rInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,
			rInput2.data<scalar_t>(),
			pad_size,
			kernel_size,
			max_displacement,
			stride1,
			stride2);

	}));

	cudaError_t err = cudaGetLastError();


	// check for errors
	if (err != cudaSuccess) {
		printf("error in correlation_forward_cuda_kernel: %s\n", cudaGetErrorString(err));
		return 0;
	}

	return 1;
}


int correlation_backward_cuda_kernel(
	at::Tensor& gradOutput,
	int gob,
	int goc,
	int goh,
	int gow,
	int gosb,
	int gosc,
	int gosh,
	int gosw,

	at::Tensor& input1,
	int ic,
	int ih,
	int iw,
	int isb,
	int isc,
	int ish,
	int isw,

	at::Tensor& input2,
	int gsb,
	int gsc,
	int gsh,
	int gsw,

	at::Tensor& gradInput1,
	int gisb,
	int gisc,
	int gish,
	int gisw,

	at::Tensor& gradInput2,
	int ggc,
	int ggsb,
	int ggsc,
	int ggsh,
	int ggsw,

	at::Tensor& rInput1,
	at::Tensor& rInput2,
	int pad_size,
	int kernel_size,
	int max_displacement,
	int stride1,
	int stride2,
	int corr_type_multiply,
	cudaStream_t stream)
{

	int batchSize = gob;
	int num = batchSize;

	int nInputChannels = ic;
	int inputWidth = iw;
	int inputHeight = ih;

	int nOutputChannels = goc;
	int outputWidth = gow;
	int outputHeight = goh;

	dim3 blocks_grid(batchSize, inputHeight, inputWidth);
	dim3 threads_block(THREADS_PER_BLOCK);


	AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "lltm_forward_cuda", ([&] {

		channels_first<scalar_t> << <blocks_grid, threads_block, 0, stream >> >(
			input1.data<scalar_t>(),
			rInput1.data<scalar_t>(),
			nInputChannels,
			inputHeight,
			inputWidth,
			pad_size
			);
	}));

	AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "lltm_forward_cuda", ([&] {

		channels_first<scalar_t> << <blocks_grid, threads_block, 0, stream >> >(
			input2.data<scalar_t>(),
			rInput2.data<scalar_t>(),
			nInputChannels,
			inputHeight,
			inputWidth,
			pad_size
			);
	}));

	dim3 threadsPerBlock(THREADS_PER_BLOCK);
	dim3 totalBlocksCorr(inputHeight, inputWidth, nInputChannels);

	for (int n = 0; n < num; ++n) {

		AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "lltm_forward_cuda", ([&] {


			correlation_backward_input1<scalar_t> << <totalBlocksCorr, threadsPerBlock, 0, stream >> > (
				n, gradInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,
				gradOutput.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,
				rInput2.data<scalar_t>(),
				pad_size,
				kernel_size,
				max_displacement,
				stride1,
				stride2);
		}));
	}

	for (int n = 0; n < batchSize; n++) {

		AT_DISPATCH_FLOATING_TYPES_AND_HALF(rInput1.type(), "lltm_forward_cuda", ([&] {

			correlation_backward_input2<scalar_t> << <totalBlocksCorr, threadsPerBlock, 0, stream >> >(
				n, gradInput2.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,
				gradOutput.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,
				rInput1.data<scalar_t>(),
				pad_size,
				kernel_size,
				max_displacement,
				stride1,
				stride2);

		}));
	}

	// check for errors
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("error in correlation_backward_cuda_kernel: %s\n", cudaGetErrorString(err));
		return 0;
	}

	return 1;
}


================================================
FILE: models/correlation_package/correlation_cuda_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <cuda_runtime.h>

int correlation_forward_cuda_kernel(at::Tensor& output,
    int ob,
    int oc,
    int oh,
    int ow,
    int osb,
    int osc,
    int osh,
    int osw,

    at::Tensor& input1,
    int ic,
    int ih,
    int iw,
    int isb,
    int isc,
    int ish,
    int isw,

    at::Tensor& input2,
    int gc,
    int gsb,
    int gsc,
    int gsh,
    int gsw,

    at::Tensor& rInput1,
    at::Tensor& rInput2,
    int pad_size,
    int kernel_size,
    int max_displacement,
    int stride1,
    int stride2,
    int corr_type_multiply,
    cudaStream_t stream);


int correlation_backward_cuda_kernel(   
    at::Tensor& gradOutput,
    int gob,
    int goc,
    int goh,
    int gow,
    int gosb,
    int gosc,
    int gosh,
    int gosw,

    at::Tensor& input1,
    int ic,
    int ih,
    int iw,
    int isb,
    int isc,
    int ish,
    int isw,

    at::Tensor& input2,
    int gsb,
    int gsc,
    int gsh,
    int gsw,

    at::Tensor& gradInput1, 
    int gisb,
    int gisc,
    int gish,
    int gisw,

    at::Tensor& gradInput2,
    int ggc,
    int ggsb,
    int ggsc,
    int ggsh,
    int ggsw,

    at::Tensor& rInput1,
    at::Tensor& rInput2,
    int pad_size,
    int kernel_size,
    int max_displacement,
    int stride1,
    int stride2,
    int corr_type_multiply,
    cudaStream_t stream);


================================================
FILE: models/correlation_package/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

cxx_args = ['-std=c++11']

nvcc_args = [
    '-gencode', 'arch=compute_50,code=sm_50',
    '-gencode', 'arch=compute_52,code=sm_52',
    '-gencode', 'arch=compute_60,code=sm_60',
    '-gencode', 'arch=compute_61,code=sm_61',
    '-gencode', 'arch=compute_61,code=compute_61'
]

setup(
    name='correlation_cuda',
    ext_modules=[
        CUDAExtension('correlation_cuda', [
            'correlation_cuda.cc',
            'correlation_cuda_kernel.cu'
        ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args})
    ],
    cmdclass={
        'build_ext': BuildExtension
    })


================================================
FILE: models/correlation_package_cu9/__init__.py
================================================


================================================
FILE: models/correlation_package_cu9/correlation.py
================================================
import torch
from torch.nn.modules.module import Module
from torch.autograd import Function
import correlation_cuda

class CorrelationFunction(Function):

    def __init__(self, pad_size=3, kernel_size=3, max_displacement=20, stride1=1, stride2=2, corr_multiply=1):
        super(CorrelationFunction, self).__init__()
        self.pad_size = pad_size
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride1 = stride1
        self.stride2 = stride2
        self.corr_multiply = corr_multiply
        # self.out_channel = ((max_displacement/stride2)*2 + 1) * ((max_displacement/stride2)*2 + 1)

    def forward(self, input1, input2):
        self.save_for_backward(input1, input2)

        with torch.cuda.device_of(input1):
            rbot1 = input1.new()
            rbot2 = input2.new()
            output = input1.new()

            correlation_cuda.forward(input1, input2, rbot1, rbot2, output, 
                self.pad_size, self.kernel_size, self.max_displacement,self.stride1, self.stride2, self.corr_multiply)

        return output

    def backward(self, grad_output):
        input1, input2 = self.saved_tensors

        with torch.cuda.device_of(input1):
            rbot1 = input1.new()
            rbot2 = input2.new()

            grad_input1 = input1.new()
            grad_input2 = input2.new()

            correlation_cuda.backward(input1, input2, rbot1, rbot2, grad_output, grad_input1, grad_input2,
                self.pad_size, self.kernel_size, self.max_displacement,self.stride1, self.stride2, self.corr_multiply)

        return grad_input1, grad_input2


class Correlation(Module):
    def __init__(self, pad_size=0, kernel_size=0, max_displacement=0, stride1=1, stride2=2, corr_multiply=1):
        super(Correlation, self).__init__()
        self.pad_size = pad_size
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride1 = stride1
        self.stride2 = stride2
        self.corr_multiply = corr_multiply

    def forward(self, input1, input2):

        result = CorrelationFunction(self.pad_size, self.kernel_size, self.max_displacement, self.stride1, self.stride2, self.corr_multiply)(input1, input2)

        return result


================================================
FILE: models/correlation_package_cu9/correlation_cuda.cc
================================================
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <ATen/cuda/CUDAContext.h>
#include <stdio.h>
#include <iostream>

#include "correlation_cuda_kernel.cuh"

int correlation_forward_cuda(at::Tensor& input1, at::Tensor& input2, at::Tensor& rInput1, at::Tensor& rInput2, at::Tensor& output,
                       int pad_size,
                       int kernel_size,
                       int max_displacement,
                       int stride1,
                       int stride2,
                       int corr_type_multiply)
{

  int batchSize = input1.size(0);

  int nInputChannels = input1.size(1);
  int inputHeight = input1.size(2);
  int inputWidth = input1.size(3);

  int kernel_radius = (kernel_size - 1) / 2;
  int border_radius = kernel_radius + max_displacement;

  int paddedInputHeight = inputHeight + 2 * pad_size;
  int paddedInputWidth = inputWidth + 2 * pad_size;

  int nOutputChannels = ((max_displacement/stride2)*2 + 1) * ((max_displacement/stride2)*2 + 1);

  int outputHeight = ceil(static_cast<float>(paddedInputHeight - 2 * border_radius) / static_cast<float>(stride1));
  int outputwidth = ceil(static_cast<float>(paddedInputWidth - 2 * border_radius) / static_cast<float>(stride1));

  rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  output.resize_({batchSize, nOutputChannels, outputHeight, outputwidth});

  rInput1.fill_(0);
  rInput2.fill_(0);
  output.fill_(0);

  int success = correlation_forward_cuda_kernel(
    output,
    output.size(0), 
    output.size(1),
    output.size(2),
    output.size(3),
    output.stride(0),
    output.stride(1),
    output.stride(2),
    output.stride(3),
    input1,
    input1.size(1),
    input1.size(2),
    input1.size(3),
    input1.stride(0),
    input1.stride(1),
    input1.stride(2),
    input1.stride(3),
    input2,
    input2.size(1),
    input2.stride(0),
    input2.stride(1),
    input2.stride(2),
    input2.stride(3),
    rInput1,
    rInput2,
    pad_size,     
    kernel_size,
    max_displacement,
    stride1,
    stride2,
    corr_type_multiply,
  at::cuda::getCurrentCUDAStream()
  //at::globalContext().getCurrentCUDAStream()
  );

  //check for errors
  if (!success) {
    AT_ERROR("CUDA call failed");
  }

  return 1;

}

int correlation_backward_cuda(at::Tensor& input1, at::Tensor& input2, at::Tensor& rInput1, at::Tensor& rInput2, at::Tensor& gradOutput, 
                       at::Tensor& gradInput1, at::Tensor& gradInput2,
                       int pad_size,
                       int kernel_size,
                       int max_displacement,
                       int stride1,
                       int stride2,
                       int corr_type_multiply)
{

  int batchSize = input1.size(0);
  int nInputChannels = input1.size(1);
  int paddedInputHeight = input1.size(2)+ 2 * pad_size;
  int paddedInputWidth = input1.size(3)+ 2 * pad_size;

  int height = input1.size(2);
  int width = input1.size(3);

  rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  gradInput1.resize_({batchSize, nInputChannels, height, width});
  gradInput2.resize_({batchSize, nInputChannels, height, width});

  rInput1.fill_(0);
  rInput2.fill_(0);
  gradInput1.fill_(0);
  gradInput2.fill_(0);

  int success = correlation_backward_cuda_kernel(gradOutput,
                                                gradOutput.size(0),
                                                gradOutput.size(1),
                                                gradOutput.size(2),
                                                gradOutput.size(3),
                                                gradOutput.stride(0),
                                                gradOutput.stride(1),
                                                gradOutput.stride(2),
                                                gradOutput.stride(3),
                                                input1,
                                                input1.size(1),
                                                input1.size(2),
                                                input1.size(3),
                                                input1.stride(0),
                                                input1.stride(1),
                                                input1.stride(2),
                                                input1.stride(3),
                                                input2,  
                                                input2.stride(0),
                                                input2.stride(1),
                                                input2.stride(2),
                                                input2.stride(3),
                                                gradInput1,
                                                gradInput1.stride(0),
                                                gradInput1.stride(1),
                                                gradInput1.stride(2),
                                                gradInput1.stride(3),
                                                gradInput2,
                                                gradInput2.size(1),
                                                gradInput2.stride(0),
                                                gradInput2.stride(1),
                                                gradInput2.stride(2),
                                                gradInput2.stride(3),
                                                rInput1,
                                                rInput2,
                                                pad_size,
                                                kernel_size,
                                                max_displacement,
                                                stride1, 
                                                stride2,
                                                corr_type_multiply,
                        at::cuda::getCurrentCUDAStream()
                                                //at::globalContext().getCurrentCUDAStream()
                                               );

  if (!success) {
    AT_ERROR("CUDA call failed");
  }

  return 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &correlation_forward_cuda, "Correlation forward (CUDA)");
  m.def("backward", &correlation_backward_cuda, "Correlation backward (CUDA)");
}


================================================
FILE: models/correlation_package_cu9/correlation_cuda_kernel.cu
================================================
#include <stdio.h>

#include "correlation_cuda_kernel.cuh"

#define CUDA_NUM_THREADS 1024
#define THREADS_PER_BLOCK 32

#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>

using at::Half;

template <typename scalar_t>
__global__ void channels_first(const scalar_t* __restrict__ input, scalar_t* rinput, int channels, int height, int width, int pad_size)
{

	// n (batch size), c (num of channels), y (height), x (width)
	int n = blockIdx.x;
	int y = blockIdx.y;
	int x = blockIdx.z;

	int ch_off = threadIdx.x;
	scalar_t value;

	int dimcyx = channels * height * width;
	int dimyx = height * width;

	int p_dimx = (width + 2 * pad_size);
	int p_dimy = (height + 2 * pad_size);
	int p_dimyxc = channels * p_dimy * p_dimx;
	int p_dimxc = p_dimx * channels;

	for (int c = ch_off; c < channels; c += THREADS_PER_BLOCK) {
		value = input[n * dimcyx + c * dimyx + y * width + x];
		rinput[n * p_dimyxc + (y + pad_size) * p_dimxc + (x + pad_size) * channels + c] = value;
	}
}

template <typename scalar_t>
__global__ void correlation_forward(scalar_t*  output, int nOutputChannels, int outputHeight, int outputWidth,
	const scalar_t* __restrict__ rInput1, int nInputChannels, int inputHeight, int inputWidth,
	const scalar_t* __restrict__ rInput2,
	int pad_size,
	int kernel_size,
	int max_displacement,
	int stride1,
	int stride2)
{
	// n (batch size), c (num of channels), y (height), x (width)

	int pInputWidth = inputWidth + 2 * pad_size;
	int pInputHeight = inputHeight + 2 * pad_size;

	int kernel_rad = (kernel_size - 1) / 2;
	int displacement_rad = max_displacement / stride2;
	int displacement_size = 2 * displacement_rad + 1;

	int n = blockIdx.x;
	int y1 = blockIdx.y * stride1 + max_displacement;
	int x1 = blockIdx.z * stride1 + max_displacement;
	int c = threadIdx.x;

	int pdimyxc = pInputHeight * pInputWidth * nInputChannels;
	int pdimxc = pInputWidth * nInputChannels;
	int pdimc = nInputChannels;

	int tdimcyx = nOutputChannels * outputHeight * outputWidth;
	int tdimyx = outputHeight * outputWidth;
	int tdimx = outputWidth;

	scalar_t nelems = kernel_size * kernel_size * pdimc;

	__shared__ scalar_t prod_sum[THREADS_PER_BLOCK];

	// no significant speed-up in using chip memory for input1 sub-data, 
	// not enough chip memory size to accomodate memory per block for input2 sub-data
	// instead i've used device memory for both 

	// element-wise product along channel axis
	for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) {
		for (int ti = -displacement_rad; ti <= displacement_rad; ++ti) {
			prod_sum[c] = 0;
			int x2 = x1 + ti*stride2;
			int y2 = y1 + tj*stride2;

			for (int j = -kernel_rad; j <= kernel_rad; ++j) {
				for (int i = -kernel_rad; i <= kernel_rad; ++i) {
					for (int ch = c; ch < pdimc; ch += THREADS_PER_BLOCK) {
						int indx1 = n * pdimyxc + (y1 + j) * pdimxc + (x1 + i) * pdimc + ch;
						int indx2 = n * pdimyxc + (y2 + j) * pdimxc + (x2 + i) * pdimc + ch;

						prod_sum[c] += rInput1[indx1] * rInput2[indx2];
					}
				}
			}

			// accumulate 
			__syncthreads();
			if (c == 0) {
				scalar_t reduce_sum = 0;
				for (int index = 0; index < THREADS_PER_BLOCK; ++index) {
					reduce_sum += prod_sum[index];
				}
				int tc = (tj + displacement_rad) * displacement_size + (ti + displacement_rad);
				const int tindx = n * tdimcyx + tc * tdimyx + blockIdx.y * tdimx + blockIdx.z;
				output[tindx] = reduce_sum / nelems;
			}

		}
	}

}

template <typename scalar_t>
__global__ void correlation_backward_input1(int item, scalar_t* gradInput1, int nInputChannels, int inputHeight, int inputWidth,
	const scalar_t* __restrict__ gradOutput, int nOutputChannels, int outputHeight, int outputWidth,
	const scalar_t* __restrict__ rInput2,
	int pad_size,
	int kernel_size,
	int max_displacement,
	int stride1,
	int stride2)
{
	// n (batch size), c (num of channels), y (height), x (width)

	int n = item;
	int y = blockIdx.x * stride1 + pad_size;
	int x = blockIdx.y * stride1 + pad_size;
	int c = blockIdx.z;
	int tch_off = threadIdx.x;

	int kernel_rad = (kernel_size - 1) / 2;
	int displacement_rad = max_displacement / stride2;
	int displacement_size = 2 * displacement_rad + 1;

	int xmin = (x - kernel_rad - max_displacement) / stride1;
	int ymin = (y - kernel_rad - max_displacement) / stride1;

	int xmax = (x + kernel_rad - max_displacement) / stride1;
	int ymax = (y + kernel_rad - max_displacement) / stride1;

	if (xmax < 0 || ymax < 0 || xmin >= outputWidth || ymin >= outputHeight) {
		// assumes gradInput1 is pre-allocated and zero filled
		return;
	}

	if (xmin > xmax || ymin > ymax) {
		// assumes gradInput1 is pre-allocated and zero filled
		return;
	}

	xmin = max(0, xmin);
	xmax = min(outputWidth - 1, xmax);

	ymin = max(0, ymin);
	ymax = min(outputHeight - 1, ymax);

	int pInputWidth = inputWidth + 2 * pad_size;
	int pInputHeight = inputHeight + 2 * pad_size;

	int pdimyxc = pInputHeight * pInputWidth * nInputChannels;
	int pdimxc = pInputWidth * nInputChannels;
	int pdimc = nInputChannels;

	int tdimcyx = nOutputChannels * outputHeight * outputWidth;
	int tdimyx = outputHeight * outputWidth;
	int tdimx = outputWidth;

	int odimcyx = nInputChannels * inputHeight* inputWidth;
	int odimyx = inputHeight * inputWidth;
	int odimx = inputWidth;

	scalar_t nelems = kernel_size * kernel_size * nInputChannels;

	__shared__ scalar_t prod_sum[THREADS_PER_BLOCK];
	prod_sum[tch_off] = 0;

	for (int tc = tch_off; tc < nOutputChannels; tc += THREADS_PER_BLOCK) {

		int i2 = (tc % displacement_size - displacement_rad) * stride2;
		int j2 = (tc / displacement_size - displacement_rad) * stride2;

		int indx2 = n * pdimyxc + (y + j2)* pdimxc + (x + i2) * pdimc + c;

		scalar_t val2 = rInput2[indx2];

		for (int j = ymin; j <= ymax; ++j) {
			for (int i = xmin; i <= xmax; ++i) {
				int tindx = n * tdimcyx + tc * tdimyx + j * tdimx + i;
				prod_sum[tch_off] += gradOutput[tindx] * val2;
			}
		}
	}
	__syncthreads();

	if (tch_off == 0) {
		scalar_t reduce_sum = 0;
		for (int idx = 0; idx < THREADS_PER_BLOCK; idx++) {
			reduce_sum += prod_sum[idx];
		}
		const int indx1 = n * odimcyx + c * odimyx + (y - pad_size) * odimx + (x - pad_size);
		gradInput1[indx1] = reduce_sum / nelems;
	}

}

template <typename scalar_t>
__global__ void correlation_backward_input2(int item, scalar_t*  gradInput2, int nInputChannels, int inputHeight, int inputWidth,
	const scalar_t* __restrict__ gradOutput, int nOutputChannels, int outputHeight, int outputWidth,
	const scalar_t* __restrict__ rInput1,
	int pad_size,
	int kernel_size,
	int max_displacement,
	int stride1,
	int stride2)
{
	// n (batch size), c (num of channels), y (height), x (width)

	int n = item;
	int y = blockIdx.x * stride1 + pad_size;
	int x = blockIdx.y * stride1 + pad_size;
	int c = blockIdx.z;

	int tch_off = threadIdx.x;

	int kernel_rad = (kernel_size - 1) / 2;
	int displacement_rad = max_displacement / stride2;
	int displacement_size = 2 * displacement_rad + 1;

	int pInputWidth = inputWidth + 2 * pad_size;
	int pInputHeight = inputHeight + 2 * pad_size;

	int pdimyxc = pInputHeight * pInputWidth * nInputChannels;
	int pdimxc = pInputWidth * nInputChannels;
	int pdimc = nInputChannels;

	int tdimcyx = nOutputChannels * outputHeight * outputWidth;
	int tdimyx = outputHeight * outputWidth;
	int tdimx = outputWidth;

	int odimcyx = nInputChannels * inputHeight* inputWidth;
	int odimyx = inputHeight * inputWidth;
	int odimx = inputWidth;

	scalar_t nelems = kernel_size * kernel_size * nInputChannels;

	__shared__ scalar_t prod_sum[THREADS_PER_BLOCK];
	prod_sum[tch_off] = 0;

	for (int tc = tch_off; tc < nOutputChannels; tc += THREADS_PER_BLOCK) {
		int i2 = (tc % displacement_size - displacement_rad) * stride2;
		int j2 = (tc / displacement_size - displacement_rad) * stride2;

		int xmin = (x - kernel_rad - max_displacement - i2) / stride1;
		int ymin = (y - kernel_rad - max_displacement - j2) / stride1;

		int xmax = (x + kernel_rad - max_displacement - i2) / stride1;
		int ymax = (y + kernel_rad - max_displacement - j2) / stride1;

		if (xmax < 0 || ymax < 0 || xmin >= outputWidth || ymin >= outputHeight) {
			// assumes gradInput2 is pre-allocated and zero filled
			continue;
		}

		if (xmin > xmax || ymin > ymax) {
			// assumes gradInput2 is pre-allocated and zero filled
			continue;
		}

		xmin = max(0, xmin);
		xmax = min(outputWidth - 1, xmax);

		ymin = max(0, ymin);
		ymax = min(outputHeight - 1, ymax);

		int indx1 = n * pdimyxc + (y - j2)* pdimxc + (x - i2) * pdimc + c;
		scalar_t val1 = rInput1[indx1];

		for (int j = ymin; j <= ymax; ++j) {
			for (int i = xmin; i <= xmax; ++i) {
				int tindx = n * tdimcyx + tc * tdimyx + j * tdimx + i;
				prod_sum[tch_off] += gradOutput[tindx] * val1;
			}
		}
	}

	__syncthreads();

	if (tch_off == 0) {
		scalar_t reduce_sum = 0;
		for (int idx = 0; idx < THREADS_PER_BLOCK; idx++) {
			reduce_sum += prod_sum[idx];
		}
		const int indx2 = n * odimcyx + c * odimyx + (y - pad_size) * odimx + (x - pad_size);
		gradInput2[indx2] = reduce_sum / nelems;
	}

}

int correlation_forward_cuda_kernel(at::Tensor& output,
	int ob,
	int oc,
	int oh,
	int ow,
	int osb,
	int osc,
	int osh,
	int osw,

	at::Tensor& input1,
	int ic,
	int ih,
	int iw,
	int isb,
	int isc,
	int ish,
	int isw,

	at::Tensor& input2,
	int gc,
	int gsb,
	int gsc,
	int gsh,
	int gsw,

	at::Tensor& rInput1,
	at::Tensor& rInput2,
	int pad_size,
	int kernel_size,
	int max_displacement,
	int stride1,
	int stride2,
	int corr_type_multiply,
	cudaStream_t stream)
{

	int batchSize = ob;

	int nInputChannels = ic;
	int inputWidth = iw;
	int inputHeight = ih;

	int nOutputChannels = oc;
	int outputWidth = ow;
	int outputHeight = oh;

	dim3 blocks_grid(batchSize, inputHeight, inputWidth);
	dim3 threads_block(THREADS_PER_BLOCK);

	AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "channels_first_fwd_1", ([&] {

		channels_first<scalar_t> << <blocks_grid, threads_block, 0, stream >> >(
			input1.data<scalar_t>(), rInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth, pad_size);

	}));

	AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "channels_first_fwd_2", ([&] {

		channels_first<scalar_t> << <blocks_grid, threads_block, 0, stream >> > (
			input2.data<scalar_t>(), rInput2.data<scalar_t>(), nInputChannels, inputHeight, inputWidth, pad_size);

	}));

	dim3 threadsPerBlock(THREADS_PER_BLOCK);
	dim3 totalBlocksCorr(batchSize, outputHeight, outputWidth);

	AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "correlation_forward", ([&] {

		correlation_forward<scalar_t> << <totalBlocksCorr, threadsPerBlock, 0, stream >> >
			(output.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,
			rInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,
			rInput2.data<scalar_t>(),
			pad_size,
			kernel_size,
			max_displacement,
			stride1,
			stride2);

	}));

	cudaError_t err = cudaGetLastError();


	// check for errors
	if (err != cudaSuccess) {
		printf("error in correlation_forward_cuda_kernel: %s\n", cudaGetErrorString(err));
		return 0;
	}

	return 1;
}


int correlation_backward_cuda_kernel(
	at::Tensor& gradOutput,
	int gob,
	int goc,
	int goh,
	int gow,
	int gosb,
	int gosc,
	int gosh,
	int gosw,

	at::Tensor& input1,
	int ic,
	int ih,
	int iw,
	int isb,
	int isc,
	int ish,
	int isw,

	at::Tensor& input2,
	int gsb,
	int gsc,
	int gsh,
	int gsw,

	at::Tensor& gradInput1,
	int gisb,
	int gisc,
	int gish,
	int gisw,

	at::Tensor& gradInput2,
	int ggc,
	int ggsb,
	int ggsc,
	int ggsh,
	int ggsw,

	at::Tensor& rInput1,
	at::Tensor& rInput2,
	int pad_size,
	int kernel_size,
	int max_displacement,
	int stride1,
	int stride2,
	int corr_type_multiply,
	cudaStream_t stream)
{

	int batchSize = gob;
	int num = batchSize;

	int nInputChannels = ic;
	int inputWidth = iw;
	int inputHeight = ih;

	int nOutputChannels = goc;
	int outputWidth = gow;
	int outputHeight = goh;

	dim3 blocks_grid(batchSize, inputHeight, inputWidth);
	dim3 threads_block(THREADS_PER_BLOCK);


	AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "lltm_forward_cuda", ([&] {

		channels_first<scalar_t> << <blocks_grid, threads_block, 0, stream >> >(
			input1.data<scalar_t>(),
			rInput1.data<scalar_t>(),
			nInputChannels,
			inputHeight,
			inputWidth,
			pad_size
			);
	}));

	AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "lltm_forward_cuda", ([&] {

		channels_first<scalar_t> << <blocks_grid, threads_block, 0, stream >> >(
			input2.data<scalar_t>(),
			rInput2.data<scalar_t>(),
			nInputChannels,
			inputHeight,
			inputWidth,
			pad_size
			);
	}));

	dim3 threadsPerBlock(THREADS_PER_BLOCK);
	dim3 totalBlocksCorr(inputHeight, inputWidth, nInputChannels);

	for (int n = 0; n < num; ++n) {

		AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "lltm_forward_cuda", ([&] {


			correlation_backward_input1<scalar_t> << <totalBlocksCorr, threadsPerBlock, 0, stream >> > (
				n, gradInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,
				gradOutput.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,
				rInput2.data<scalar_t>(),
				pad_size,
				kernel_size,
				max_displacement,
				stride1,
				stride2);
		}));
	}

	for (int n = 0; n < batchSize; n++) {

		AT_DISPATCH_FLOATING_TYPES_AND_HALF(rInput1.type(), "lltm_forward_cuda", ([&] {

			correlation_backward_input2<scalar_t> << <totalBlocksCorr, threadsPerBlock, 0, stream >> >(
				n, gradInput2.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,
				gradOutput.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,
				rInput1.data<scalar_t>(),
				pad_size,
				kernel_size,
				max_displacement,
				stride1,
				stride2);

		}));
	}

	// check for errors
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("error in correlation_backward_cuda_kernel: %s\n", cudaGetErrorString(err));
		return 0;
	}

	return 1;
}


================================================
FILE: models/correlation_package_cu9/correlation_cuda_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <cuda_runtime.h>

int correlation_forward_cuda_kernel(at::Tensor& output,
    int ob,
    int oc,
    int oh,
    int ow,
    int osb,
    int osc,
    int osh,
    int osw,

    at::Tensor& input1,
    int ic,
    int ih,
    int iw,
    int isb,
    int isc,
    int ish,
    int isw,

    at::Tensor& input2,
    int gc,
    int gsb,
    int gsc,
    int gsh,
    int gsw,

    at::Tensor& rInput1,
    at::Tensor& rInput2,
    int pad_size,
    int kernel_size,
    int max_displacement,
    int stride1,
    int stride2,
    int corr_type_multiply,
    cudaStream_t stream);


int correlation_backward_cuda_kernel(   
    at::Tensor& gradOutput,
    int gob,
    int goc,
    int goh,
    int gow,
    int gosb,
    int gosc,
    int gosh,
    int gosw,

    at::Tensor& input1,
    int ic,
    int ih,
    int iw,
    int isb,
    int isc,
    int ish,
    int isw,

    at::Tensor& input2,
    int gsb,
    int gsc,
    int gsh,
    int gsw,

    at::Tensor& gradInput1, 
    int gisb,
    int gisc,
    int gish,
    int gisw,

    at::Tensor& gradInput2,
    int ggc,
    int ggsb,
    int ggsc,
    int ggsh,
    int ggsw,

    at::Tensor& rInput1,
    at::Tensor& rInput2,
    int pad_size,
    int kernel_size,
    int max_displacement,
    int stride1,
    int stride2,
    int corr_type_multiply,
    cudaStream_t stream);


================================================
FILE: models/correlation_package_cu9/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

cxx_args = ['-std=c++11']

nvcc_args = [
    '-gencode', 'arch=compute_50,code=sm_50',
    '-gencode', 'arch=compute_52,code=sm_52',
    '-gencode', 'arch=compute_60,code=sm_60',
    '-gencode', 'arch=compute_61,code=sm_61',
    '-gencode', 'arch=compute_61,code=compute_61',
    '-ccbin', '/usr/bin/gcc-4.9'
]

setup(
    name='correlation_cuda',
    ext_modules=[
        CUDAExtension('correlation_cuda', [
            'correlation_cuda.cc',
            'correlation_cuda_kernel.cu'
        ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args, 'cuda-path': ['/usr/local/cuda-9.0']})
    ],
    cmdclass={
        'build_ext': BuildExtension
    })


================================================
FILE: models/flownet1s.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn
from .flownet_modules import conv, deconv
from .flownet_modules import concatenate_as, upsample2d_as
from .flownet_modules import initialize_msra


class FlowNetS(nn.Module):
    def __init__(self, args):
        super(FlowNetS, self).__init__()

        def make_conv(in_planes, out_planes, kernel_size, stride):
            pad = kernel_size // 2
            return conv(in_planes, out_planes, kernel_size=kernel_size,
                        stride=stride, pad=pad, nonlinear=True, bias=True)

        self._conv1   = make_conv(   6,   64, kernel_size=7, stride=2)
        self._conv2   = make_conv(  64,  128, kernel_size=5, stride=2)
        self._conv3   = make_conv( 128,  256, kernel_size=5, stride=2)
        self._conv3_1 = make_conv( 256,  256, kernel_size=3, stride=1)
        self._conv4   = make_conv( 256,  512, kernel_size=3, stride=2)
        self._conv4_1 = make_conv( 512,  512, kernel_size=3, stride=1)
        self._conv5   = make_conv( 512,  512, kernel_size=3, stride=2)
        self._conv5_1 = make_conv( 512,  512, kernel_size=3, stride=1)
        self._conv6   = make_conv( 512, 1024, kernel_size=3, stride=2)
        self._conv6_1 = make_conv(1024, 1024, kernel_size=3, stride=1)

        def make_deconv(in_planes, out_planes):
            return deconv(in_planes, out_planes, kernel_size=4, stride=2, pad=1,
                          nonlinear=True, bias=False)

        self._deconv5 = make_deconv(1024    , 512)
        self._deconv4 = make_deconv(1024 + 2, 256)
        self._deconv3 = make_deconv( 768 + 2, 128)
        self._deconv2 = make_deconv( 384 + 2,  64)

        def make_predict(in_planes, out_planes):
            return conv(in_planes, out_planes, kernel_size=3, stride=1, pad=1,
                        nonlinear=False, bias=True)

        self._predict_flow6 = make_predict(1024    , 2)
        self._predict_flow5 = make_predict(1024 + 2, 2)
        self._predict_flow4 = make_predict( 768 + 2, 2)
        self._predict_flow3 = make_predict( 384 + 2, 2)
        self._predict_flow2 = make_predict( 192 + 2, 2)

        def make_upsample(in_planes, out_planes):
            return deconv(in_planes, out_planes, kernel_size=4, stride=2, pad=1,
                          nonlinear=False, bias=False)

        self._upsample_flow6_to_5 = make_upsample(2, 2)
        self._upsample_flow5_to_4 = make_upsample(2, 2)
        self._upsample_flow4_to_3 = make_upsample(2, 2)
        self._upsample_flow3_to_2 = make_upsample(2, 2)

        initialize_msra(self.modules())

    def forward(self, inputs):
        conv1 = self._conv1(inputs)
        conv2 = self._conv2(conv1)
        conv3_1 = self._conv3_1(self._conv3(conv2))
        conv4_1 = self._conv4_1(self._conv4(conv3_1))
        conv5_1 = self._conv5_1(self._conv5(conv4_1))
        conv6_1 = self._conv6_1(self._conv6(conv5_1))

        predict_flow6        = self._predict_flow6(conv6_1)

        upsampled_flow6_to_5 = self._upsample_flow6_to_5(predict_flow6)
        deconv5              = self._deconv5(conv6_1)
        concat5              = concatenate_as((conv5_1, deconv5, upsampled_flow6_to_5), conv5_1, dim=1)
        predict_flow5        = self._predict_flow5(concat5)

        upsampled_flow5_to_4 = self._upsample_flow5_to_4(predict_flow5)
        deconv4              = self._deconv4(concat5)
        concat4              = concatenate_as((conv4_1, deconv4, upsampled_flow5_to_4), conv4_1, dim=1)
        predict_flow4        = self._predict_flow4(concat4)

        upsampled_flow4_to_3 = self._upsample_flow4_to_3(predict_flow4)
        deconv3              = self._deconv3(concat4)
        concat3              = concatenate_as((conv3_1, deconv3, upsampled_flow4_to_3), conv3_1, dim=1)
        predict_flow3        = self._predict_flow3(concat3)

        upsampled_flow3_to_2 = self._upsample_flow3_to_2(predict_flow3)
        deconv2              = self._deconv2(concat3)
        concat2              = concatenate_as((conv2, deconv2, upsampled_flow3_to_2), conv2, dim=1)
        predict_flow2        = self._predict_flow2(concat2)

        if self.training:
            return predict_flow2, predict_flow3, predict_flow4, predict_flow5, predict_flow6
        else:
            return predict_flow2


class FlowNet1S(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(FlowNet1S, self).__init__()
        self._flownets = FlowNetS(args)
        self._div_flow = div_flow

    def forward(self, input_dict):
        im1 = input_dict['input1']
        im2 = input_dict['input2']
        inputs = torch.cat((im1, im2), dim=1)

        output_dict = {}
        if self.training:
            flow2, flow3, flow4, flow5, flow6 = self._flownets(inputs)
            output_dict['flow2'] = flow2
            output_dict['flow3'] = flow3
            output_dict['flow4'] = flow4
            output_dict['flow5'] = flow5
            output_dict['flow6'] = flow6
        else:
            flow2 = self._flownets(inputs)
            output_dict['flow1'] = (1.0 / self._div_flow) * upsample2d_as(flow2, im1, mode="bilinear")

        return output_dict


================================================
FILE: models/flownet1s_irr.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn
from .flownet_modules import conv, deconv
from .flownet_modules import concatenate_as, upsample2d_as
from .flownet_modules import initialize_msra
from .flownet_modules import WarpingLayer

class FlowNetS(nn.Module):
    def __init__(self, args):
        super(FlowNetS, self).__init__()

        def make_conv(in_planes, out_planes, kernel_size, stride):
            pad = kernel_size // 2
            return conv(in_planes, out_planes, kernel_size=kernel_size,
                        stride=stride, pad=pad, nonlinear=True, bias=True)

        self._conv3_1 = make_conv( 256,  256, kernel_size=3, stride=1)
        self._conv4   = make_conv( 256,  512, kernel_size=3, stride=2)
        self._conv4_1 = make_conv( 512,  512, kernel_size=3, stride=1)
        self._conv5   = make_conv( 512,  512, kernel_size=3, stride=2)
        self._conv5_1 = make_conv( 512,  512, kernel_size=3, stride=1)
        self._conv6   = make_conv( 512, 1024, kernel_size=3, stride=2)
        self._conv6_1 = make_conv(1024, 1024, kernel_size=3, stride=1)

        def make_deconv(in_planes, out_planes):
            return deconv(in_planes, out_planes, kernel_size=4, stride=2, pad=1,
                          nonlinear=True, bias=False)

        self._deconv5 = make_deconv(1024    , 512)
        self._deconv4 = make_deconv(1024 + 2, 256)
        self._deconv3 = make_deconv( 768 + 2, 128)
        self._deconv2 = make_deconv( 384 + 2,  64)

        def make_predict(in_planes, out_planes):
            return conv(in_planes, out_planes, kernel_size=3, stride=1, pad=1,
                        nonlinear=False, bias=True)

        self._predict_flow6 = make_predict(1024    , 2)
        self._predict_flow5 = make_predict(1024 + 2, 2)
        self._predict_flow4 = make_predict( 768 + 2, 2)
        self._predict_flow3 = make_predict( 384 + 2, 2)
        self._predict_flow2 = make_predict( 128 + 2, 2)

        def make_upsample(in_planes, out_planes):
            return deconv(in_planes, out_planes, kernel_size=4, stride=2, pad=1,
                          nonlinear=False, bias=False)

        self._upsample_flow6_to_5 = make_upsample(2, 2)
        self._upsample_flow5_to_4 = make_upsample(2, 2)
        self._upsample_flow4_to_3 = make_upsample(2, 2)
        self._upsample_flow3_to_2 = make_upsample(2, 2)

    def forward(self, conv2_im1, conv3_im1, conv3_im2):

        conv_concat3 = torch.cat((conv3_im1, conv3_im2), dim=1)

        conv3_1 = self._conv3_1(conv_concat3)
        conv4_1 = self._conv4_1(self._conv4(conv3_1))
        conv5_1 = self._conv5_1(self._conv5(conv4_1))
        conv6_1 = self._conv6_1(self._conv6(conv5_1))

        predict_flow6        = self._predict_flow6(conv6_1)

        upsampled_flow6_to_5 = self._upsample_flow6_to_5(predict_flow6)
        deconv5              = self._deconv5(conv6_1)
        concat5              = concatenate_as((conv5_1, deconv5, upsampled_flow6_to_5), conv5_1, dim=1)
        predict_flow5        = self._predict_flow5(concat5)

        upsampled_flow5_to_4 = self._upsample_flow5_to_4(predict_flow5)
        deconv4              = self._deconv4(concat5)
        concat4              = concatenate_as((conv4_1, deconv4, upsampled_flow5_to_4), conv4_1, dim=1)
        predict_flow4        = self._predict_flow4(concat4)

        upsampled_flow4_to_3 = self._upsample_flow4_to_3(predict_flow4)
        deconv3              = self._deconv3(concat4)
        concat3              = concatenate_as((conv3_1, deconv3, upsampled_flow4_to_3), conv3_1, dim=1)
        predict_flow3        = self._predict_flow3(concat3)

        upsampled_flow3_to_2 = self._upsample_flow3_to_2(predict_flow3)
        deconv2              = self._deconv2(concat3)
        concat2              = concatenate_as((conv2_im1, deconv2, upsampled_flow3_to_2), conv2_im1, dim=1)
        predict_flow2        = self._predict_flow2(concat2)

        return predict_flow2, predict_flow3, predict_flow4, predict_flow5, predict_flow6


class FlowNet1S(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(FlowNet1S, self).__init__()
        self._flownets = FlowNetS(args)
        self._warping_layer = WarpingLayer()
        self._div_flow = div_flow
        self._num_iters = args.num_iters

        def make_conv(in_planes, out_planes, kernel_size, stride):
            pad = kernel_size // 2
            return conv(in_planes, out_planes, kernel_size=kernel_size,
                        stride=stride, pad=pad, nonlinear=True, bias=True)

        self._conv1   = make_conv(   3,   32, kernel_size=7, stride=2)
        self._conv2   = make_conv(  32,   64, kernel_size=5, stride=2)
        self._conv3   = make_conv(  64,  128, kernel_size=5, stride=2)

        initialize_msra(self.modules())

    def forward(self, input_dict):

        im1 = input_dict['input1']
        im2 = input_dict['input2']

        conv1_im1 = self._conv1(im1)
        conv2_im1 = self._conv2(conv1_im1)
        conv3_im1 = self._conv3(conv2_im1)

        conv1_im2 = self._conv1(im2)
        conv2_im2 = self._conv2(conv1_im2)
        conv3_im2_orig = self._conv3(conv2_im2)
        conv3_im2 = conv3_im2_orig

        output_dict = {}        
        output_dict['flow2'] = []
        output_dict['flow3'] = []
        output_dict['flow4'] = []
        output_dict['flow5'] = []
        output_dict['flow6'] = []
        
        _, _, height_im, width_im = im1.size()

        # for iterative
        for ii in range(0, self._num_iters):
            flow2, flow3, flow4, flow5, flow6 = self._flownets(conv2_im1, conv3_im1, conv3_im2)

            if ii == 0:
                output_dict['flow2'].append(flow2)
                output_dict['flow3'].append(flow3)
                output_dict['flow4'].append(flow4)
                output_dict['flow5'].append(flow5)
                output_dict['flow6'].append(flow6)
            else:
                output_dict['flow2'].append(flow2 + output_dict['flow2'][ii - 1])
                output_dict['flow3'].append(flow3 + output_dict['flow3'][ii - 1])
                output_dict['flow4'].append(flow4 + output_dict['flow4'][ii - 1])
                output_dict['flow5'].append(flow5 + output_dict['flow5'][ii - 1])
                output_dict['flow6'].append(flow6 + output_dict['flow6'][ii - 1])

            if ii < (self._num_iters - 1):
                up_flow = upsample2d_as(output_dict['flow2'][ii], conv3_im2_orig, mode="bilinear")
                conv3_im2 = self._warping_layer(conv3_im2_orig, up_flow, height_im, width_im, self._div_flow)

        if self.training:
            return output_dict
        else:
            output_dict_eval = {}
            up_flow_final = upsample2d_as(output_dict['flow2'][self._num_iters - 1], im1, mode="bilinear")
            output_dict_eval['flow1'] = (1.0 / self._div_flow) * up_flow_final
            return output_dict_eval

================================================
FILE: models/flownet1s_irr_bi.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn
from .flownet_modules import conv, deconv
from .flownet_modules import concatenate_as, upsample2d_as
from .flownet_modules import initialize_msra
from .flownet_modules import WarpingLayer

class FlowNetS(nn.Module):
    def __init__(self, args):
        super(FlowNetS, self).__init__()

        def make_conv(in_planes, out_planes, kernel_size, stride):
            pad = kernel_size // 2
            return conv(in_planes, out_planes, kernel_size=kernel_size,
                        stride=stride, pad=pad, nonlinear=True, bias=True)

        self._conv3_1 = make_conv( 256,  256, kernel_size=3, stride=1)
        self._conv4   = make_conv( 256,  512, kernel_size=3, stride=2)
        self._conv4_1 = make_conv( 512,  512, kernel_size=3, stride=1)
        self._conv5   = make_conv( 512,  512, kernel_size=3, stride=2)
        self._conv5_1 = make_conv( 512,  512, kernel_size=3, stride=1)
        self._conv6   = make_conv( 512, 1024, kernel_size=3, stride=2)
        self._conv6_1 = make_conv(1024, 1024, kernel_size=3, stride=1)

        def make_deconv(in_planes, out_planes):
            return deconv(in_planes, out_planes, kernel_size=4, stride=2, pad=1,
                          nonlinear=True, bias=False)

        self._deconv5 = make_deconv(1024    , 512)
        self._deconv4 = make_deconv(1024 + 2, 256)
        self._deconv3 = make_deconv( 768 + 2, 128)
        self._deconv2 = make_deconv( 384 + 2,  64)

        def make_predict(in_planes, out_planes):
            return conv(in_planes, out_planes, kernel_size=3, stride=1, pad=1,
                        nonlinear=False, bias=True)

        self._predict_flow6 = make_predict(1024    , 2)
        self._predict_flow5 = make_predict(1024 + 2, 2)
        self._predict_flow4 = make_predict( 768 + 2, 2)
        self._predict_flow3 = make_predict( 384 + 2, 2)
        self._predict_flow2 = make_predict( 128 + 2, 2)

        def make_upsample(in_planes, out_planes):
            return deconv(in_planes, out_planes, kernel_size=4, stride=2, pad=1,
                          nonlinear=False, bias=False)

        self._upsample_flow6_to_5 = make_upsample(2, 2)
        self._upsample_flow5_to_4 = make_upsample(2, 2)
        self._upsample_flow4_to_3 = make_upsample(2, 2)
        self._upsample_flow3_to_2 = make_upsample(2, 2)

    def forward(self, conv2_im1, conv3_im1, conv3_im2):

        conv_concat3 = torch.cat((conv3_im1, conv3_im2), dim=1)

        conv3_1 = self._conv3_1(conv_concat3)
        conv4_1 = self._conv4_1(self._conv4(conv3_1))
        conv5_1 = self._conv5_1(self._conv5(conv4_1))
        conv6_1 = self._conv6_1(self._conv6(conv5_1))

        # Flow Decoder
        predict_flow6        = self._predict_flow6(conv6_1)

        upsampled_flow6_to_5 = self._upsample_flow6_to_5(predict_flow6)
        deconv5              = self._deconv5(conv6_1)
        concat5              = concatenate_as((conv5_1, deconv5, upsampled_flow6_to_5), conv5_1, dim=1)
        predict_flow5        = self._predict_flow5(concat5)

        upsampled_flow5_to_4 = self._upsample_flow5_to_4(predict_flow5)
        deconv4              = self._deconv4(concat5)
        concat4              = concatenate_as((conv4_1, deconv4, upsampled_flow5_to_4), conv4_1, dim=1)
        predict_flow4        = self._predict_flow4(concat4)

        upsampled_flow4_to_3 = self._upsample_flow4_to_3(predict_flow4)
        deconv3              = self._deconv3(concat4)
        concat3              = concatenate_as((conv3_1, deconv3, upsampled_flow4_to_3), conv3_1, dim=1)
        predict_flow3        = self._predict_flow3(concat3)

        upsampled_flow3_to_2 = self._upsample_flow3_to_2(predict_flow3)
        deconv2              = self._deconv2(concat3)
        concat2              = concatenate_as((conv2_im1, deconv2, upsampled_flow3_to_2), conv2_im1, dim=1)
        predict_flow2        = self._predict_flow2(concat2)

        return predict_flow2, predict_flow3, predict_flow4, predict_flow5, predict_flow6


class FlowNet1S(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(FlowNet1S, self).__init__()
        self._flownets = FlowNetS(args)
        self._warping_layer = WarpingLayer()
        self._div_flow = div_flow
        self._num_iters = args.num_iters     

        def make_conv(in_planes, out_planes, kernel_size, stride):
            pad = kernel_size // 2
            return conv(in_planes, out_planes, kernel_size=kernel_size,
                        stride=stride, pad=pad, nonlinear=True, bias=True)

        self._conv1   = make_conv(   3,   32, kernel_size=7, stride=2)
        self._conv2   = make_conv(  32,   64, kernel_size=5, stride=2)
        self._conv3   = make_conv(  64,  128, kernel_size=5, stride=2)

        initialize_msra(self.modules())

    def forward(self, input_dict):
        im1 = input_dict['input1']
        im2 = input_dict['input2']

        conv1_im1 = self._conv1(im1)
        conv2_im1 = self._conv2(conv1_im1)
        conv3_im1 = self._conv3(conv2_im1)
        conv2_im1_wp = conv2_im1
        conv3_im1_wp = conv3_im1

        conv1_im2 = self._conv1(im2)
        conv2_im2 = self._conv2(conv1_im2)
        conv3_im2 = self._conv3(conv2_im2)
        conv2_im2_wp = conv2_im2
        conv3_im2_wp = conv3_im2

        out_dict = {}
        out_dict['flow2'] = []
        out_dict['flow3'] = []
        out_dict['flow4'] = []
        out_dict['flow5'] = []
        out_dict['flow6'] = []

        _, _, height_im, width_im = im1.size()

        # for iterative
        for ii in range(0, self._num_iters):
            flo2_f, flo3_f, flo4_f, flo5_f, flo6_f = self._flownets(conv2_im1, conv3_im1, conv3_im2_wp)
            flo2_b, flo3_b, flo4_b, flo5_b, flo6_b = self._flownets(conv2_im2, conv3_im2, conv3_im1_wp)

            if ii == 0:
                out_dict['flow2'].append([flo2_f, flo2_b])
                out_dict['flow3'].append([flo3_f, flo3_b])
                out_dict['flow4'].append([flo4_f, flo4_b])
                out_dict['flow5'].append([flo5_f, flo5_b])
                out_dict['flow6'].append([flo6_f, flo6_b])
            else:
                out_dict['flow2'].append([flo2_f + out_dict['flow2'][ii - 1][0], flo2_b + out_dict['flow2'][ii - 1][1]])
                out_dict['flow3'].append([flo3_f + out_dict['flow3'][ii - 1][0], flo3_b + out_dict['flow3'][ii - 1][1]])
                out_dict['flow4'].append([flo4_f + out_dict['flow4'][ii - 1][0], flo4_b + out_dict['flow4'][ii - 1][1]])
                out_dict['flow5'].append([flo5_f + out_dict['flow5'][ii - 1][0], flo5_b + out_dict['flow5'][ii - 1][1]])
                out_dict['flow6'].append([flo6_f + out_dict['flow6'][ii - 1][0], flo6_b + out_dict['flow6'][ii - 1][1]])

            if ii < (self._num_iters - 1):
                up_flow_f_c3 = upsample2d_as(out_dict['flow2'][ii][0], conv3_im2, mode="bilinear")
                up_flow_b_c3 = upsample2d_as(out_dict['flow2'][ii][1], conv3_im1, mode="bilinear")
                conv3_im2_wp = self._warping_layer(conv3_im2, up_flow_f_c3, height_im, width_im, self._div_flow)
                conv3_im1_wp = self._warping_layer(conv3_im1, up_flow_b_c3, height_im, width_im, self._div_flow)

        if self.training:
            return out_dict
        else:
            out_dict_eval = {}
            up_flow_final = upsample2d_as(out_dict['flow2'][self._num_iters - 1][0], im1, mode="bilinear")
            out_dict_eval['flow1'] = (1.0 / self._div_flow) * up_flow_final
            return out_dict_eval


================================================
FILE: models/flownet1s_irr_occ.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn
from .flownet_modules import conv, deconv
from .flownet_modules import concatenate_as, upsample2d_as
from .flownet_modules import initialize_msra
from .flownet_modules import WarpingLayer

class FlowNetS(nn.Module):
    def __init__(self, args):
        super(FlowNetS, self).__init__()

        def make_conv(in_planes, out_planes, kernel_size, stride):
            pad = kernel_size // 2
            return conv(in_planes, out_planes, kernel_size=kernel_size,
                        stride=stride, pad=pad, nonlinear=True, bias=True)

        self._conv3_1 = make_conv( 256,  256, kernel_size=3, stride=1)
        self._conv4   = make_conv( 256,  512, kernel_size=3, stride=2)
        self._conv4_1 = make_conv( 512,  512, kernel_size=3, stride=1)
        self._conv5   = make_conv( 512,  512, kernel_size=3, stride=2)
        self._conv5_1 = make_conv( 512,  512, kernel_size=3, stride=1)
        self._conv6   = make_conv( 512, 1024, kernel_size=3, stride=2)
        self._conv6_1 = make_conv(1024, 1024, kernel_size=3, stride=1)

        def make_deconv(in_planes, out_planes):
            return deconv(in_planes, out_planes, kernel_size=4, stride=2, pad=1,
                          nonlinear=True, bias=False)

        self._deconv5 = make_deconv(1024    , 512)
        self._deconv4 = make_deconv(1024 + 2, 256)
        self._deconv3 = make_deconv( 768 + 2, 128)
        self._deconv2 = make_deconv( 384 + 2,  64)

        self._deconv_occ5 = make_deconv(1024    , 512)
        self._deconv_occ4 = make_deconv(1024 + 1, 256)
        self._deconv_occ3 = make_deconv( 768 + 1, 128)
        self._deconv_occ2 = make_deconv( 384 + 1,  64)

        def make_predict(in_planes, out_planes):
            return conv(in_planes, out_planes, kernel_size=3, stride=1, pad=1,
                        nonlinear=False, bias=True)

        self._predict_flow6 = make_predict(1024    , 2)
        self._predict_flow5 = make_predict(1024 + 2, 2)
        self._predict_flow4 = make_predict( 768 + 2, 2)
        self._predict_flow3 = make_predict( 384 + 2, 2)
        self._predict_flow2 = make_predict( 128 + 2, 2)

        self._predict_occ6 = make_predict(1024    , 1)
        self._predict_occ5 = make_predict(1024 + 1, 1)
        self._predict_occ4 = make_predict( 768 + 1, 1)
        self._predict_occ3 = make_predict( 384 + 1, 1)
        self._predict_occ2 = make_predict( 128 + 1, 1)

        def make_upsample(in_planes, out_planes):
            return deconv(in_planes, out_planes, kernel_size=4, stride=2, pad=1,
                          nonlinear=False, bias=False)

        self._upsample_flow6_to_5 = make_upsample(2, 2)
        self._upsample_flow5_to_4 = make_upsample(2, 2)
        self._upsample_flow4_to_3 = make_upsample(2, 2)
        self._upsample_flow3_to_2 = make_upsample(2, 2)

        self._upsample_occ6_to_5 = make_upsample(1, 1)
        self._upsample_occ5_to_4 = make_upsample(1, 1)
        self._upsample_occ4_to_3 = make_upsample(1, 1)
        self._upsample_occ3_to_2 = make_upsample(1, 1)

    def forward(self, conv2_im1, conv3_im1, conv3_im2):

        conv_concat3 = torch.cat((conv3_im1, conv3_im2), dim=1)

        conv3_1 = self._conv3_1(conv_concat3)
        conv4_1 = self._conv4_1(self._conv4(conv3_1))
        conv5_1 = self._conv5_1(self._conv5(conv4_1))
        conv6_1 = self._conv6_1(self._conv6(conv5_1))

        # Flow Decoder
        predict_flow6        = self._predict_flow6(conv6_1)

        upsampled_flow6_to_5 = self._upsample_flow6_to_5(predict_flow6)
        deconv5              = self._deconv5(conv6_1)
        concat5              = concatenate_as((conv5_1, deconv5, upsampled_flow6_to_5), conv5_1, dim=1)
        predict_flow5        = self._predict_flow5(concat5)

        upsampled_flow5_to_4 = self._upsample_flow5_to_4(predict_flow5)
        deconv4              = self._deconv4(concat5)
        concat4              = concatenate_as((conv4_1, deconv4, upsampled_flow5_to_4), conv4_1, dim=1)
        predict_flow4        = self._predict_flow4(concat4)

        upsampled_flow4_to_3 = self._upsample_flow4_to_3(predict_flow4)
        deconv3              = self._deconv3(concat4)
        concat3              = concatenate_as((conv3_1, deconv3, upsampled_flow4_to_3), conv3_1, dim=1)
        predict_flow3        = self._predict_flow3(concat3)

        upsampled_flow3_to_2 = self._upsample_flow3_to_2(predict_flow3)
        deconv2              = self._deconv2(concat3)
        concat2              = concatenate_as((conv2_im1, deconv2, upsampled_flow3_to_2), conv2_im1, dim=1)
        predict_flow2        = self._predict_flow2(concat2)

        # Occ Decoder
        predict_occ6 = self._predict_occ6(conv6_1)

        upsampled_occ6_to_5 = self._upsample_occ6_to_5(predict_occ6)
        deconv_occ5         = self._deconv_occ5(conv6_1)
        concat_occ5         = concatenate_as((conv5_1, deconv_occ5, upsampled_occ6_to_5), conv5_1, dim=1)
        predict_occ5        = self._predict_occ5(concat_occ5)

        upsampled_occ5_to_4 = self._upsample_occ5_to_4(predict_occ5)
        deconv_occ4         = self._deconv_occ4(concat_occ5)
        concat_occ4         = concatenate_as((conv4_1, deconv_occ4, upsampled_occ5_to_4), conv4_1, dim=1)
        predict_occ4        = self._predict_occ4(concat_occ4)

        upsampled_occ4_to_3 = self._upsample_occ4_to_3(predict_occ4)
        deconv_occ3         = self._deconv_occ3(concat_occ4)
        concat_occ3         = concatenate_as((conv3_1, deconv_occ3, upsampled_occ4_to_3), conv3_1, dim=1)
        predict_occ3        = self._predict_occ3(concat_occ3)

        upsampled_occ3_to_2 = self._upsample_occ3_to_2(predict_occ3)
        deconv_occ2         = self._deconv_occ2(concat_occ3)
        concat_occ2         = concatenate_as((conv2_im1, deconv_occ2, upsampled_occ3_to_2), conv2_im1, dim=1)
        predict_occ2        = self._predict_occ2(concat_occ2)

        return predict_flow2, predict_flow3, predict_flow4, predict_flow5, predict_flow6, predict_occ2, predict_occ3, predict_occ4, predict_occ5, predict_occ6


class FlowNet1S(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(FlowNet1S, self).__init__()
        self._flownets = FlowNetS(args)
        self._warping_layer = WarpingLayer()
        self._div_flow = div_flow
        self._num_iters = args.num_iters

        def make_conv(in_planes, out_planes, kernel_size, stride):
            pad = kernel_size // 2
            return conv(in_planes, out_planes, kernel_size=kernel_size,
                        stride=stride, pad=pad, nonlinear=True, bias=True)

        self._conv1   = make_conv(   3,   32, kernel_size=7, stride=2)
        self._conv2   = make_conv(  32,   64, kernel_size=5, stride=2)
        self._conv3   = make_conv(  64,  128, kernel_size=5, stride=2)

        initialize_msra(self.modules())

    def forward(self, input_dict):
        im1 = input_dict['input1']
        im2 = input_dict['input2']

        conv1_im1 = self._conv1(im1)
        conv2_im1 = self._conv2(conv1_im1)
        conv3_im1 = self._conv3(conv2_im1)

        conv1_im2 = self._conv1(im2)
        conv2_im2 = self._conv2(conv1_im2)
        conv3_im2 = self._conv3(conv2_im2)
        conv3_im2_wp = conv3_im2

        output_dict = {}
        output_dict['flow2'] = []
        output_dict['flow3'] = []
        output_dict['flow4'] = []
        output_dict['flow5'] = []
        output_dict['flow6'] = []
        output_dict['occ2'] = []
        output_dict['occ3'] = []
        output_dict['occ4'] = []
        output_dict['occ5'] = []
        output_dict['occ6'] = []

        _, _, height_im, width_im = im1.size()
        
        # for iterative
        for ii in range(0, self._num_iters):
            flow2, flow3, flow4, flow5, flow6, occ2, occ3, occ4, occ5, occ6 = self._flownets(conv2_im1, conv3_im1, conv3_im2_wp)

            if ii == 0:
                output_dict['flow2'].append(flow2)
                output_dict['flow3'].append(flow3)
                output_dict['flow4'].append(flow4)
                output_dict['flow5'].append(flow5)
                output_dict['flow6'].append(flow6)
                output_dict['occ2'].append(occ2)
                output_dict['occ3'].append(occ3)
                output_dict['occ4'].append(occ4)
                output_dict['occ5'].append(occ5)
                output_dict['occ6'].append(occ6)
            else:
                output_dict['flow2'].append(flow2 + output_dict['flow2'][ii - 1])
                output_dict['flow3'].append(flow3 + output_dict['flow3'][ii - 1])
                output_dict['flow4'].append(flow4 + output_dict['flow4'][ii - 1])
                output_dict['flow5'].append(flow5 + output_dict['flow5'][ii - 1])
                output_dict['flow6'].append(flow6 + output_dict['flow6'][ii - 1])
                output_dict['occ2'].append(occ2 + output_dict['occ2'][ii - 1])
                output_dict['occ3'].append(occ3 + output_dict['occ3'][ii - 1])
                output_dict['occ4'].append(occ4 + output_dict['occ4'][ii - 1])
                output_dict['occ5'].append(occ5 + output_dict['occ5'][ii - 1])
                output_dict['occ6'].append(occ6 + output_dict['occ6'][ii - 1])

            if ii < (self._num_iters - 1):
                up_flow = upsample2d_as(output_dict['flow2'][ii], conv3_im2, mode="bilinear")
                conv3_im2_wp = self._warping_layer(conv3_im2, up_flow, height_im, width_im, self._div_flow)     

        if self.training:
            return output_dict
        else:
            output_dict_eval = {}
            up_flow_final = upsample2d_as(output_dict['flow2'][self._num_iters - 1], im1, mode="bilinear")  
            up_occ_final = upsample2d_as(output_dict['occ2'][self._num_iters - 1], im1, mode="bilinear")
            output_dict_eval['flow1'] = (1.0 / self._div_flow) * up_flow_final
            output_dict_eval['occ1'] = up_occ_final
            return output_dict_eval

================================================
FILE: models/flownet1s_irr_occ_bi.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn
from .flownet_modules import conv, deconv
from .flownet_modules import concatenate_as, upsample2d_as
from .flownet_modules import initialize_msra
from .flownet_modules import WarpingLayer

class FlowNetS(nn.Module):
    def __init__(self, args):
        super(FlowNetS, self).__init__()

        def make_conv(in_planes, out_planes, kernel_size, stride):
            pad = kernel_size // 2
            return conv(in_planes, out_planes, kernel_size=kernel_size,
                        stride=stride, pad=pad, nonlinear=True, bias=True)

        self._conv3_1 = make_conv( 256,  256, kernel_size=3, stride=1)
        self._conv4   = make_conv( 256,  512, kernel_size=3, stride=2)
        self._conv4_1 = make_conv( 512,  512, kernel_size=3, stride=1)
        self._conv5   = make_conv( 512,  512, kernel_size=3, stride=2)
        self._conv5_1 = make_conv( 512,  512, kernel_size=3, stride=1)
        self._conv6   = make_conv( 512, 1024, kernel_size=3, stride=2)
        self._conv6_1 = make_conv(1024, 1024, kernel_size=3, stride=1)

        def make_deconv(in_planes, out_planes):
            return deconv(in_planes, out_planes, kernel_size=4, stride=2, pad=1,
                          nonlinear=True, bias=False)

        self._deconv5 = make_deconv(1024    , 512)
        self._deconv4 = make_deconv(1024 + 2, 256)
        self._deconv3 = make_deconv( 768 + 2, 128)
        self._deconv2 = make_deconv( 384 + 2,  64)

        self._deconv_occ5 = make_deconv(1024    , 512)
        self._deconv_occ4 = make_deconv(1024 + 1, 256)
        self._deconv_occ3 = make_deconv( 768 + 1, 128)
        self._deconv_occ2 = make_deconv( 384 + 1,  64)

        def make_predict(in_planes, out_planes):
            return conv(in_planes, out_planes, kernel_size=3, stride=1, pad=1,
                        nonlinear=False, bias=True)

        self._predict_flow6 = make_predict(1024    , 2)
        self._predict_flow5 = make_predict(1024 + 2, 2)
        self._predict_flow4 = make_predict( 768 + 2, 2)
        self._predict_flow3 = make_predict( 384 + 2, 2)
        self._predict_flow2 = make_predict( 128 + 2, 2)

        self._predict_occ6 = make_predict(1024    , 1)
        self._predict_occ5 = make_predict(1024 + 1, 1)
        self._predict_occ4 = make_predict( 768 + 1, 1)
        self._predict_occ3 = make_predict( 384 + 1, 1)
        self._predict_occ2 = make_predict( 128 + 1, 1)

        def make_upsample(in_planes, out_planes):
            return deconv(in_planes, out_planes, kernel_size=4, stride=2, pad=1,
                          nonlinear=False, bias=False)

        self._upsample_flow6_to_5 = make_upsample(2, 2)
        self._upsample_flow5_to_4 = make_upsample(2, 2)
        self._upsample_flow4_to_3 = make_upsample(2, 2)
        self._upsample_flow3_to_2 = make_upsample(2, 2)

        self._upsample_occ6_to_5 = make_upsample(1, 1)
        self._upsample_occ5_to_4 = make_upsample(1, 1)
        self._upsample_occ4_to_3 = make_upsample(1, 1)
        self._upsample_occ3_to_2 = make_upsample(1, 1)

    def forward(self, conv2_im1, conv3_im1, conv3_im2):

        conv_concat3 = torch.cat((conv3_im1, conv3_im2), dim=1)

        conv3_1 = self._conv3_1(conv_concat3)
        conv4_1 = self._conv4_1(self._conv4(conv3_1))
        conv5_1 = self._conv5_1(self._conv5(conv4_1))
        conv6_1 = self._conv6_1(self._conv6(conv5_1))

        # Flow Decoder
        predict_flow6        = self._predict_flow6(conv6_1)

        upsampled_flow6_to_5 = self._upsample_flow6_to_5(predict_flow6)
        deconv5              = self._deconv5(conv6_1)
        concat5              = concatenate_as((conv5_1, deconv5, upsampled_flow6_to_5), conv5_1, dim=1)
        predict_flow5        = self._predict_flow5(concat5)

        upsampled_flow5_to_4 = self._upsample_flow5_to_4(predict_flow5)
        deconv4              = self._deconv4(concat5)
        concat4              = concatenate_as((conv4_1, deconv4, upsampled_flow5_to_4), conv4_1, dim=1)
        predict_flow4        = self._predict_flow4(concat4)

        upsampled_flow4_to_3 = self._upsample_flow4_to_3(predict_flow4)
        deconv3              = self._deconv3(concat4)
        concat3              = concatenate_as((conv3_1, deconv3, upsampled_flow4_to_3), conv3_1, dim=1)
        predict_flow3        = self._predict_flow3(concat3)

        upsampled_flow3_to_2 = self._upsample_flow3_to_2(predict_flow3)
        deconv2              = self._deconv2(concat3)
        concat2              = concatenate_as((conv2_im1, deconv2, upsampled_flow3_to_2), conv2_im1, dim=1)
        predict_flow2        = self._predict_flow2(concat2)

        # Occ Decoder
        predict_occ6 = self._predict_occ6(conv6_1)

        upsampled_occ6_to_5 = self._upsample_occ6_to_5(predict_occ6)
        deconv_occ5         = self._deconv_occ5(conv6_1)
        concat_occ5         = concatenate_as((conv5_1, deconv_occ5, upsampled_occ6_to_5), conv5_1, dim=1)
        predict_occ5        = self._predict_occ5(concat_occ5)

        upsampled_occ5_to_4 = self._upsample_occ5_to_4(predict_occ5)
        deconv_occ4         = self._deconv_occ4(concat_occ5)
        concat_occ4         = concatenate_as((conv4_1, deconv_occ4, upsampled_occ5_to_4), conv4_1, dim=1)
        predict_occ4        = self._predict_occ4(concat_occ4)

        upsampled_occ4_to_3 = self._upsample_occ4_to_3(predict_occ4)
        deconv_occ3         = self._deconv_occ3(concat_occ4)
        concat_occ3         = concatenate_as((conv3_1, deconv_occ3, upsampled_occ4_to_3), conv3_1, dim=1)
        predict_occ3        = self._predict_occ3(concat_occ3)

        upsampled_occ3_to_2 = self._upsample_occ3_to_2(predict_occ3)
        deconv_occ2         = self._deconv_occ2(concat_occ3)
        concat_occ2         = concatenate_as((conv2_im1, deconv_occ2, upsampled_occ3_to_2), conv2_im1, dim=1)
        predict_occ2        = self._predict_occ2(concat_occ2)

        return predict_flow2, predict_flow3, predict_flow4, predict_flow5, predict_flow6, predict_occ2, predict_occ3, predict_occ4, predict_occ5, predict_occ6


class FlowNet1S(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(FlowNet1S, self).__init__()
        self._flownets = FlowNetS(args)
        self._warping_layer = WarpingLayer()
        self._div_flow = div_flow
        self._num_iters = args.num_iters

        def make_conv(in_planes, out_planes, kernel_size, stride):
            pad = kernel_size // 2
            return conv(in_planes, out_planes, kernel_size=kernel_size,
                        stride=stride, pad=pad, nonlinear=True, bias=True)

        self._conv1   = make_conv(   3,   32, kernel_size=7, stride=2)
        self._conv2   = make_conv(  32,   64, kernel_size=5, stride=2)
        self._conv3   = make_conv(  64,  128, kernel_size=5, stride=2)

        initialize_msra(self.modules())

    def forward(self, input_dict):
        im1 = input_dict['input1']
        im2 = input_dict['input2']

        conv1_im1 = self._conv1(im1)
        conv2_im1 = self._conv2(conv1_im1)
        conv3_im1 = self._conv3(conv2_im1)
        conv3_im1_wp = conv3_im1

        conv1_im2 = self._conv1(im2)
        conv2_im2 = self._conv2(conv1_im2)
        conv3_im2 = self._conv3(conv2_im2)
        conv3_im2_wp = conv3_im2

        out_dict = {}        
        out_dict['flow2'] = []
        out_dict['flow3'] = []
        out_dict['flow4'] = []
        out_dict['flow5'] = []
        out_dict['flow6'] = []
        out_dict['occ2'] = []
        out_dict['occ3'] = []
        out_dict['occ4'] = []
        out_dict['occ5'] = []
        out_dict['occ6'] = []

        _, _, height_im, width_im = im1.size()

        # for iterative
        for ii in range(0, self._num_iters):
            flo2_f, flo3_f, flo4_f, flo5_f, flo6_f, occ2_f, occ3_f, occ4_f, occ5_f, occ6_f = self._flownets(conv2_im1,
                                                                                                            conv3_im1,
                                                                                                            conv3_im2_wp)
            flo2_b, flo3_b, flo4_b, flo5_b, flo6_b, occ2_b, occ3_b, occ4_b, occ5_b, occ6_b = self._flownets(conv2_im2,
                                                                                                            conv3_im2,
                                                                                                            conv3_im1_wp)

            if ii == 0:
                out_dict['flow2'].append([flo2_f, flo2_b])
                out_dict['flow3'].append([flo3_f, flo3_b])
                out_dict['flow4'].append([flo4_f, flo4_b])
                out_dict['flow5'].append([flo5_f, flo5_b])
                out_dict['flow6'].append([flo6_f, flo6_b])
                out_dict['occ2'].append([occ2_f, occ2_b])
                out_dict['occ3'].append([occ3_f, occ3_b])
                out_dict['occ4'].append([occ4_f, occ4_b])
                out_dict['occ5'].append([occ5_f, occ5_b])
                out_dict['occ6'].append([occ6_f, occ6_b])
            else:
                out_dict['flow2'].append([flo2_f + out_dict['flow2'][ii - 1][0], flo2_b + out_dict['flow2'][ii - 1][1]])
                out_dict['flow3'].append([flo3_f + out_dict['flow3'][ii - 1][0], flo3_b + out_dict['flow3'][ii - 1][1]])
                out_dict['flow4'].append([flo4_f + out_dict['flow4'][ii - 1][0], flo4_b + out_dict['flow4'][ii - 1][1]])
                out_dict['flow5'].append([flo5_f + out_dict['flow5'][ii - 1][0], flo5_b + out_dict['flow5'][ii - 1][1]])
                out_dict['flow6'].append([flo6_f + out_dict['flow6'][ii - 1][0], flo6_b + out_dict['flow6'][ii - 1][1]])
                out_dict['occ2'].append([occ2_f + out_dict['occ2'][ii - 1][0], occ2_b + out_dict['occ2'][ii - 1][1]])
                out_dict['occ3'].append([occ3_f + out_dict['occ3'][ii - 1][0], occ3_b + out_dict['occ3'][ii - 1][1]])
                out_dict['occ4'].append([occ4_f + out_dict['occ4'][ii - 1][0], occ4_b + out_dict['occ4'][ii - 1][1]])
                out_dict['occ5'].append([occ5_f + out_dict['occ5'][ii - 1][0], occ5_b + out_dict['occ5'][ii - 1][1]])
                out_dict['occ6'].append([occ6_f + out_dict['occ6'][ii - 1][0], occ6_b + out_dict['occ6'][ii - 1][1]])

            if ii < (self._num_iters - 1):
                up_flow_f_c3 = upsample2d_as(out_dict['flow2'][ii][0], conv3_im2, mode="bilinear")
                up_flow_b_c3 = upsample2d_as(out_dict['flow2'][ii][1], conv3_im1, mode="bilinear")
                conv3_im2_wp = self._warping_layer(conv3_im2, up_flow_f_c3, height_im, width_im, self._div_flow)
                conv3_im1_wp = self._warping_layer(conv3_im1, up_flow_b_c3, height_im, width_im, self._div_flow)

        if self.training:
            return out_dict
        else:
            out_dict_eval = {}
            up_flow_final = upsample2d_as(out_dict['flow2'][self._num_iters - 1][0], im1, mode="bilinear")
            up_occ_final = upsample2d_as(out_dict['occ2'][self._num_iters - 1][0], im1, mode="bilinear")
            out_dict_eval['flow1'] = (1.0 / self._div_flow) * up_flow_final
            out_dict_eval['occ1'] = up_occ_final
            return out_dict_eval


================================================
FILE: models/flownet_modules.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn
import torch.nn.functional as tf
import logging


def conv(in_planes, out_planes, kernel_size, stride, pad, nonlinear, bias):
    if nonlinear:
        return nn.Sequential(
            nn.Conv2d(
                in_planes, out_planes, kernel_size=kernel_size,
                stride=stride, padding=pad, bias=bias),
            nn.LeakyReLU(0.1, inplace=True)
        )
    else:
        return nn.Conv2d(
            in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=pad, bias=bias)


def deconv(in_planes, out_planes, kernel_size, stride, pad, nonlinear, bias):
    if nonlinear:
        return nn.Sequential(
            nn.ConvTranspose2d(
                in_planes, out_planes, kernel_size=kernel_size,
                stride=stride, padding=pad, bias=bias),
            nn.LeakyReLU(0.1, inplace=True)
        )
    else:
        return nn.ConvTranspose2d(
            in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=pad, bias=bias)


def resize2D(inputs, size_targets, mode="bilinear"):
    size_inputs = [inputs.size(2), inputs.size(3)]

    if all([size_inputs == size_targets]):
        return inputs  # nothing to do
    elif any([size_targets < size_inputs]):
        resized = tf.adaptive_avg_pool2d(inputs, size_targets)  # downscaling
    else:
        resized = tf.interpolate(inputs, size=size_targets, mode=mode, align_corners=True)

    return resized

def resize2D_as(inputs, output_as, mode="bilinear"):
    size_targets = [output_as.size(2), output_as.size(3)]
    return resize2D(inputs, size_targets, mode=mode)


def concatenate_as(tensor_list, tensor_as, dim, mode="bilinear"):
    tensor_list = [resize2D_as(x, tensor_as, mode=mode) for x in tensor_list]
    return torch.cat(tensor_list, dim=dim)


def upsample2d_as(inputs, target_as, mode="bilinear"):
    _, _, h, w = target_as.size()
    return tf.interpolate(inputs, [h, w], mode=mode, align_corners=True)


def initialize_msra(modules):
    logging.info("Initializing MSRA")
    for layer in modules:
        if isinstance(layer, nn.Conv2d):
            nn.init.kaiming_normal_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)

        elif isinstance(layer, nn.ConvTranspose2d):
            nn.init.kaiming_normal_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)

        elif isinstance(layer, nn.LeakyReLU):
            pass

        elif isinstance(layer, nn.Sequential):
            pass

        elif "models" in str(type(layer)) and "FlowNet" in str(type(layer)):            
            pass


def get_grid(x):
    grid_H = torch.linspace(-1.0, 1.0, x.size(3)).view(1, 1, 1, x.size(3)).expand(x.size(0), 1, x.size(2), x.size(3))
    grid_V = torch.linspace(-1.0, 1.0, x.size(2)).view(1, 1, x.size(2), 1).expand(x.size(0), 1, x.size(2), x.size(3))
    grid = torch.cat([grid_H, grid_V], 1)
    grids_cuda = grid.float().requires_grad_(False).cuda()
    return grids_cuda


class WarpingLayer(nn.Module):
    def __init__(self):
        super(WarpingLayer, self).__init__()

    def forward(self, x, flow, height_im, width_im, div_flow):
        flo_list = []
        flo_w = flow[:, 0] * 2 / width_im / div_flow
        flo_h = flow[:, 1] * 2 / height_im / div_flow
        flo_list.append(flo_w)
        flo_list.append(flo_h)
        flow_for_grid = torch.stack(flo_list).transpose(0, 1)
        grid = torch.add(get_grid(x), flow_for_grid).transpose(1, 2).transpose(2, 3)
        x_warp = tf.grid_sample(x, grid, align_corners=True)

        return x_warp

================================================
FILE: models/irr_modules.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn
import torch.nn.functional as tf

def conv(in_planes, out_planes, kernel_size=3, stride=1, dilation=1, isReLU=True):
    if isReLU:
        return nn.Sequential(
            nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, dilation=dilation,
                      padding=((kernel_size - 1) * dilation) // 2, bias=True),
            nn.LeakyReLU(0.1, inplace=True)
        )
    else:
        return nn.Sequential(
            nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, dilation=dilation,
                      padding=((kernel_size - 1) * dilation) // 2, bias=True)
        )


def upsample_factor2(inputs, target_as):
    inputs = tf.interpolate(inputs, scale_factor=2, mode="nearest")
    _, _, h, w = target_as.size()
    if inputs.size(2) != h or inputs.size(3) != w:
        return tf.interpolate(inputs, [h, w], mode="bilinear", align_corners=False)
    else:
        return inputs


class OccUpsampleNetwork(nn.Module):
    def __init__(self, ch_in, ch_out):
        super(OccUpsampleNetwork, self).__init__()

        self.feat_dim = 32
        self.init_conv = conv(ch_in, self.feat_dim)

        self.res_convs = nn.Sequential(
            conv(self.feat_dim, self.feat_dim),
            conv(self.feat_dim, self.feat_dim, isReLU=False)
        )
        self.res_end_conv = conv(self.feat_dim, self.feat_dim)
        self.mul_const = 0.1

        self.out_convs = conv(self.feat_dim, ch_out)

    def forward(self, occ, x):
        occ = upsample_factor2(occ, x)
        x_in = torch.cat([occ, x], dim=1)
        x_init = self.init_conv(x_in)
        x_res = x_init
        x_res = x_res + self.res_convs(x_res) * self.mul_const
        x_res = x_res + self.res_convs(x_res) * self.mul_const
        x_res = x_res + self.res_convs(x_res) * self.mul_const
        x_init = x_init + self.res_end_conv(x_res)

        return self.out_convs(x_init) + occ


def subtract_mean(input):
    return input - input.mean(2).mean(2).unsqueeze(2).unsqueeze(2).expand_as(input)

    
class RefineFlow(nn.Module):
    def __init__(self, ch_in):
        super(RefineFlow, self).__init__()

        self.kernel_size = 3
        self.pad_size = 1
        self.pad_ftn = nn.ReplicationPad2d(self.pad_size)

        self.convs = nn.Sequential(
            conv(ch_in, 128, 3, 1, 1),
            conv(128, 128, 3, 1, 1),
            conv(128, 64, 3, 1, 1),
            conv(64, 64, 3, 1, 1),
            conv(64, 32, 3, 1, 1),
            conv(32, 32, 3, 1, 1),
            conv(32, self.kernel_size * self.kernel_size, 3, 1, 1)
        )

        self.softmax_feat = nn.Softmax(dim=1)
        self.unfold_flow = nn.Unfold(kernel_size=(self.kernel_size, self.kernel_size))
        self.unfold_kernel = nn.Unfold(kernel_size=(1, 1))

    def forward(self, flow, diff_img, feature):
        b, _, h, w = flow.size()

        flow_m = subtract_mean(flow)
        norm2_img = torch.norm(diff_img, p=2, dim=1, keepdim=True)

        feat = self.convs(torch.cat([flow_m, norm2_img, feature], dim=1))
        feat_kernel = self.softmax_feat(-feat ** 2)

        flow_x = flow[:, 0].unsqueeze(1)
        flow_y = flow[:, 1].unsqueeze(1)

        flow_x_unfold = self.unfold_flow(self.pad_ftn(flow_x))
        flow_y_unfold = self.unfold_flow(self.pad_ftn(flow_y))
        feat_kernel_unfold = self.unfold_kernel(feat_kernel)

        flow_out_x = torch.sum(flow_x_unfold * feat_kernel_unfold, dim=1).unsqueeze(1).view(b, 1, h, w)
        flow_out_y = torch.sum(flow_y_unfold * feat_kernel_unfold, dim=1).unsqueeze(1).view(b, 1, h, w)

        return torch.cat([flow_out_x, flow_out_y], dim=1)


class RefineOcc(nn.Module):
    def __init__(self, ch_in):
        super(RefineOcc, self).__init__()

        self.kernel_size = 3
        self.pad_size = 1
        self.pad_ftn = nn.ReplicationPad2d(self.pad_size)

        self.convs = nn.Sequential(
            conv(ch_in, 128, 3, 1, 1),
            conv(128, 128, 3, 1, 1),
            conv(128, 64, 3, 1, 1),
            conv(64, 64, 3, 1, 1),
            conv(64, 32, 3, 1, 1),
            conv(32, 32, 3, 1, 1),
            conv(32, self.kernel_size * self.kernel_size, 3, 1, 1)
        )

        self.softmax_feat = nn.Softmax(dim=1)
        self.unfold_occ = nn.Unfold(kernel_size=(self.kernel_size, self.kernel_size))
        self.unfold_kernel = nn.Unfold(kernel_size=(1, 1))

    def forward(self, occ, feat1, feat2):
        b, _, h, w = occ.size()

        feat = self.convs(torch.cat([occ, feat1, feat2], dim=1))
        feat_kernel = self.softmax_feat(-feat ** 2)

        occ_unfold = self.unfold_occ(self.pad_ftn(occ))
        feat_kernel_unfold = self.unfold_kernel(feat_kernel)

        occ_out = torch.sum(occ_unfold * feat_kernel_unfold, dim=1).unsqueeze(1).view(b, 1, h, w)

        return occ_out

================================================
FILE: models/pwc_modules.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn
import torch.nn.functional as tf
import logging

def conv(in_planes, out_planes, kernel_size=3, stride=1, dilation=1, isReLU=True):
    if isReLU:
        return nn.Sequential(
            nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, dilation=dilation,
                      padding=((kernel_size - 1) * dilation) // 2, bias=True),
            nn.LeakyReLU(0.1, inplace=True)
        )
    else:
        return nn.Sequential(
            nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, dilation=dilation,
                      padding=((kernel_size - 1) * dilation) // 2, bias=True)
        )


def initialize_msra(modules):
    logging.info("Initializing MSRA")
    for layer in modules:
        if isinstance(layer, nn.Conv2d):
            nn.init.kaiming_normal_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)

        elif isinstance(layer, nn.ConvTranspose2d):
            nn.init.kaiming_normal_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)

        elif isinstance(layer, nn.LeakyReLU):
            pass

        elif isinstance(layer, nn.Sequential):
            pass


def compute_cost_volume(feat1, feat2, param_dict):
    """
    only implemented for:
        kernel_size = 1
        stride1 = 1
        stride2 = 1
    """

    max_disp = param_dict["max_disp"]

    _, _, height, width = feat1.size()
    num_shifts = 2 * max_disp + 1
    feat2_padded = tf.pad(feat2, (max_disp, max_disp, max_disp, max_disp), "constant", 0)

    cost_list = []
    for i in range(num_shifts):
        for j in range(num_shifts):
            corr = torch.mean(feat1 * feat2_padded[:, :, i:(height + i), j:(width + j)], axis=1, keepdims=True)
            cost_list.append(corr)
    cost_volume = torch.cat(cost_list, axis=1)
    return cost_volume


def upsample2d_as(inputs, target_as, mode="bilinear"):
    _, _, h, w = target_as.size()
    return tf.interpolate(inputs, [h, w], mode=mode, align_corners=True)


def rescale_flow(flow, div_flow, width_im, height_im, to_local=True):
    if to_local:
        u_scale = float(flow.size(3) / width_im / div_flow)
        v_scale = float(flow.size(2) / height_im / div_flow)
    else:
        u_scale = float(width_im * div_flow / flow.size(3))
        v_scale = float(height_im * div_flow / flow.size(2))

    u, v = flow.chunk(2, dim=1)
    u *= u_scale
    v *= v_scale

    return torch.cat([u, v], dim=1)


class FeatureExtractor(nn.Module):
    def __init__(self, num_chs):
        super(FeatureExtractor, self).__init__()
        self.num_chs = num_chs
        self.convs = nn.ModuleList()

        for l, (ch_in, ch_out) in enumerate(zip(num_chs[:-1], num_chs[1:])):
            layer = nn.Sequential(
                conv(ch_in, ch_out, stride=2),
                conv(ch_out, ch_out)
            )
            self.convs.append(layer)

    def forward(self, x):
        feature_pyramid = []
        for conv in self.convs:
            x = conv(x)
            feature_pyramid.append(x)

        return feature_pyramid[::-1]


def get_grid(x):
    grid_H = torch.linspace(-1.0, 1.0, x.size(3)).view(1, 1, 1, x.size(3)).expand(x.size(0), 1, x.size(2), x.size(3))
    grid_V = torch.linspace(-1.0, 1.0, x.size(2)).view(1, 1, x.size(2), 1).expand(x.size(0), 1, x.size(2), x.size(3))
    grid = torch.cat([grid_H, grid_V], 1)
    grids_cuda = grid.float().requires_grad_(False).cuda()
    return grids_cuda


class WarpingLayer(nn.Module):
    def __init__(self):
        super(WarpingLayer, self).__init__()

    def forward(self, x, flow, height_im, width_im, div_flow):
        flo_list = []
        flo_w = flow[:, 0] * 2 / max(width_im - 1, 1) / div_flow
        flo_h = flow[:, 1] * 2 / max(height_im - 1, 1) / div_flow
        flo_list.append(flo_w)
        flo_list.append(flo_h)
        flow_for_grid = torch.stack(flo_list).transpose(0, 1)
        grid = torch.add(get_grid(x), flow_for_grid).transpose(1, 2).transpose(2, 3)        
        x_warp = tf.grid_sample(x, grid, align_corners=True)

        mask = torch.ones(x.size(), requires_grad=False).cuda()
        mask = tf.grid_sample(mask, grid, align_corners=True)
        mask = (mask >= 1.0).float()

        return x_warp * mask

class OpticalFlowEstimator(nn.Module):
    def __init__(self, ch_in):
        super(OpticalFlowEstimator, self).__init__()

        self.convs = nn.Sequential(
            conv(ch_in, 128),
            conv(128, 128),
            conv(128, 96),
            conv(96, 64),
            conv(64, 32)
        )
        self.conv_last = conv(32, 2, isReLU=False)

    def forward(self, x):
        x_intm = self.convs(x)
        return x_intm, self.conv_last(x_intm)


class FlowEstimatorDense(nn.Module):
    def __init__(self, ch_in):
        super(FlowEstimatorDense, self).__init__()
        self.conv1 = conv(ch_in, 128)
        self.conv2 = conv(ch_in + 128, 128)
        self.conv3 = conv(ch_in + 256, 96)
        self.conv4 = conv(ch_in + 352, 64)
        self.conv5 = conv(ch_in + 416, 32)
        self.conv_last = conv(ch_in + 448, 2, isReLU=False)

    def forward(self, x):
        x1 = torch.cat([self.conv1(x), x], dim=1)
        x2 = torch.cat([self.conv2(x1), x1], dim=1)
        x3 = torch.cat([self.conv3(x2), x2], dim=1)
        x4 = torch.cat([self.conv4(x3), x3], dim=1)
        x5 = torch.cat([self.conv5(x4), x4], dim=1)
        x_out = self.conv_last(x5)
        return x5, x_out


class OcclusionEstimator(nn.Module):
    def __init__(self, ch_in):
        super(OcclusionEstimator, self).__init__()
        self.convs = nn.Sequential(
            conv(ch_in, 128),
            conv(128, 128),
            conv(128, 96),
            conv(96, 64),
            conv(64, 32)
        )
        self.conv_last = conv(32, 1, isReLU=False)

    def forward(self, x):
        x_intm = self.convs(x)
        return x_intm, self.conv_last(x_intm)


class OccEstimatorDense(nn.Module):
    def __init__(self, ch_in):
        super(OccEstimatorDense, self).__init__()
        self.conv1 = conv(ch_in, 128)
        self.conv2 = conv(ch_in + 128, 128)
        self.conv3 = conv(ch_in + 256, 96)
        self.conv4 = conv(ch_in + 352, 64)
        self.conv5 = conv(ch_in + 416, 32)
        self.conv_last = conv(ch_in + 448, 1, isReLU=False)

    def forward(self, x):
        x1 = torch.cat([self.conv1(x), x], dim=1)
        x2 = torch.cat([self.conv2(x1), x1], dim=1)
        x3 = torch.cat([self.conv3(x2), x2], dim=1)
        x4 = torch.cat([self.conv4(x3), x3], dim=1)
        x5 = torch.cat([self.conv5(x4), x4], dim=1)
        x_out = self.conv_last(x5)
        return x5, x_out


class ContextNetwork(nn.Module):
    def __init__(self, ch_in):
        super(ContextNetwork, self).__init__()

        self.convs = nn.Sequential(
            conv(ch_in, 128, 3, 1, 1),
            conv(128, 128, 3, 1, 2),
            conv(128, 128, 3, 1, 4),
            conv(128, 96, 3, 1, 8),
            conv(96, 64, 3, 1, 16),
            conv(64, 32, 3, 1, 1),
            conv(32, 2, isReLU=False)
        )

    def forward(self, x):
        return self.convs(x)


class OccContextNetwork(nn.Module):
    def __init__(self, ch_in):
        super(OccContextNetwork, self).__init__()

        self.convs = nn.Sequential(
            conv(ch_in, 128, 3, 1, 1),
            conv(128, 128, 3, 1, 2),
            conv(128, 128, 3, 1, 4),
            conv(128, 96, 3, 1, 8),
            conv(96, 64, 3, 1, 16),
            conv(64, 32, 3, 1, 1),
            conv(32, 1, isReLU=False)
        )

    def forward(self, x):
        return self.convs(x)


================================================
FILE: models/pwcnet.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn

from .pwc_modules import upsample2d_as, initialize_msra, compute_cost_volume
from .pwc_modules import WarpingLayer, FeatureExtractor, ContextNetwork, FlowEstimatorDense

class PWCNet(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(PWCNet, self).__init__()
        self.args = args
        self._div_flow = div_flow
        self.search_range = 4
        self.num_chs = [3, 16, 32, 64, 96, 128, 196]
        self.output_level = 4
        self.num_levels = 7
        self.leakyRELU = nn.LeakyReLU(0.1, inplace=True)

        self.feature_pyramid_extractor = FeatureExtractor(self.num_chs)
        self.warping_layer = WarpingLayer()

        self.flow_estimators = nn.ModuleList()
        self.dim_corr = (self.search_range * 2 + 1) ** 2
        for l, ch in enumerate(self.num_chs[::-1]):
            if l > self.output_level:
                break

            if l == 0:
                num_ch_in = self.dim_corr
            else:
                num_ch_in = self.dim_corr + ch + 2

            layer = FlowEstimatorDense(num_ch_in)
            self.flow_estimators.append(layer)

        self.context_networks = ContextNetwork(self.dim_corr + 32 + 2 + 448 + 2)
        
        self.corr_params = {"pad_size": self.search_range, "kernel_size": 1, "max_disp": self.search_range, "stride1": 1, "stride2": 1, "corr_multiply": 1}
        
        initialize_msra(self.modules())

    def forward(self, input_dict):

        x1_raw = input_dict['input1']
        x2_raw = input_dict['input2']
        _, _, height_im, width_im = x1_raw.size()

        # on the bottom level are original images
        x1_pyramid = self.feature_pyramid_extractor(x1_raw) + [x1_raw]
        x2_pyramid = self.feature_pyramid_extractor(x2_raw) + [x2_raw]

        # outputs
        output_dict = {}
        flows = []

        # init
        b_size, _, h_x1, w_x1, = x1_pyramid[0].size()
        init_dtype = x1_pyramid[0].dtype
        init_device = x1_pyramid[0].device
        flow = torch.zeros(b_size, 2, h_x1, w_x1, dtype=init_dtype, device=init_device).float()

        for l, (x1, x2) in enumerate(zip(x1_pyramid, x2_pyramid)):

            # warping
            if l == 0:
                x2_warp = x2
            else:
                flow = upsample2d_as(flow, x1, mode="bilinear")
                x2_warp = self.warping_layer(x2, flow, height_im, width_im, self._div_flow)

            # correlation
            out_corr = compute_cost_volume(x1, x2_warp, self.corr_params)
            out_corr_relu = self.leakyRELU(out_corr)

            # flow estimator
            if l == 0:
                x_intm, flow = self.flow_estimators[l](out_corr_relu)
            else:
                x_intm, flow = self.flow_estimators[l](torch.cat([out_corr_relu, x1, flow], dim=1))

            # upsampling or post-processing
            if l != self.output_level:
                flows.append(flow)
            else:
                flow_res = self.context_networks(torch.cat([x_intm, flow], dim=1))
                flow = flow + flow_res
                flows.append(flow)                
                break

        output_dict['flow'] = flows

        if self.training:
            return output_dict
        else:
            output_dict_eval = {}
            out_flow = upsample2d_as(flow, x1_raw, mode="bilinear") * (1.0 / self._div_flow)
            output_dict_eval['flow'] = out_flow
            return output_dict_eval


================================================
FILE: models/pwcnet_bi.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn

from .pwc_modules import upsample2d_as, initialize_msra, compute_cost_volume
from .pwc_modules import WarpingLayer, FeatureExtractor, ContextNetwork, FlowEstimatorDense

class PWCNet(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(PWCNet, self).__init__()
        self.args = args
        self._div_flow = div_flow
        self.search_range = 4
        self.num_chs = [3, 16, 32, 64, 96, 128, 196]
        self.output_level = 4
        self.num_levels = 7
        self.leakyRELU = nn.LeakyReLU(0.1, inplace=True)

        self.feature_pyramid_extractor = FeatureExtractor(self.num_chs)
        self.warping_layer = WarpingLayer()

        self.flow_estimators = nn.ModuleList()
        self.dim_corr = (self.search_range * 2 + 1) ** 2
        for l, ch in enumerate(self.num_chs[::-1]):
            if l > self.output_level:
                break

            if l == 0:
                num_ch_in = self.dim_corr
            else:
                num_ch_in = self.dim_corr + ch + 2

            layer = FlowEstimatorDense(num_ch_in)
            self.flow_estimators.append(layer)

        self.context_networks = ContextNetwork(self.dim_corr + 32 + 2 + 448 + 2)
        self.corr_params = {"pad_size": self.search_range, "kernel_size": 1, "max_disp": self.search_range, "stride1": 1, "stride2": 1, "corr_multiply": 1}
        
        initialize_msra(self.modules())

    def forward(self, input_dict):

        x1_raw = input_dict['input1']
        x2_raw = input_dict['input2']
        _, _, height_im, width_im = x1_raw.size()

        # on the bottom level are original images
        x1_pyramid = self.feature_pyramid_extractor(x1_raw) + [x1_raw]
        x2_pyramid = self.feature_pyramid_extractor(x2_raw) + [x2_raw]

        # outputs
        output_dict = {}        
        flows = []

        # init
        b_size, _, h_x1, w_x1, = x1_pyramid[0].size()
        init_dtype = x1_pyramid[0].dtype
        init_device = x1_pyramid[0].device
        flow_f = torch.zeros(b_size, 2, h_x1, w_x1, dtype=init_dtype, device=init_device).float()
        flow_b = torch.zeros(b_size, 2, h_x1, w_x1, dtype=init_dtype, device=init_device).float()

        for l, (x1, x2) in enumerate(zip(x1_pyramid, x2_pyramid)):

            # warping
            if l == 0:
                x2_warp = x2
                x1_warp = x1
            else:
                flow_f = upsample2d_as(flow_f, x1, mode="bilinear")
                flow_b = upsample2d_as(flow_b, x2, mode="bilinear")
                x2_warp = self.warping_layer(x2, flow_f, height_im, width_im, self._div_flow)
                x1_warp = self.warping_layer(x1, flow_b, height_im, width_im, self._div_flow)

            # correlation
            out_corr_f = compute_cost_volume(x1, x2_warp, self.corr_params)
            out_corr_b = compute_cost_volume(x2, x1_warp, self.corr_params)
                
            out_corr_relu_f = self.leakyRELU(out_corr_f)
            out_corr_relu_b = self.leakyRELU(out_corr_b)

            # flow estimator
            if l == 0:
                x_intm_f, flow_f = self.flow_estimators[l](out_corr_relu_f)
                x_intm_b, flow_b = self.flow_estimators[l](out_corr_relu_b)
            else:
                x_intm_f, flow_f = self.flow_estimators[l](torch.cat([out_corr_relu_f, x1, flow_f], dim=1))
                x_intm_b, flow_b = self.flow_estimators[l](torch.cat([out_corr_relu_b, x2, flow_b], dim=1))

            # upsampling or post-processing
            if l != self.output_level:
                flows.append([flow_f, flow_b])
            else:
                flow_fine_f = self.context_networks(torch.cat([x_intm_f, flow_f], dim=1))
                flow_fine_b = self.context_networks(torch.cat([x_intm_b, flow_b], dim=1))
                flow_f = flow_f + flow_fine_f
                flow_b = flow_b + flow_fine_b
                flows.append([flow_f, flow_b])
                break

        output_dict['flow'] = flows

        if self.training:
            return output_dict
        else:
            output_dict_eval = {}            
            out_flow = upsample2d_as(flow_f, x1_raw, mode="bilinear") * (1.0 / self._div_flow)
            output_dict_eval['flow'] = out_flow
            return output_dict_eval


================================================
FILE: models/pwcnet_irr.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn

from .pwc_modules import conv, rescale_flow, upsample2d_as, initialize_msra, compute_cost_volume
from .pwc_modules import WarpingLayer, FeatureExtractor, ContextNetwork, FlowEstimatorDense

class PWCNet(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(PWCNet, self).__init__()
        self.args = args
        self._div_flow = div_flow
        self.search_range = 4
        self.num_chs = [3, 16, 32, 64, 96, 128, 196]
        self.output_level = 4
        self.num_levels = 7
        self.leakyRELU = nn.LeakyReLU(0.1, inplace=True)

        self.feature_pyramid_extractor = FeatureExtractor(self.num_chs)
        self.warping_layer = WarpingLayer()

        self.dim_corr = (self.search_range * 2 + 1) ** 2
        self.num_ch_in = self.dim_corr + 32 + 2

        self.flow_estimators = FlowEstimatorDense(self.num_ch_in)

        self.context_networks = ContextNetwork(self.num_ch_in + 448 + 2)

        self.conv_1x1 = nn.ModuleList([conv(196, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(128, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(96, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(64, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(32, 32, kernel_size=1, stride=1, dilation=1)])
        
        self.corr_params = {"pad_size": self.search_range, "kernel_size": 1, "max_disp": self.search_range, "stride1": 1, "stride2": 1, "corr_multiply": 1}
        
        initialize_msra(self.modules())

    def forward(self, input_dict):

        x1_raw = input_dict['input1']
        x2_raw = input_dict['input2']
        _, _, height_im, width_im = x1_raw.size()

        # on the bottom level are original images
        x1_pyramid = self.feature_pyramid_extractor(x1_raw) + [x1_raw]
        x2_pyramid = self.feature_pyramid_extractor(x2_raw) + [x2_raw]

        # outputs
        output_dict = {}
        flows = []

        # init
        b_size, _, h_x1, w_x1, = x1_pyramid[0].size()
        init_dtype = x1_pyramid[0].dtype
        init_device = x1_pyramid[0].device
        flow = torch.zeros(b_size, 2, h_x1, w_x1, dtype=init_dtype, device=init_device).float()

        for l, (x1, x2) in enumerate(zip(x1_pyramid, x2_pyramid)):

            # warping
            if l == 0:
                x2_warp = x2
            else:
                flow = upsample2d_as(flow, x1, mode="bilinear")
                x2_warp = self.warping_layer(x2, flow, height_im, width_im, self._div_flow)

            # correlation
            out_corr = compute_cost_volume(x1, x2_warp, self.corr_params)
            out_corr_relu = self.leakyRELU(out_corr)

            # concat and estimate flow
            flow = rescale_flow(flow, self._div_flow, width_im, height_im, to_local=True)

            x1_1by1 = self.conv_1x1[l](x1)
            x_intm, flow_res = self.flow_estimators(torch.cat([out_corr_relu, x1_1by1, flow], dim=1))
            flow = flow + flow_res

            flow_fine = self.context_networks(torch.cat([x_intm, flow], dim=1))
            flow = flow + flow_fine

            flow = rescale_flow(flow, self._div_flow, width_im, height_im, to_local=False)
            flows.append(flow)

            # upsampling or post-processing
            if l == self.output_level:
                break

        output_dict['flow'] = flows

        if self.training:
            return output_dict
        else:
            output_dict_eval = {}
            output_dict_eval['flow'] = upsample2d_as(flow, x1_raw, mode="bilinear") * (1.0 / self._div_flow)                
            return output_dict_eval


================================================
FILE: models/pwcnet_irr_bi.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn

from .pwc_modules import conv, rescale_flow, upsample2d_as, initialize_msra, compute_cost_volume
from .pwc_modules import WarpingLayer, FeatureExtractor, ContextNetwork, FlowEstimatorDense

class PWCNet(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(PWCNet, self).__init__()
        self.args = args
        self._div_flow = div_flow
        self.search_range = 4
        self.num_chs = [3, 16, 32, 64, 96, 128, 196]
        self.output_level = 4
        self.num_levels = 7
        self.leakyRELU = nn.LeakyReLU(0.1, inplace=True)

        self.feature_pyramid_extractor = FeatureExtractor(self.num_chs)
        self.warping_layer = WarpingLayer()

        self.dim_corr = (self.search_range * 2 + 1) ** 2
        self.num_ch_in = self.dim_corr + 32 + 2

        self.flow_estimators = FlowEstimatorDense(self.num_ch_in)

        self.context_networks = ContextNetwork(self.num_ch_in + 448 + 2)

        self.conv_1x1 = nn.ModuleList([conv(196, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(128, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(96, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(64, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(32, 32, kernel_size=1, stride=1, dilation=1)])
        
        self.corr_params = {"pad_size": self.search_range, "kernel_size": 1, "max_disp": self.search_range, "stride1": 1, "stride2": 1, "corr_multiply": 1}
        
        initialize_msra(self.modules())

    def forward(self, input_dict):

        x1_raw = input_dict['input1']
        x2_raw = input_dict['input2']
        _, _, height_im, width_im = x1_raw.size()

        # on the bottom level are original images
        x1_pyramid = self.feature_pyramid_extractor(x1_raw) + [x1_raw]
        x2_pyramid = self.feature_pyramid_extractor(x2_raw) + [x2_raw]

        # outputs
        output_dict = {}
        flows = []

        # init
        b_size, _, h_x1, w_x1, = x1_pyramid[0].size()
        init_dtype = x1_pyramid[0].dtype
        init_device = x1_pyramid[0].device
        flow_f = torch.zeros(b_size, 2, h_x1, w_x1, dtype=init_dtype, device=init_device).float()
        flow_b = torch.zeros(b_size, 2, h_x1, w_x1, dtype=init_dtype, device=init_device).float()

        for l, (x1, x2) in enumerate(zip(x1_pyramid, x2_pyramid)):

            # warping
            if l == 0:
                x2_warp = x2
                x1_warp = x1
            else:
                flow_f = upsample2d_as(flow_f, x1, mode="bilinear")
                flow_b = upsample2d_as(flow_b, x2, mode="bilinear")
                x2_warp = self.warping_layer(x2, flow_f, height_im, width_im, self._div_flow)
                x1_warp = self.warping_layer(x1, flow_b, height_im, width_im, self._div_flow)

            # correlation
            out_corr_f = compute_cost_volume(x1, x2_warp, self.corr_params)
            out_corr_b = compute_cost_volume(x2, x1_warp, self.corr_params)
            out_corr_relu_f = self.leakyRELU(out_corr_f)
            out_corr_relu_b = self.leakyRELU(out_corr_b)

            # concat and estimate flow
            flow_f = rescale_flow(flow_f, self._div_flow, width_im, height_im, to_local=True)
            flow_b = rescale_flow(flow_b, self._div_flow, width_im, height_im, to_local=True)

            x1_1by1 = self.conv_1x1[l](x1)
            x2_1by1 = self.conv_1x1[l](x2)
            x_intm_f, flow_res_f = self.flow_estimators(torch.cat([out_corr_relu_f, x1_1by1, flow_f], dim=1))
            x_intm_b, flow_res_b = self.flow_estimators(torch.cat([out_corr_relu_b, x2_1by1, flow_b], dim=1))
            flow_f = flow_f + flow_res_f
            flow_b = flow_b + flow_res_b

            flow_fine_f = self.context_networks(torch.cat([x_intm_f, flow_f], dim=1))
            flow_fine_b = self.context_networks(torch.cat([x_intm_b, flow_b], dim=1))
            flow_f = flow_f + flow_fine_f
            flow_b = flow_b + flow_fine_b

            flow_f = rescale_flow(flow_f, self._div_flow, width_im, height_im, to_local=False)
            flow_b = rescale_flow(flow_b, self._div_flow, width_im, height_im, to_local=False)

            flows.append([flow_f, flow_b])

            # upsampling or post-processing
            if l == self.output_level:
                break

        output_dict['flow'] = flows

        if self.training:
            return output_dict
        else:
            output_dict_eval = {}
            output_dict_eval['flow'] = upsample2d_as(flow_f, x1_raw, mode="bilinear") * (1.0 / self._div_flow)                
            return output_dict_eval


================================================
FILE: models/pwcnet_irr_occ.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn

from .pwc_modules import conv, rescale_flow, upsample2d_as, initialize_msra, compute_cost_volume
from .pwc_modules import WarpingLayer, FeatureExtractor, FlowEstimatorDense, ContextNetwork, OccEstimatorDense, OccContextNetwork

class PWCNet(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(PWCNet, self).__init__()
        self.args = args
        self._div_flow = div_flow
        self.search_range = 4
        self.num_chs = [3, 16, 32, 64, 96, 128, 196]
        self.output_level = 4
        self.num_levels = 7
        self.leakyRELU = nn.LeakyReLU(0.1, inplace=True)

        self.feature_pyramid_extractor = FeatureExtractor(self.num_chs)
        self.warping_layer = WarpingLayer()

        self.dim_corr = (self.search_range * 2 + 1) ** 2
        self.num_ch_in_flo = self.dim_corr + 32 + 2
        self.num_ch_in_occ = self.dim_corr + 32 + 1

        self.flow_estimators = FlowEstimatorDense(self.num_ch_in_flo)
        self.context_networks = ContextNetwork(self.num_ch_in_flo + 448 + 2)

        self.occ_estimators = OccEstimatorDense(self.num_ch_in_occ)
        self.occ_context_networks = OccContextNetwork(self.num_ch_in_occ + 448 + 1)

        self.conv_1x1 = nn.ModuleList([conv(196, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(128, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(96, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(64, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(32, 32, kernel_size=1, stride=1, dilation=1)])
        
        self.corr_params = {"pad_size": self.search_range, "kernel_size": 1, "max_disp": self.search_range, "stride1": 1, "stride2": 1, "corr_multiply": 1}
        
        initialize_msra(self.modules())

    def forward(self, input_dict):

        x1_raw = input_dict['input1']
        x2_raw = input_dict['input2']
        _, _, height_im, width_im = x1_raw.size()

        # on the bottom level are original images
        x1_pyramid = self.feature_pyramid_extractor(x1_raw) + [x1_raw]
        x2_pyramid = self.feature_pyramid_extractor(x2_raw) + [x2_raw]

        # outputs
        output_dict = {}
        flows = []
        occs = []

        # init
        b_size, _, h_x1, w_x1, = x1_pyramid[0].size()
        init_dtype = x1_pyramid[0].dtype
        init_device = x1_pyramid[0].device
        flow = torch.zeros(b_size, 2, h_x1, w_x1, dtype=init_dtype, device=init_device).float()
        occ = torch.zeros(b_size, 1, h_x1, w_x1, dtype=init_dtype, device=init_device).float()

        for l, (x1, x2) in enumerate(zip(x1_pyramid, x2_pyramid)):

            # warping
            if l == 0:
                x2_warp = x2
            else:
                flow = upsample2d_as(flow, x1, mode="bilinear")
                occ = upsample2d_as(occ, x1, mode="bilinear")
                x2_warp = self.warping_layer(x2, flow, height_im, width_im, self._div_flow)

            # correlation
            out_corr = compute_cost_volume(x1, x2_warp, self.corr_params)
            out_corr_relu = self.leakyRELU(out_corr)

            # concat and estimate flow
            flow = rescale_flow(flow, self._div_flow, width_im, height_im, to_local=True)

            x1_1by1 = self.conv_1x1[l](x1)
            x_intm, flow_res = self.flow_estimators(torch.cat([out_corr_relu, x1_1by1, flow], dim=1))
            flow = flow + flow_res

            flow_fine = self.context_networks(torch.cat([x_intm, flow], dim=1))
            flow = flow + flow_fine

            flow = rescale_flow(flow, self._div_flow, width_im, height_im, to_local=False)
            flows.append(flow)

            x_intm_occ, occ_res = self.occ_estimators(torch.cat([out_corr_relu, x1_1by1, occ], dim=1))
            occ = occ + occ_res

            occ_fine = self.occ_context_networks(torch.cat([x_intm_occ, occ], dim=1))
            occ = occ + occ_fine
            occs.append(occ)

            # upsampling or post-processing
            if l == self.output_level:
                break

        output_dict['flow'] = flows
        output_dict['occ'] = occs

        if self.training:
            return output_dict
        else:
            output_dict_eval = {}
            output_dict_eval['flow'] = upsample2d_as(flow, x1_raw, mode="bilinear") * (1.0 / self._div_flow)
            output_dict_eval['occ'] = upsample2d_as(occ, x1_raw, mode="bilinear")
            return output_dict_eval


================================================
FILE: models/pwcnet_irr_occ_bi.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn

from .pwc_modules import conv, rescale_flow, upsample2d_as, initialize_msra, compute_cost_volume
from .pwc_modules import WarpingLayer, FeatureExtractor, FlowEstimatorDense, ContextNetwork, OccEstimatorDense, OccContextNetwork

class PWCNet(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(PWCNet, self).__init__()
        self.args = args
        self._div_flow = div_flow
        self.search_range = 4
        self.num_chs = [3, 16, 32, 64, 96, 128, 196]
        self.output_level = 4
        self.num_levels = 7
        self.leakyRELU = nn.LeakyReLU(0.1, inplace=True)

        self.feature_pyramid_extractor = FeatureExtractor(self.num_chs)
        self.warping_layer = WarpingLayer()

        self.dim_corr = (self.search_range * 2 + 1) ** 2
        self.num_ch_in_flo = self.dim_corr + 32 + 2
        self.num_ch_in_occ = self.dim_corr + 32 + 1

        self.flow_estimators = FlowEstimatorDense(self.num_ch_in_flo)
        self.context_networks = ContextNetwork(self.num_ch_in_flo + 448 + 2)
        
        self.occ_estimators = OccEstimatorDense(self.num_ch_in_occ)
        self.occ_context_networks = OccContextNetwork(self.num_ch_in_occ + 448 + 1)
        
        self.conv_1x1 = nn.ModuleList([conv(196, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(128, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(96, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(64, 32, kernel_size=1, stride=1, dilation=1),
                                       conv(32, 32, kernel_size=1, stride=1, dilation=1)])
        
        self.corr_params = {"pad_size": self.search_range, "kernel_size": 1, "max_disp": self.search_range, "stride1": 1, "stride2": 1, "corr_multiply": 1}
        
        initialize_msra(self.modules())

    def forward(self, input_dict):

        x1_raw = input_dict['input1']
        x2_raw = input_dict['input2']
        _, _, height_im, width_im = x1_raw.size()

        # on the bottom level are original images
        x1_pyramid = self.feature_pyramid_extractor(x1_raw) + [x1_raw]
        x2_pyramid = self.feature_pyramid_extractor(x2_raw) + [x2_raw]

        # outputs
        output_dict = {}
        flows = []
        occs = []

        # init
        b_size, _, h_x1, w_x1, = x1_pyramid[0].size()
        init_dtype = x1_pyramid[0].dtype
        init_device = x1_pyramid[0].device
        flow_f = torch.zeros(b_size, 2, h_x1, w_x1, dtype=init_dtype, device=init_device).float()
        flow_b = torch.zeros(b_size, 2, h_x1, w_x1, dtype=init_dtype, device=init_device).float()
        occ_f = torch.zeros(b_size, 1, h_x1, w_x1, dtype=init_dtype, device=init_device).float()
        occ_b = torch.zeros(b_size, 1, h_x1, w_x1, dtype=init_dtype, device=init_device).float()

        for l, (x1, x2) in enumerate(zip(x1_pyramid, x2_pyramid)):

            # warping
            if l == 0:
                x2_warp = x2
                x1_warp = x1
            else:
                flow_f = upsample2d_as(flow_f, x1, mode="bilinear")
                flow_b = upsample2d_as(flow_b, x2, mode="bilinear")
                occ_f = upsample2d_as(occ_f, x1, mode="bilinear")
                occ_b = upsample2d_as(occ_b, x2, mode="bilinear")
                x2_warp = self.warping_layer(x2, flow_f, height_im, width_im, self._div_flow)
                x1_warp = self.warping_layer(x1, flow_b, height_im, width_im, self._div_flow)

            # correlation
            out_corr_f = compute_cost_volume(x1, x2_warp, self.corr_params)
            out_corr_b = compute_cost_volume(x2, x1_warp, self.corr_params)
            out_corr_relu_f = self.leakyRELU(out_corr_f)
            out_corr_relu_b = self.leakyRELU(out_corr_b)

            # concat and estimate flow
            flow_f = rescale_flow(flow_f, self._div_flow, width_im, height_im, to_local=True)
            flow_b = rescale_flow(flow_b, self._div_flow, width_im, height_im, to_local=True)

            x1_1by1 = self.conv_1x1[l](x1)
            x2_1by1 = self.conv_1x1[l](x2)
            x_intm_f, flow_res_f = self.flow_estimators(torch.cat([out_corr_relu_f, x1_1by1, flow_f], dim=1))
            x_intm_b, flow_res_b = self.flow_estimators(torch.cat([out_corr_relu_b, x2_1by1, flow_b], dim=1))
            flow_f = flow_f + flow_res_f
            flow_b = flow_b + flow_res_b

            flow_fine_f = self.context_networks(torch.cat([x_intm_f, flow_f], dim=1))
            flow_fine_b = self.context_networks(torch.cat([x_intm_b, flow_b], dim=1))
            flow_f = flow_f + flow_fine_f
            flow_b = flow_b + flow_fine_b

            flow_f = rescale_flow(flow_f, self._div_flow, width_im, height_im, to_local=False)
            flow_b = rescale_flow(flow_b, self._div_flow, width_im, height_im, to_local=False)

            flows.append([flow_f, flow_b])

            # occ estimation
            x_intm_occ_f, occ_res_f = self.occ_estimators(torch.cat([out_corr_relu_f, x1_1by1, occ_f], dim=1))
            x_intm_occ_b, occ_res_b = self.occ_estimators(torch.cat([out_corr_relu_b, x2_1by1, occ_b], dim=1))
            occ_f = occ_f + occ_res_f
            occ_b = occ_b + occ_res_b

            occ_fine_f = self.occ_context_networks(torch.cat([x_intm_occ_f, occ_f], dim=1))
            occ_fine_b = self.occ_context_networks(torch.cat([x_intm_occ_b, occ_b], dim=1))
            occ_f = occ_f + occ_fine_f
            occ_b = occ_b + occ_fine_b
            occs.append([occ_f, occ_b])

            # upsampling or post-processing
            if l == self.output_level:
                break

        output_dict['flow'] = flows
        output_dict['occ'] = occs

        if self.training:
            return output_dict
        else:
            output_dict_eval = {}
            output_dict_eval['flow'] = upsample2d_as(flow_f, x1_raw, mode="bilinear") * (1.0 / self._div_flow)
            output_dict_eval['occ'] = upsample2d_as(occ_f, x1_raw, mode="bilinear")
            return output_dict_eval


================================================
FILE: models/pwcnet_occ.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn

from .pwc_modules import upsample2d_as, initialize_msra, compute_cost_volume
from .pwc_modules import WarpingLayer, FeatureExtractor, FlowEstimatorDense, ContextNetwork, OccEstimatorDense, OccContextNetwork

class PWCNet(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(PWCNet, self).__init__()
        self.args = args
        self._div_flow = div_flow
        self.search_range = 4
        self.num_chs = [3, 16, 32, 64, 96, 128, 196]
        self.output_level = 4
        self.num_levels = 7
        self.leakyRELU = nn.LeakyReLU(0.1, inplace=True)

        self.feature_pyramid_extractor = FeatureExtractor(self.num_chs)
        self.warping_layer = WarpingLayer()

        self.flow_estimators = nn.ModuleList()
        self.occ_estimators = nn.ModuleList()
        self.dim_corr = (self.search_range * 2 + 1) ** 2
        for l, ch in enumerate(self.num_chs[::-1]):
            if l > self.output_level:
                break

            if l == 0:
                num_ch_in = self.dim_corr
                num_ch_in_occ = self.dim_corr
            else:
                num_ch_in = self.dim_corr + ch + 2
                num_ch_in_occ = self.dim_corr + ch + 1

            layer = FlowEstimatorDense(num_ch_in)
            layer_occ = OccEstimatorDense(num_ch_in_occ)
            self.flow_estimators.append(layer)
            self.occ_estimators.append(layer_occ)

        self.context_networks = ContextNetwork(self.dim_corr + 32 + 2 + 448 + 2)
        self.context_networks_occ = OccContextNetwork(self.dim_corr + 32 + 1 + 448 + 1)
        
        self.corr_params = {"pad_size": self.search_range, "kernel_size": 1, "max_disp": self.search_range, "stride1": 1, "stride2": 1, "corr_multiply": 1}
        
        initialize_msra(self.modules())

    def forward(self, input_dict):

        x1_raw = input_dict['input1']
        x2_raw = input_dict['input2']
        _, _, height_im, width_im = x1_raw.size()

        # on the bottom level are original images
        x1_pyramid = self.feature_pyramid_extractor(x1_raw) + [x1_raw]
        x2_pyramid = self.feature_pyramid_extractor(x2_raw) + [x2_raw]

        # outputs
        output_dict = {}
        flows = []
        occs = []

        # init
        b_size, _, h_x1, w_x1, = x1_pyramid[0].size()
        init_dtype = x1_pyramid[0].dtype
        init_device = x1_pyramid[0].device
        flow = torch.zeros(b_size, 2, h_x1, w_x1, dtype=init_dtype, device=init_device).float()
        occ = torch.zeros(b_size, 1, h_x1, w_x1, dtype=init_dtype, device=init_device).float()

        for l, (x1, x2) in enumerate(zip(x1_pyramid, x2_pyramid)):

            # warping
            if l == 0:
                x2_warp = x2
            else:
                flow = upsample2d_as(flow, x1, mode="bilinear")
                occ = upsample2d_as(occ, x1, mode="bilinear")
                x2_warp = self.warping_layer(x2, flow, height_im, width_im, self._div_flow)

            # correlation  
            out_corr = compute_cost_volume(x1, x2_warp, self.corr_params)
            out_corr_relu = self.leakyRELU(out_corr)

            # flow estimator
            if l == 0:
                x_intm, flow = self.flow_estimators[l](out_corr_relu)
                x_intm_occ, occ= self.occ_estimators[l](out_corr_relu)

            else:
                x_intm, flow = self.flow_estimators[l](torch.cat([out_corr_relu, x1, flow], dim=1))
                x_intm_occ, occ = self.occ_estimators[l](torch.cat([out_corr_relu, x1, occ], dim=1))

            # upsampling or post-processing
            if l != self.output_level:
                flows.append(flow)
                occs.append(occ)
            else:
                flow_fine = self.context_networks(torch.cat([x_intm, flow], dim=1))
                flow = flow + flow_fine
                flows.append(flow)

                occ_fine = self.context_networks_occ(torch.cat([x_intm_occ, occ], dim=1))
                occ = occ + occ_fine
                occs.append(occ)
                break

        output_dict['flow'] = flows
        output_dict['occ'] = occs

        if self.training:
            return output_dict
        else:
            output_dict_eval = {}
            output_dict_eval['flow'] = upsample2d_as(flow, x1_raw, mode="bilinear") * (1.0 / self._div_flow)
            output_dict_eval['occ'] = upsample2d_as(occ, x1_raw, mode="bilinear")
            return output_dict_eval


================================================
FILE: models/pwcnet_occ_bi.py
================================================
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn

from .pwc_modules import upsample2d_as, initialize_msra, compute_cost_volume
from .pwc_modules import WarpingLayer, FeatureExtractor, FlowEstimatorDense, ContextNetwork, OccEstimatorDense, OccContextNetwork

class PWCNet(nn.Module):
    def __init__(self, args, div_flow=0.05):
        super(PWCNet, self).__init__()
        self.args = args
        self._div_flow = div_flow
        self.search_range = 4
        self.num_chs = [3, 16, 32, 64, 96, 128, 196]
        self.output_level = 4
        self.num_levels = 7
        self.leakyRELU = nn.LeakyReLU(0.1, inplace=True)

        self.feature_pyramid_extractor = FeatureExtractor(self.num_chs)
        self.warping_layer = WarpingLayer()

        self.flow_estimators = nn.ModuleList()
        self.occ_estimators = nn.ModuleList()
        self.dim_corr = (self.search_range * 2 + 1) ** 2
        for l, ch in enumerate(self.num_chs[::-1]):
            if l > self.output_level:
                break

            if l == 0:
                num_ch_in = self.dim_corr
                num_ch_in_occ = self.dim_corr
            else:
                num_ch_in = self.dim_corr + ch + 2
                num_ch_in_occ = self.dim_corr + ch + 1

            layer = FlowEstimatorDense(num_ch_in)
            layer_occ = OccEstimatorDense(num_ch_in_occ)
            self.flow_estimators.append(layer)
            self.occ_estimators.append(layer_occ)

        self.context_networks = ContextNetwork(self.dim_corr + 32 + 2 + 448 + 2)
        self.context_networks_occ = OccContextNetwork(self.dim_corr + 32 + 1 + 448 + 1)
        
        self.corr_params = {"pad_size": self.search_range, "kernel_size": 1, "max_disp": self.search_range, "stride1": 1, "stride2": 1, "corr_multiply": 1}
        
        initialize_msra(self.modules())

    def forward(self, input_dict):

        x1_raw = input_dict['input1']
        x2_raw = input_dict['input2']
        _, _, height_im, width_im = x1_raw.size()

        # on the bottom level are original images
        x1_pyramid = self.feature_pyramid_extractor(x1_raw) + [x1_raw]
        x2_pyramid = self.feature_pyramid_extractor(x2_raw) + [x2_raw]

        # outputs
        output_dict = {}
        flows = []
        occs = []

        # init
        b_size, _, h_x1, w_x1, = x1_pyramid[0].size()
        init_dtype = x1_pyramid[0].dtype
        init_device = x1_pyramid[0].device
        flow_f = torch.zeros(b_size, 2, h_x1, w_x1, dtype=init_dtype, device=init_device).float()
        flow_b = torch.zeros(b_size, 2, h_x1, w_x1, dtype=init_dtype, device=init_device).float()
        occ_f = torch.zeros(b_size, 1, h_x1, w_x1, dtype=init_dtype, device=init_device).float()
        occ_b = torch.zeros(b_size, 1, h_x1, w_x1, dtype=init_dtype, device=init_device).float()

        for l, (x1, x2) in enumerate(zip(x1_pyramid, x2_pyramid)):

            # warping
            if l == 0:
                x2_warp = x2
                x1_warp = x1
            else:
                flow_f = upsample2d_as(flow_f, x1, mode="bilinear")
                flow_b = upsample2d_as(flow_b, x2, mode="bilinear")
                occ_f = upsample2d_as(occ_f, x1, mode="bilinear")
                occ_b = upsample2d_as(occ_b, x2, mode="bilinear")
                x2_warp = self.warping_layer(x2, flow_f, height_im, width_im, self._div_flow)
                x1_warp = self.warping_layer(x1, flow_b, height_im, width_im, self._div_flow)

            # correlation
            out_corr_f = compute_cost_volume(x1, x2_warp, self.corr_params)
            out_corr_b = compute_cost_volume(x2, x1_warp, self.corr_params)
            out_corr_relu_f = self.leakyRELU(out_corr_f)
            out_corr_relu_b = self.leakyRELU(out_corr_b)

            # flow estimator
            if l == 0:
                x_intm_f, flow_f = self.flow_estimators[l](out_corr_relu_f)
                x_intm_b, flow_b = self.flow_estimators[l](out_corr_relu_b)
                x_intm_occ_f, occ_f = self.occ_estimators[l](out_corr_relu_f)
                x_intm_occ_b, occ_b = self.occ_estimators[l](out_corr_relu_b)
            else:
                x_intm_f, flow_f = self.flow_estimators[l](torch.cat([out_corr_relu_f, x1, flow_f], dim=1))
                x_intm_b, flow_b = self.flow_estimators[l](torch.cat([out_corr_relu_b, x2, flow_b], dim=1))
                x_intm_occ_f, occ_f = self.occ_estimators[l](torch.cat([out_corr_relu_f, x1, occ_f], dim=1))
                x_intm_occ_b, occ_b = self.occ_estimators[l](torch.cat([out_corr_relu_b, x1, occ_b], dim=1))

            # upsampling or post-processing
            if l != self.output_level:
                flows.append([flow_f, flow_b])
                occs.append([occ_f, occ_b])
            else:
                flow_fine_f = self.context_networks(torch.cat([x_intm_f, flow_f], dim=1))
                flow_fine_b = self.context_networks(torch.cat([x_intm_b, flow_b], dim=1))
                flow_f = flow_f + flow_fine_f
                flow_b = flow_b + flow_fine_b
                flows.append([flow_f, flow_b])

                occ_fine_f = self.context_networks_occ(torch.cat([x_intm_occ_f, occ_f], dim=1))
                occ_fine_b = self.context_networks_occ(torch.cat([x_intm_occ_b, occ_b], dim=1))
                occ_f = occ_f + occ_fine_f
                occ_b = occ_b + occ_fine_b
                occs.append([occ_f, occ_b])                
                break

        output_dict['flow'] = flows
        output_dict['occ'] = occs

        if self.training:
            return output_dict
        else:
            output_dict_eval = {}
            output_dict_eval['flow'] = upsample2d_as(flow_f, x1_raw, mode="bilinear") * (1.0 / self._div_flow)
            output_dict_eval['occ'] = upsample2d_as(occ_f, x1_raw, mode="bilinear")
            return output_dict_eval


================================================
FILE: optim/__init__.py
================================================
import torch
import sys
from tools import module_classes_to_dict

# ------------------------------------------------------------------------------------
# Export PyTorch optimizer
# ------------------------------------------------------------------------------------
_this = sys.modules[__name__]
_optimizer_classes = module_classes_to_dict(torch.optim, exclude_classes="Optimizer")
for name, constructor in _optimizer_classes.items():
    setattr(_this, name, constructor)
__all__ = _optimizer_classes.keys()


================================================
FILE: runtime.py
================================================
## Portions of Code from, copyright 2018 Jochen Gast

from __future__ import absolute_import, division, print_function

import numpy as np
import colorama
import logging
import logger
import tools
from tools import MovingAverage
import collections

import scipy.misc
import torch
import torch.nn as nn
import os

# for evaluation
from utils.flow import flow_to_png, flow_to_png_middlebury
from utils.flow import write_flow, write_flow_png

# --------------------------------------------------------------------------------
# Exponential moving average smoothing factor for speed estimates
# Ranges from 0 (average speed) to 1 (current/instantaneous speed) [default: 0.3].
# --------------------------------------------------------------------------------
TQDM_SMOOTHING = 0


# -------------------------------------------------------------------------------------------
# Magic progressbar for inputs of type 'iterable'
# -------------------------------------------------------------------------------------------
def create_progressbar(iterable,
                       desc="",
                       train=False,
                       unit="it",
                       initial=0,
                       offset=0,
                       invert_iterations=False,
                       logging_on_update=False,
                       logging_on_close=True,
                       postfix=False):

    # ---------------------------------------------------------------
    # Pick colors
    # ---------------------------------------------------------------
    reset = colorama.Style.RESET_ALL
    bright = colorama.Style.BRIGHT
    cyan = colorama.Fore.CYAN
    dim = colorama.Style.DIM
    green = colorama.Fore.GREEN

    # ---------------------------------------------------------------
    # Specify progressbar layout:
    #   l_bar, bar, r_bar, n, n_fmt, total, total_fmt, percentage,
    #   rate, rate_fmt, rate_noinv, rate_noinv_fmt, rate_inv,
    #   rate_inv_fmt, elapsed, remaining, desc, postfix.
    # ---------------------------------------------------------------
    bar_format = ""
    bar_format += "%s==>%s%s {desc}:%s " % (cyan, reset, bright, reset)     # description
    bar_format += "{percentage:3.0f}%"                                      # percentage
    bar_format += "%s|{bar}|%s " % (dim, reset)                             # bar
    bar_format += " {n_fmt}/{total_fmt}  "                                  # i/n counter
    bar_format += "{elapsed}<{remaining}"                                   # eta
    if invert_iterations:
        bar_format += " {rate_inv_fmt}  "                                   # iteration timings
    else:
        bar_format += " {rate_noinv_fmt}  "
    bar_format += "%s{postfix}%s" % (green, reset)                          # postfix

    # ---------------------------------------------------------------
    # Specify TQDM arguments
    # ---------------------------------------------------------------
    tqdm_args = {
        "iterable": iterable,
        "desc": desc,                          # Prefix for the progress bar
        "total": len(iterable),                # The number of expected iterations
        "leave": True,                         # Leave progress bar when done
        "miniters": 1 if train else None,      # Minimum display update interval in iterations
        "unit": unit,                          # String be used to define the unit of each iteration
        "initial": initial,                    # The initial counter value.
        "dynamic_ncols": True,                 # Allow window resizes
        "smoothing": TQDM_SMOOTHING,           # Moving average smoothing factor for speed estimates
        "bar_format": bar_format,              # Specify a custom bar string formatting
        "position": offset,                    # Specify vertical line offset
        "ascii": True,
        "logging_on_update": logging_on_update,
        "logging_on_close": logging_on_close
    }

    return tools.tqdm_with_logging(**tqdm_args)


def tensor2float_dict(tensor_dict):
    return {key: tensor.item() for key, tensor in tensor_dict.items()}


def format_moving_averages_as_progress_dict(moving_averages_dict={},
                                            moving_averages_postfix="avg"):
    progress_dict = collections.OrderedDict([
        (key + moving_averages_postfix, "%1.4f" % moving_averages_dict[key].mean())
        for key in sorted(moving_averages_dict.keys())
    ])
    return progress_dict


def format_learning_rate(lr):
    if np.isscalar(lr):
        return "{}".format(lr)
    else:
        return "{}".format(str(lr[0]) if len(lr) == 1 else lr)


class TrainingEpoch:
    def __init__(self,
                 args,
                 model_and_loss,
                 loader,
                 optimizer,
                 augmentation=None,
                 add_progress_stats={},
                 desc="Training Epoch"):

        self._args = args
        self._desc = desc
        self._loader = loader
        self._model_and_loss = model_and_loss
        self._optimizer = optimizer
        self._augmentation = augmentation
        self._add_progress_stats = add_progress_stats

    def _step(self, example_dict):

        # -------------------------------------------------------------
        # Get input and target tensor keys
        # -------------------------------------------------------------
        input_keys = list(filter(lambda x: "input" in x, example_dict.keys()))
        target_keys = list(filter(lambda x: "target" in x, example_dict.keys()))
        tensor_keys = input_keys + target_keys

        # -------------------------------------------------------------
        # Possibly transfer to Cuda
        # -------------------------------------------------------------
        if self._args.cuda:
            for key, value in example_dict.items():
                if key in tensor_keys:
                    example_dict[key] = value.cuda(non_blocking=False)

        # -------------------------------------------------------------
        # Optionally perform augmentations
        # -------------------------------------------------------------
        if self._augmentation is not None:
            with torch.no_grad():
                example_dict = self._augmentation(example_dict)

        # -------------------------------------------------------------
        # Convert inputs/targets to variables that require gradients
        # -------------------------------------------------------------
        for key, tensor in example_dict.items():
            if key in input_keys:
                example_dict[key] = tensor.requires_grad_(True)
            elif key in target_keys:
                example_dict[key] = tensor.requires_grad_(False)

        # -------------------------------------------------------------
        # Extract batch size from first input
        # -------------------------------------------------------------
        batch_size = example_dict["input1"].size()[0]

        # -------------------------------------------------------------
        # Reset gradients
        # -------------------------------------------------------------
        self._optimizer.zero_grad()

        # -------------------------------------------------------------
        # Run forward pass to get losses and outputs.
        # -------------------------------------------------------------
        loss_dict, output_dict = self._model_and_loss(example_dict)

        # -------------------------------------------------------------
        # Check total_loss for NaNs
        # -------------------------------------------------------------
        training_loss = loss_dict[self._args.training_key]
        assert (not np.isnan(training_loss.item())), "training_loss is NaN"

        # -------------------------------------------------------------
        # Back propagation
        # -------------------------------------------------------------
        training_loss.backward()
        self._optimizer.step()

        # -------------------------------------------------------------
        # Return success flag, loss and output dictionary
        # -------------------------------------------------------------
        return loss_dict, output_dict, batch_size

    def run(self, offset=0):
        # ---------------------------------------
        # Tell model that we want to train
        # ---------------------------------------
        self._model_and_loss.train()

        # ---------------------------------------
        # Keep track of moving averages
        # ---------------------------------------
        moving_averages_dict = None

        # ---------------------------------------
        # Progress bar arguments
        # ---------------------------------------
        progressbar_args = {
            "iterable": self._loader,
            "desc": self._desc,
            "train": True,
            "offset": offset,
            "logging_on_update": False,
            "logging_on_close": True,
            "postfix": True
        }

        # ---------------------------------------
        # Perform training steps
        # ---------------------------------------
        with create_progressbar(**progressbar_args) as progress:
            for example_dict in progress:
                # perform step
                loss_dict_per_step, output_dict, batch_size = self._step(example_dict)
                # convert
                loss_dict_per_step = tensor2float_dict(loss_dict_per_step)

                # --------------------------------------------------------
                # Possibly initialize moving averages
                # --------------------------------------------------------
                if moving_averages_dict is None:
                    moving_averages_dict = {
                        key: MovingAverage() for key in loss_dict_per_step.keys()
                    }

                # --------------------------------------------------------
                # Add moving averages
                # --------------------------------------------------------
                for key, loss in loss_dict_per_step.items():
                    moving_averages_dict[key].add_average(loss, addcount=batch_size)

                # view statistics in progress bar
                progress_stats = format_moving_averages_as_progress_dict(
                    moving_averages_dict=moving_averages_dict,
                    moving_averages_postfix="_ema")

                progress.set_postfix(progress_stats)

        # -------------------------------------------------------------
        # Return loss and output dictionary
        # -------------------------------------------------------------
        ema_loss_dict = { key: ma.mean() for key, ma in moving_averages_dict.items() }
        return ema_loss_dict


class EvaluationEpoch:
    def __init__(self,
                 args,
                 model_and_loss,
                 loader,
                 augmentation=None,
                 add_progress_stats={},
                 desc="Evaluation Epoch"):
        self._args = args
        self._desc = desc
        self._loader = loader
        self._model_and_loss = model_and_loss
        self._add_progress_stats = add_progress_stats
        self._augmentation = augmentation
        self._save_output = False
        if self._args.save_result_img or self._args.save_result_flo or self._args.save_result_png:
            self._save_output = True

    def save_outputs(self, example_dict, output_dict):

        # save occ
        save_root_img = self._args.save + '/img/'
        save_root_flo = self._args.save + '/flo/'

        if self._args.save_result_bidirection:
            flow_f = output_dict["flow"].data.cpu().numpy()
            flow_b = output_dict["flow_b"].data.cpu().numpy()
            b_size = output_dict["flow"].data.size(0)
        else:
            flow_f = output_dict["flow"].data.cpu().numpy()
            b_size = output_dict["flow"].data.size(0)

        if self._args.save_result_occ:
            if self._args.save_result_bidirection:
                output_occ = np.round(
                    nn.Sigmoid()(output_dict["occ"]).expand(-1, 3, -1, -1).data.cpu().numpy().transpose(
                        [0, 2, 3, 1])) * 255
                output_occ_b = np.round(
                    nn.Sigmoid()(output_dict["occ_b"]).expand(-1, 3, -1, -1).data.cpu().numpy().transpose(
                        [0, 2, 3, 1])) * 255
            else:
                output_occ = np.round(
                    nn.Sigmoid()(output_dict["occ"]).expand(-1, 3, -1, -1).data.cpu().numpy().transpose(
                        [0, 2, 3, 1])) * 255

        # file names
        file_names_img = []
        file_names_flo = []
        for ii in range(0, b_size):
            if "basedir" in  example_dict.keys():
                file_name_img = save_root_img + example_dict["basedir"][ii] + '/' + str(example_dict["basename"][ii])
                file_name_flo = save_root_flo + example_dict["basedir"][ii] + '/' + str(example_dict["basename"][ii])
                file_names_img.append(file_name_img)
                file_names_flo.append(file_name_flo)
            else:
                file_name_img = save_root_img + '/' + str(example_dict["basename"][ii])
                file_name_flo = save_root_flo + '/' + str(example_dict["basename"][ii])
                file_names_img.append(file_name_img)
                file_names_flo.append(file_name_flo)

            directory_img = os.path.dirname(file_name_img)
            if not os.path.exists(directory_img):
                os.makedirs(directory_img)
            directory_flo = os.path.dirname(file_name_flo)
            if not os.path.exists(directory_flo):
                os.makedirs(directory_flo)

        if self._args.save_result_img:
            for ii in range(0, b_size):
                if self._args.save_result_occ:
                    file_name_occ = file_names_img[ii] + '_occ.png'
                    scipy.misc.imsave(file_name_occ, output_occ[ii])

                    if self._args.save_result_bidirection:
                        scipy.misc.imsave(file_names_img[ii] + '_occ_b.png', output_occ_b[ii])

                # flow vis
                flow_f_rgb = flow_to_png_middlebury(flow_f[ii, ...])
                file_name_flo_vis = file_names_img[ii] + '_flow.png'
                scipy.misc.imsave(file_name_flo_vis, flow_f_rgb)

                if self._args.save_result_bidirection:
                    flow_b_rgb = flow_to_png_middlebury(flow_b[ii, ...])
                    file_name_flo_vis = file_names_img[ii] + '_flow_b.png'
                    scipy.misc.imsave(file_name_flo_vis, flow_b_rgb)

        if self._args.save_result_flo or self._args.save_result_png:
            for ii in range(0, b_size):
                if self._args.save_result_flo:
                    file_name = file_names_flo[ii] + '.flo'
                    write_flow(file_name, flow_f[ii, ...].swapaxes(0, 1).swapaxes(1, 2))
                if self._args.save_result_png:
                    file_name = file_names_flo[ii] + '.png'
                    write_flow_png(file_name, flow_f[ii, ...].swapaxes(0, 1).swapaxes(1, 2))


    def _step(self, example_dict):
        # -------------------------------------------------------------
        # Get input and target tensor keys
        # -------------------------------------------------------------
        input_keys = list(filter(lambda x: "input" in x, example_dict.keys()))
        target_keys = list(filter(lambda x: "target" in x, example_dict.keys()))
        tensor_keys = input_keys + target_keys

        # -------------------------------------------------------------
        # Possibly transfer to Cuda
        # -------------------------------------------------------------
        if self._args.cuda:
            for key, value in example_dict.items():
                if key in tensor_keys:
                    example_dict[key] = value.cuda(non_blocking=False)

        # -------------------------------------------------------------
        # Optionally perform augmentations
        # -------------------------------------------------------------
        if self._augmentation is not None:
            example_dict = self._augmentation(example_dict)

        # -------------------------------------------------------------
        # Extract batch size from first input
        # -------------------------------------------------------------
        batch_size = example_dict["input1"].size()[0]

        # -------------------------------------------------------------
        # Run forward pass to get losses and outputs.
        # -------------------------------------------------------------
        loss_dict, output_dict = self._model_and_loss(example_dict)

        # -------------------------------------------------------------
        # Return loss and output dictionary
        # -------------------------------------------------------------
        return loss_dict, output_dict, batch_size

    def run(self, offset=0):

        with torch.no_grad():

            # ---------------------------------------
            # Tell model that we want to evaluate
            # ---------------------------------------
            self._model_and_loss.eval()

            # ---------------------------------------
            # Keep track of moving averages
            # ---------------------------------------
            moving_averages_dict = None

            # ---------------------------------------
            # Progress bar arguments
            # ---------------------------------------
            progressbar_args = {
                "iterable": self._loader,
                "desc": self._desc,
                "train": False,
                "offset": offset,
                "logging_on_update": False,
                "logging_on_close": True,
                "postfix": True
            }

            # ---------------------------------------
            # Perform evaluation steps
            # ---------------------------------------
            with create_progressbar(**progressbar_args) as progress:
                for example_dict in progress:

                    # ---------------------------------------
                    # Perform forward evaluation step
                    # ---------------------------------------
                    loss_dict_per_step, output_dict, batch_size = self._step(example_dict)

                    # --------------------------------------------------------
                    # Save results
                    # --------------------------------------------------------
                    if self._save_output:
                        self.save_outputs(example_dict, output_dict)

                    # ---------------------------------------
                    # Convert loss dictionary to float
                    # ---------------------------------------
                    loss_dict_per_step = tensor2float_dict(loss_dict_per_step)

                    # --------------------------------------------------------
                    # Possibly initialize moving averages
                    # --------------------------------------------------------
                    if moving_averages_dict is None:
                        moving_averages_dict = {
                            key: MovingAverage() for key in loss_dict_per_step.keys()
                        }

                    # --------------------------------------------------------
                    # Add moving averages
                    # --------------------------------------------------------
                    for key, loss in loss_dict_per_step.items():
                        moving_averages_dict[key].add_average(loss, addcount=batch_size)

                    # view statistics in progress bar
                    progress_stats = format_moving_averages_as_progress_dict(
                        moving_averages_dict=moving_averages_dict,
                        moving_averages_postfix="_avg")

                    progress.set_postfix(progress_stats)

            # -------------------------------------------------------------
            # Record average losses
            # -------------------------------------------------------------
            avg_loss_dict = { key: ma.mean() for key, ma in moving_averages_dict.items() }

            # -------------------------------------------------------------
            # Return average losses and output dictionary
            # -------------------------------------------------------------
            return avg_loss_dict


def exec_runtime(args,
                 checkpoint_saver,
                 model_and_loss,
                 optimizer,
                 lr_scheduler,
                 train_loader,
                 validation_loader,
                 inference_loader,
                 training_augmentation,
                 validation_augmentation):

    # ----------------------------------------------------------------------------------------------
    # Validation schedulers are a bit special:
    # They want to be called with a validation loss..
    # ----------------------------------------------------------------------------------------------
    validation_scheduler = (lr_scheduler is not None and args.lr_scheduler == "ReduceLROnPlateau")

    # --------------------------------------------------------
    # Log some runtime info
    # --------------------------------------------------------
    with logger.LoggingBlock("Runtime", emph=True):
        logging.info("start_epoch: %i" % args.start_epoch)
        logging.info("total_epochs: %i" % args.total_epochs)

    # ---------------------------------------
    # Total progress bar arguments
    # ---------------------------------------
    progressbar_args = {
        "desc": "Progress",
        "initial": args.start_epoch - 1,
        "invert_iterations": True,
        "iterable": range(1, args.total_epochs + 1),
        "logging_on_close": True,
        "logging_on_update": True,
        "postfix": False,
        "unit": "ep"
    }

    # --------------------------------------------------------
    # Total progress bar
    # --------------------------------------------------------
    print(''), logging.logbook('')
    total_progress = create_progressbar(**progressbar_args)
    print("\n")

    # --------------------------------------------------------
    # Remember validation loss
    # --------------------------------------------------------
    best_validation_loss = float("inf") if args.validation_key_minimize else -float("inf")
    store_as_best = False

    for epoch in range(args.start_epoch, args.total_epochs + 1):
        with logger.LoggingBlock("Epoch %i/%i" % (epoch, args.total_epochs), emph=True):

            # Always report learning rate
            if lr_scheduler is not None:
                logging.info("lr: %s" % format_learning_rate(lr_scheduler.get_last_lr()))

            # -------------------------------------------
            # Create and run a training epoch
            # -------------------------------------------
            if train_loader is not None:
                avg_loss_dict = TrainingEpoch(
                    args,
                    desc="   Train",
                    model_and_loss=model_and_loss,
                    optimizer=optimizer,
                    loader=train_loader,
                    augmentation=training_augmentation).run()

            # -------------------------------------------
            # Create and run a validation epoch
            # -------------------------------------------
            if validation_loader is not None:

                # ---------------------------------------------------
                # Construct holistic recorder for epoch
                # ---------------------------------------------------
                avg_loss_dict = EvaluationEpoch(
                    args,
                    desc="Validate",
                    model_and_loss=model_and_loss,
                    loader=validation_loader,
                    augmentation=validation_augmentation).run()

                # ----------------------------------------------------------------
                # Evaluate whether this is the best validation_loss
                # ----------------------------------------------------------------
                validation_loss = avg_loss_dict[args.validation_key]
                if args.validation_key_minimize:
                    store_as_best = validation_loss < best_validation_loss
                else:
                    store_as_best = validation_loss > best_validation_loss
                if store_as_best:
                    best_validation_loss = validation_loss

            # Update standard learning scheduler
            if lr_scheduler is not None:
                lr_scheduler.step()

            # ----------------------------------------------------------------
            # Also show best loss on total_progress
            # ----------------------------------------------------------------
            total_progress_stats = {
                "best_" + args.validation_key + "_avg": "%1.4f" % best_validation_loss
            }
            total_progress.set_postfix(total_progress_stats)

            # ----------------------------------------------------------------
            # Bump total progress
            # ----------------------------------------------------------------
            total_progress.update()
            print('')

            # ----------------------------------------------------------------
            # Store checkpoint
            # ----------------------------------------------------------------
            if checkpoint_saver is not None:
                checkpoint_saver.save_latest(
                    directory=args.save,
                    model_and_loss=model_and_loss,
                    stats_dict=dict(avg_loss_dict, epoch=epoch),
                    store_as_best=store_as_best)

            # ----------------------------------------------------------------
            # Vertical space between epochs
            # ----------------------------------------------------------------
            print(''), logging.logbook('')
            
    # ----------------------------------------------------------------
    # Finish
    # ----------------------------------------------------------------
    total_progress.close()
    logging.info("Finished.")


================================================
FILE: saved_check_point/pwcnet/IRR-PWC_flyingchairsOcc/checkpoint_best.ckpt
================================================
[File too large to display: 24.3 MB]

================================================
FILE: saved_check_point/pwcnet/IRR-PWC_flyingchairsOcc/checkpoint_latest.ckpt
================================================
[File too large to display: 24.3 MB]

================================================
FILE: saved_check_point/pwcnet/IRR-PWC_kitti/checkpoint_best.ckpt
================================================
[File too large to display: 24.3 MB]

================================================
FILE: saved_check_point/pwcnet/IRR-PWC_kitti/checkpoint_latest.ckpt
================================================
[File too large to display: 24.3 MB]

================================================
FILE: saved_check_point/pwcnet/IRR-PWC_sintel/checkpoint_best.ckpt
================================================
[File too large to display: 24.3 MB]

================================================
FILE: saved_check_point/pwcnet/IRR-PWC_sintel/checkpoint_latest.ckpt
================================================
[File too large to display: 24.3 MB]

================================================
FILE: saved_check_point/pwcnet/IRR-PWC_things3d/checkpoint_best.ckpt
================================================
[File too large to display: 24.3 MB]

================================================
FILE: saved_check_point/pwcnet/IRR-PWC_things3d/checkpoint_latest.ckpt
================================================
[File too large to display: 24.3 MB]

================================================
FILE: saved_check_point/pwcnet/PWCNet/checkpoint_best.ckpt
================================================
[File too large to display: 33.0 MB]

================================================
FILE: saved_check_point/pwcnet/PWCNet-irr/checkpoint_best.ckpt
================================================
[File too large to display: 12.8 MB]

================================================
FILE: scripts/IRR-FlowNet_flyingChairsOcc.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="experiments"

# datasets
FLYINGCHAIRS_OCC_HOME=(YOUR PATH)/flow_occ_v5/data

# model and checkpoint
MODEL=IRR_FlowNet
EVAL_LOSS=MultiScaleEPE_FlowNet_IRR_Bi_Occ_upsample
CHECKPOINT=None
SIZE_OF_BATCH=4

# save path
TIME=$(date +"%Y%m%d-%H%M%S")
SAVE_PATH="$EXPERIMENTS_HOME/$MODEL-$TIME"

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[54, 72, 90]" \
--model=$MODEL \
--num_workers=4 \
--num_iters=2 \
--optimizer=Adam \
--optimizer_lr=1e-4 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--total_epochs=108 \
--training_augmentation=RandomAffineFlowOcc \
--training_dataset=FlyingChairsOccTrain \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=FlyingChairsOccValid  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/IRR-PWC_flyingChairsOcc.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="experiments"

# datasets
FLYINGCHAIRS_OCC_HOME=(YOUR PATH)/flow_occ_v5/data

# model and checkpoint
MODEL=IRR_PWC
EVAL_LOSS=MultiScaleEPE_PWC_Bi_Occ_upsample
CHECKPOINT=None
SIZE_OF_BATCH=4

# save path
TIME=$(date +"%Y%m%d-%H%M%S")
SAVE_PATH="$EXPERIMENTS_HOME/$MODEL-$TIME"

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[54, 72, 90]" \
--model=$MODEL \
--num_workers=4 \
--optimizer=Adam \
--optimizer_lr=1e-4 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--total_epochs=108 \
--training_augmentation=RandomAffineFlowOcc \
--training_dataset=FlyingChairsOccTrain \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=FlyingChairsOccValid  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/IRR-PWC_kitti_train.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="experiments"

# datasets
KITTI_HOME=(YOUR PATH)/KITTI_flow/

# model and checkpoint
MODEL=IRR_PWC
EVAL_LOSS=MultiScaleEPE_PWC_Bi_Occ_upsample_KITTI
CHECKPOINT="saved_check_point/IRR-PWC_things3d/checkpoint_latest.ckpt"
SIZE_OF_BATCH=4

# save path
TIME=$(date +"%Y%m%d-%H%M%S")
SAVE_PATH="$EXPERIMENTS_HOME/$MODEL-$TIME"

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=1 \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[730, 984, 1238, 1365, 1397, 1429, 1556, 1683, 1810, 1937]" \
--model=$MODEL \
--num_workers=4 \
--optimizer=Adam \
--optimizer_lr=3e-05 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--start_epoch=160 \
--total_epochs=2064 \
--training_augmentation=RandomAffineFlowOccKITTI \
--training_augmentation_crop="[320,896]" \
--training_dataset=KittiCombTrain \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$KITTI_HOME \
--training_dataset_preprocessing_crop=True \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=KittiCombVal  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$KITTI_HOME \
--validation_dataset_preprocessing_crop=False \
--validation_key=epe \
--validation_loss=$EVAL_LOSS


================================================
FILE: scripts/IRR-PWC_kitti_train_full.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="experiments"

# datasets
KITTI_HOME=(YOUR PATH)/KITTI_flow/

# model and checkpoint
MODEL=IRR_PWC
EVAL_LOSS=MultiScaleEPE_PWC_Bi_Occ_upsample_KITTI
CHECKPOINT="saved_check_point/IRR-PWC_things3d/checkpoint_latest.ckpt"
SIZE_OF_BATCH=4

# save path
TIME=$(date +"%Y%m%d-%H%M%S")
SAVE_PATH="$EXPERIMENTS_HOME/$MODEL-$TIME"

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=1 \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[616, 819, 1022, 1123, 1149, 1174, 1276, 1377, 1479, 1580]" \
--model=$MODEL \
--num_workers=4 \
--optimizer=Adam \
--optimizer_lr=3e-05 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--start_epoch=160 \
--total_epochs=710 \
--training_augmentation=RandomAffineFlowOccKITTI \
--training_augmentation_crop="[320,896]" \
--training_dataset=KittiCombFull \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$KITTI_HOME \
--training_dataset_preprocessing_crop=True \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=KittiCombVal  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$KITTI_HOME \
--validation_dataset_preprocessing_crop=False \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/IRR-PWC_sintel_train.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="experiments"

# datasets
SINTEL_HOME=(YOUR PATH)/MPI-Sintel-complete/

# model and checkpoint
MODEL=IRR_PWC
EVAL_LOSS=MultiScaleEPE_PWC_Bi_Occ_upsample_Sintel
CHECKPOINT="saved_check_point/IRR-PWC_things3d/checkpoint_latest.ckpt"
SIZE_OF_BATCH=4

# save path
TIME=$(date +"%Y%m%d-%H%M%S")
SAVE_PATH="$EXPERIMENTS_HOME/$MODEL-$TIME"

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[258, 302, 346, 368, 374, 379, 401, 423, 445, 467]" \
--model=$MODEL \
--num_workers=4 \
--optimizer=Adam \
--optimizer_lr=1.5e-05 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--start_epoch=160 \
--total_epochs=489 \
--training_augmentation=RandomAffineFlowOccSintel \
--training_augmentation_crop="[384,768]" \
--training_dataset=SintelTrainingCombTrain \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$SINTEL_HOME \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=SintelTrainingCombValid  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[687, 775, 863, 908, 919, 930, 974, 1018, 1062, 1106]" \
--model=$MODEL \
--num_workers=4 \
--optimizer=Adam \
--optimizer_lr=1e-05 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--start_epoch=490 \
--total_epochs=1150 \
--training_augmentation=RandomAffineFlowOccSintel \
--training_augmentation_crop="[384,768]" \
--training_dataset=SintelTrainingFinalTrain \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$SINTEL_HOME \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=SintelTrainingFinalValid  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/IRR-PWC_sintel_train_full.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="experiments"

# datasets
SINTEL_HOME=(YOUR PATH)/MPI-Sintel-complete/

# model and checkpoint
MODEL=IRR_PWC
EVAL_LOSS=MultiScaleEPE_PWC_Bi_Occ_upsample_Sintel
CHECKPOINT="saved_check_point/IRR-PWC_things3d/checkpoint_latest.ckpt"
SIZE_OF_BATCH=4

# save path
TIME=$(date +"%Y%m%d-%H%M%S")
SAVE_PATH="$EXPERIMENTS_HOME/$MODEL-$TIME"

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[245, 284, 322, 342, 346, 351, 370, 390, 409, 428]" \
--model=$MODEL \
--num_workers=4 \
--optimizer=Adam \
--optimizer_lr=1.5e-05 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--start_epoch=160 \
--total_epochs=447 \
--training_augmentation=RandomAffineFlowOccSintel \
--training_augmentation_crop="[384,768]" \
--training_dataset=SintelTrainingCombFull \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$SINTEL_HOME \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=SintelTrainingCombValid  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[620, 697, 774, 812, 822, 831, 870, 908, 947, 985]" \
--model=$MODEL \
--num_workers=4 \
--optimizer=Adam \
--optimizer_lr=1e-05 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--start_epoch=448 \
--total_epochs=591 \
--training_augmentation=RandomAffineFlowOccSintel \
--training_augmentation_crop="[384,768]" \
--training_dataset=SintelTrainingFinalFull \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$SINTEL_HOME \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=SintelTrainingFinalValid  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/IRR-PWC_things3d.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="experiments"

# datasets
FLYINGTHINGS_HOME=(YOUR PATH)/things3d/FlyingThings3D_subset/
SINTEL_HOME=(YOUR PATH)/MPI-Sintel-complete/

# model and checkpoint
MODEL=IRR_PWC
EVAL_LOSS=MultiScaleEPE_PWC_Bi_Occ_upsample
CHECKPOINT="saved_check_point/IRR-PWC_flyingchairsOcc/checkpoint_latest.ckpt"
SIZE_OF_BATCH=4

# save path
TIME=$(date +"%Y%m%d-%H%M%S")
SAVE_PATH="$EXPERIMENTS_HOME/$MODEL-$TIME"

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[128, 139, 149]" \
--model=$MODEL \
--num_workers=4 \
--optimizer=Adam \
--optimizer_lr=1e-5 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--start_epoch=109 \
--total_epochs=159 \
--training_augmentation=RandomAffineFlowOcc \
--training_augmentation_crop="[384,768]" \
--training_dataset=FlyingThings3dCleanTrain \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$FLYINGTHINGS_HOME \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=SintelTrainingCleanFull  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/flownet1s.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="experiments"

# datasets
FLYINGCHAIRS_OCC_HOME=(YOUR PATH)/flow_occ_v5/data

# model and checkpoint
MODEL=FlowNet1S
EVAL_LOSS=MultiScaleEPE_FlowNet
CHECKPOINT=None
SIZE_OF_BATCH=8

# save path
TIME=$(date +"%Y%m%d-%H%M%S")
SAVE_PATH="$EXPERIMENTS_HOME/$MODEL-$TIME"

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[108, 144, 180]" \
--model=$MODEL \
--num_workers=4 \
--num_iters=1 \
--optimizer=Adam \
--optimizer_lr=1e-4 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--total_epochs=216 \
--training_augmentation=RandomAffineFlowOcc \
--training_dataset=FlyingChairsOccTrain \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=FlyingChairsOccValid  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/flownet1s_irr1.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="experiments"

# datasets
FLYINGCHAIRS_OCC_HOME=(YOUR PATH)/flow_occ_v5/data

# model and checkpoint
MODEL=FlowNet1S_irr
EVAL_LOSS=MultiScaleEPE_FlowNet_IRR
CHECKPOINT=None
SIZE_OF_BATCH=8

# save path
TIME=$(date +"%Y%m%d-%H%M%S")
SAVE_PATH="$EXPERIMENTS_HOME/$MODEL-$TIME"

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[108, 144, 180]" \
--model=$MODEL \
--num_workers=4 \
--num_iters=1 \
--optimizer=Adam \
--optimizer_lr=1e-4 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--total_epochs=216 \
--training_augmentation=RandomAffineFlowOcc \
--training_dataset=FlyingChairsOccTrain \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=FlyingChairsOccValid  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/flownet1s_irr2.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="experiments"

# datasets
FLYINGCHAIRS_OCC_HOME=(YOUR PATH)/flow_occ_v5/data

# model and checkpoint
MODEL=FlowNet1S_irr
EVAL_LOSS=MultiScaleEPE_FlowNet_IRR
CHECKPOINT=None
SIZE_OF_BATCH=4

# save path
TIME=$(date +"%Y%m%d-%H%M%S")
SAVE_PATH="$EXPERIMENTS_HOME/$MODEL-$TIME"

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[54, 72, 90]" \
--model=$MODEL \
--num_workers=4 \
--num_iters=2 \
--optimizer=Adam \
--optimizer_lr=1e-4 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--total_epochs=108 \
--training_augmentation=RandomAffineFlowOcc \
--training_dataset=FlyingChairsOccTrain \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=FlyingChairsOccValid  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/pwcnet.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="experiments"

# datasets
FLYINGCHAIRS_OCC_HOME=(YOUR PATH)/flow_occ_v5/data

# model and checkpoint
MODEL=PWCNet
EVAL_LOSS=MultiScaleEPE_PWC
CHECKPOINT=None
SIZE_OF_BATCH=8

# save path
TIME=$(date +"%Y%m%d-%H%M%S")
SAVE_PATH="$EXPERIMENTS_HOME/$MODEL-$TIME"

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[108, 144, 180]" \
--model=$MODEL \
--num_workers=4 \
--optimizer=Adam \
--optimizer_lr=1e-4 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--total_epochs=216 \
--training_augmentation=RandomAffineFlowOcc \
--training_dataset=FlyingChairsOccTrain \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=FlyingChairsOccValid  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/pwcnet_irr.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="experiments"

# datasets
FLYINGCHAIRS_OCC_HOME=(YOUR PATH)/flow_occ_v5/data

# model and checkpoint
MODEL=PWCNet_irr
EVAL_LOSS=MultiScaleEPE_PWC
CHECKPOINT=None
SIZE_OF_BATCH=8

# save path
TIME=$(date +"%Y%m%d-%H%M%S")
SAVE_PATH="$EXPERIMENTS_HOME/$MODEL-$TIME"

# training configuration
python ../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--lr_scheduler=MultiStepLR \
--lr_scheduler_gamma=0.5 \
--lr_scheduler_milestones="[108, 144, 180]" \
--model=$MODEL \
--num_workers=4 \
--optimizer=Adam \
--optimizer_lr=1e-4 \
--optimizer_weight_decay=4e-4 \
--save=$SAVE_PATH \
--total_epochs=216 \
--training_augmentation=RandomAffineFlowOcc \
--training_dataset=FlyingChairsOccTrain \
--training_dataset_photometric_augmentations=True \
--training_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--training_key=total_loss \
--training_loss=$EVAL_LOSS \
--validation_dataset=FlyingChairsOccValid  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$FLYINGCHAIRS_OCC_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/validation/IRR-FlowNet_flyingChairs.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="saved_check_point/flownet"

# datasets
SINTEL_HOME=(YOUR PATH)/MPI-Sintel-complete/

# model and checkpoint
MODEL=IRR_FlowNet
CHECKPOINT="$EXPERIMENTS_HOME/IRR-FlowNet_flyingChairs/checkpoint_best.ckpt"
EVAL_LOSS=MultiScaleEPE_FlowNet_IRR_Bi_Occ_upsample

SIZE_OF_BATCH=4

# validate clean configuration
SAVE_PATH="$EXPERIMENTS_HOME/eval_temp/$MODEL"
python ../../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--evaluation=True \
--model=$MODEL \
--num_workers=4 \
--num_iters=2 \
--save=$SAVE_PATH \
--validation_dataset=SintelTrainingCleanFull  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/validation/IRR-PWC_flyingChairs.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="saved_check_point/pwcnet"

# datasets
SINTEL_HOME=(YOUR PATH)/MPI-Sintel-complete/

# model and checkpoint
MODEL=IRR_PWC
CHECKPOINT="$EXPERIMENTS_HOME/IRR-PWC_flyingchairsOcc/checkpoint_best.ckpt"
EVAL_LOSS=MultiScaleEPE_PWC_Bi_Occ_upsample

SIZE_OF_BATCH=4

# validate clean configuration
SAVE_PATH="$EXPERIMENTS_HOME/eval_temp/$MODEL"
python ../../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--evaluation=True \
--model=$MODEL \
--num_workers=4 \
--save=$SAVE_PATH \
--validation_dataset=SintelTrainingCleanFull  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/validation/IRR-PWC_kitti.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="saved_check_point/pwcnet"

# datasets
KITTI_HOME=(YOUR PATH)/KITTI_flow/

# model and checkpoint
MODEL=IRR_PWC
CHECKPOINT="$EXPERIMENTS_HOME/IRR-PWC_kitti/checkpoint_latest.ckpt"
EVAL_LOSS=MultiScaleEPE_PWC_Bi_Occ_upsample_KITTI

SIZE_OF_BATCH=1

# validate clean configuration
SAVE_PATH="$EXPERIMENTS_HOME/eval_temp/$MODEL"
python ../../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--evaluation=True \
--model=$MODEL \
--num_workers=4 \
--save=$SAVE_PATH \
--validation_dataset=KittiCombVal  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$KITTI_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/validation/IRR-PWC_sintel.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="saved_check_point/pwcnet"

# datasets
SINTEL_HOME=(YOUR PATH)/MPI-Sintel-complete/

# model and checkpoint
MODEL=IRR_PWC
CHECKPOINT="$EXPERIMENTS_HOME/IRR-PWC_sintel/checkpoint_latest.ckpt"
EVAL_LOSS=MultiScaleEPE_PWC_Bi_Occ_upsample_Sintel

SIZE_OF_BATCH=4

# validate clean configuration
SAVE_PATH="$EXPERIMENTS_HOME/eval_temp/$MODEL"
python ../../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--evaluation=True \
--model=$MODEL \
--num_workers=4 \
--save=$SAVE_PATH \
--validation_dataset=SintelTrainingFinalValid  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/validation/IRR-PWC_things3d.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="saved_check_point/pwcnet"

# datasets
SINTEL_HOME=(YOUR PATH)/MPI-Sintel-complete/

# model and checkpoint
MODEL=IRR_PWC
CHECKPOINT="$EXPERIMENTS_HOME/IRR-PWC_things3d/checkpoint_latest.ckpt"
EVAL_LOSS=MultiScaleEPE_PWC_Bi_Occ_upsample

SIZE_OF_BATCH=4

# validate clean configuration
SAVE_PATH="$EXPERIMENTS_HOME/eval_temp/$MODEL"
python ../../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--evaluation=True \
--model=$MODEL \
--num_workers=4 \
--save=$SAVE_PATH \
--validation_dataset=SintelTrainingCleanFull  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/validation/flownet1s.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="saved_check_point/flownet"

# datasets
SINTEL_HOME=(YOUR PATH)/MPI-Sintel-complete/

# model and checkpoint
MODEL=FlowNet1S
CHECKPOINT="$EXPERIMENTS_HOME/FlowNet1S/checkpoint_best.ckpt"
EVAL_LOSS=MultiScaleEPE_FlowNet

SIZE_OF_BATCH=4

# validate clean configuration
SAVE_PATH="$EXPERIMENTS_HOME/eval_temp/$MODEL"
python ../../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--evaluation=True \
--model=$MODEL \
--num_workers=4 \
--num_iters=1 \
--save=$SAVE_PATH \
--validation_dataset=SintelTrainingCleanFull  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/validation/flownet1s_irr1.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="saved_check_point/flownet"

# datasets
SINTEL_HOME=(YOUR PATH)/MPI-Sintel-complete/

# model and checkpoint
MODEL=FlowNet1S_irr
CHECKPOINT="$EXPERIMENTS_HOME/FlowNet1S-irr1/checkpoint_best.ckpt"
EVAL_LOSS=MultiScaleEPE_FlowNet_IRR

SIZE_OF_BATCH=4

# validate clean configuration
SAVE_PATH="$EXPERIMENTS_HOME/eval_temp/$MODEL"
python ../../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--evaluation=True \
--model=$MODEL \
--num_workers=4 \
--num_iters=1 \
--save=$SAVE_PATH \
--validation_dataset=SintelTrainingCleanFull  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/validation/flownet1s_irr2.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="saved_check_point/flownet"

# datasets
SINTEL_HOME=(YOUR PATH)/MPI-Sintel-complete/

# model and checkpoint
MODEL=FlowNet1S_irr
CHECKPOINT="$EXPERIMENTS_HOME/FlowNet1S-irr2/checkpoint_best.ckpt"
EVAL_LOSS=MultiScaleEPE_FlowNet_IRR

SIZE_OF_BATCH=4

# validate clean configuration
SAVE_PATH="$EXPERIMENTS_HOME/eval_temp/$MODEL"
python ../../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--evaluation=True \
--model=$MODEL \
--num_workers=4 \
--num_iters=2 \
--save=$SAVE_PATH \
--validation_dataset=SintelTrainingCleanFull  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/validation/pwcnet.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="saved_check_point/pwcnet"

# datasets
SINTEL_HOME=(YOUR PATH)/MPI-Sintel-complete/

# model and checkpoint
MODEL=PWCNet
CHECKPOINT="$EXPERIMENTS_HOME/PWCNet/checkpoint_best.ckpt"
EVAL_LOSS=MultiScaleEPE_PWC

SIZE_OF_BATCH=1

# validate clean configuration
SAVE_PATH="$EXPERIMENTS_HOME/eval_temp/$MODEL"
python ../../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--evaluation=True \
--model=$MODEL \
--num_workers=4 \
--save=$SAVE_PATH \
--validation_dataset=SintelTrainingCleanFull  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: scripts/validation/pwcnet_irr.sh
================================================
#!/bin/bash

# experiments and datasets meta
EXPERIMENTS_HOME="saved_check_point/pwcnet"

# datasets
SINTEL_HOME=(YOUR PATH)/MPI-Sintel-complete/

# model and checkpoint
MODEL=PWCNet_irr
CHECKPOINT="$EXPERIMENTS_HOME/PWCNet-irr/checkpoint_best.ckpt"
EVAL_LOSS=MultiScaleEPE_PWC

SIZE_OF_BATCH=4

# validate clean configuration
SAVE_PATH="$EXPERIMENTS_HOME/eval_temp/$MODEL"
python ../../main.py \
--batch_size=$SIZE_OF_BATCH \
--batch_size_val=$SIZE_OF_BATCH \
--checkpoint=$CHECKPOINT \
--evaluation=True \
--model=$MODEL \
--num_workers=4 \
--save=$SAVE_PATH \
--validation_dataset=SintelTrainingCleanFull  \
--validation_dataset_photometric_augmentations=False \
--validation_dataset_root=$SINTEL_HOME \
--validation_key=epe \
--validation_loss=$EVAL_LOSS

================================================
FILE: tools.py
================================================
## Portions of Code from, copyright 2018 Jochen Gast

from __future__ import absolute_import, division, print_function

import os
import socket
import re
from pytz import timezone
from datetime import datetime
import fnmatch
import itertools
import argparse
import sys
import six
import unicodedata
import json
import inspect
import tqdm
import logging
import torch
import ast
import numpy as np


def x2module(module_or_data_parallel):
    if isinstance(module_or_data_parallel, torch.nn.DataParallel):
        return module_or_data_parallel.module
    else:
        return module_or_data_parallel


# ----------------------------------------------------------------------------------------
# Comprehensively adds a new logging level to the `logging` module and the
# currently configured logging class.
# e.g. addLoggingLevel('TRACE', logging.DEBUG - 5)
# ----------------------------------------------------------------------------------------
def addLoggingLevel(level_name, level_num, method_name=None):
    if not method_name:
        method_name = level_name.lower()
    if hasattr(logging, level_name):
        raise AttributeError('{} already defined in logging module'.format(level_name))
    if hasattr(logging, method_name):
        raise AttributeError('{} already defined in logging module'.format(method_name))
    if hasattr(logging.getLoggerClass(), method_name):
        raise AttributeError('{} already defined in logger class'.format(method_name))

    # This method was inspired by the answers to Stack Overflow post
    # http://stackoverflow.com/q/2183233/2988730, especially
    # http://stackoverflow.com/a/13638084/2988730
    def logForLevel(self, message, *args, **kwargs):
        if self.isEnabledFor(level_num):
            self._log(level_num, message, args, **kwargs)

    def logToRoot(message, *args, **kwargs):
        logging.log(level_num, message, *args, **kwargs)

    logging.addLevelName(level_num, level_name)
    setattr(logging, level_name, level_num)
    setattr(logging.getLoggerClass(), method_name, logForLevel)
    setattr(logging, method_name, logToRoot)


# -------------------------------------------------------------------------------------------------
# Looks for sub arguments in the argument structure.
# Retrieve sub arguments for modules such as optimizer_*
# -------------------------------------------------------------------------------------------------
def kwargs_from_args(args, name, exclude=[]):
    if isinstance(exclude, str):
        exclude = [exclude]
    exclude += ["class"]
    args_dict = vars(args)
    name += "_"
    subargs_dict = {
        key[len(name):]: value for key, value in args_dict.items()
        if name in key and all([key != name + x for x in exclude])
    }
    return subargs_dict


# -------------------------------------------------------------------------------------------------
# Create class instance from kwargs dictionary.
# Filters out keys that not in the constructor
# -------------------------------------------------------------------------------------------------
def instance_from_kwargs(class_constructor, kwargs):
    argspec = inspect.getargspec(class_constructor.__init__)
    full_args = argspec.args
    filtered_args = dict([(k,v) for k,v in kwargs.items() if k in full_args])
    instance = class_constructor(**filtered_args)
    return instance


def module_classes_to_dict(module, include_classes="*", exclude_classes=()):
    # -------------------------------------------------------------------------
    # If arguments are strings, convert them to a list
    # -------------------------------------------------------------------------
    if include_classes is not None:
        if isinstance(include_classes, str):
            include_classes = [include_classes]

    if exclude_classes is not None:
        if isinstance(exclude_classes, str):
            exclude_classes = [exclude_classes]

    # -------------------------------------------------------------------------
    # Obtain dictionary from given module
    # -------------------------------------------------------------------------
    item_dict = dict([(name, getattr(module, name)) for name in dir(module)])

    # -------------------------------------------------------------------------
    # Filter classes
    # -------------------------------------------------------------------------
    item_dict = dict([
        (name,value) for name, value in item_dict.items() if inspect.isclass(getattr(module, name))
    ])

    filtered_keys = filter_list_of_strings(
        item_dict.keys(), include=include_classes, exclude=exclude_classes)

    # -------------------------------------------------------------------------
    # Construct dictionary from matched results
    # -------------------------------------------------------------------------
    result_dict = dict([(name, value) for name, value in item_dict.items() if name in filtered_keys])

    return result_dict


def ensure_dir(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)


def search_and_replace(string, regex, replace):
    while True:
        match = re.search(regex, string)
        if match:
            string = string.replace(match.group(0), replace)
        else:
            break
    return string


def hostname():
    name = socket.gethostname()
    n = name.find('.')
    if n > 0:
        name = name[:n]
    return name


def get_filenames(directory, match='*.*', not_match=()):
    if match is not None:
        if isinstance(match, str):
            match = [match]
    if not_match is not None:
        if isinstance(not_match, str):
            not_match = [not_match]

    result = []
    for dirpath, _, filenames in os.walk(directory):
        filtered_matches = list(itertools.chain.from_iterable(
            [fnmatch.filter(filenames, x) for x in match]))
        filtered_nomatch = list(itertools.chain.from_iterable(
            [fnmatch.filter(filenames, x) for x in not_match]))
        matched = list(set(filtered_matches) - set(filtered_nomatch))
        result += [os.path.join(dirpath, x) for x in matched]
    return result


def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def str2str_or_none(v):
    if v.lower() == "none":
        return None
    return v


def str2dict(v):
    return ast.literal_eval(v)


def str2intlist(v):
    return [int(x.strip()) for x in v.strip()[1:-1].split(',')]


def str2list(v):
    return [str(x.strip()) for x in v.strip()[1:-1].split(',')]


def read_json(filename):

    def _convert_from_unicode(data):
        new_data = dict()
        for name, value in six.iteritems(data):
            if isinstance(name, six.string_types):
                name = unicodedata.normalize('NFKD', name).encode(
                    'ascii', 'ignore')
            if isinstance(value, six.string_types):
                value = unicodedata.normalize('NFKD', value).encode(
                    'ascii', 'ignore')
            if isinstance(value, dict):
                value = _convert_from_unicode(value)
            new_data[name] = value
        return new_data

    output_dict = None
    with open(filename, "r") as f:
        lines = f.readlines()
        try:
            output_dict = json.loads(''.join(lines), encoding='utf-8')
        except:
            raise ValueError('Could not read %s. %s' % (filename, sys.exc_info()[1]))
        output_dict = _convert_from_unicode(output_dict)
    return output_dict


def write_json(data_dict, filename):
    with open(filename, "w") as file:
        json.dump(data_dict, file)


def datestr():
    pacific = timezone('US/Pacific')
    now = datetime.now(pacific)
    return '{}{:02}{:02}_{:02}{:02}'.format(now.year, now.month, now.day, now.hour, now.minute)


def filter_list_of_strings(lst, include="*", exclude=()):
    filtered_matches = list(itertools.chain.from_iterable([fnmatch.filter(lst, x) for x in include]))
    filtered_nomatch = list(itertools.chain.from_iterable([fnmatch.filter(lst, x) for x in exclude]))
    matched = list(set(filtered_matches) - set(filtered_nomatch))
    return matched


# ----------------------------------------------------------------------------
# Writes all pairs to a filename for book keeping
# Either .txt or .json
# ----------------------------------------------------------------------------
def write_dictionary_to_file(arguments_dict, filename):
    # ensure dir
    d = os.path.dirname(filename)
    if not os.path.exists(d):
        os.makedirs(d)

    # check for json extension
    ext = os.path.splitext(filename)[1]
    if ext == ".json":

        def replace_quotes(x):
            return x.replace("\'", "\"")

        with open(filename, 'w') as file:
            file.write("{\n")
            for i, (key, value) in enumerate(arguments_dict):
                if isinstance(value, tuple):
                    value = list(value)
                if value is None:
                    file.write("  \"%s\": null" % key)
                elif isinstance(value, str):
                    value = value.replace("\'", "\"")
                    file.write("  \"%s\": \"%s\"" % (key, replace_quotes(str( value))))
                elif isinstance(value, bool):
                    file.write("  \"%s\": %s" % (key, str(value).lower()))
                else:
                    file.write("  \"%s\": %s" % (key, replace_quotes(str(value))))
                if i < len(arguments_dict) - 1:
                    file.write(',\n')
                else:
                    file.write('\n')
            file.write("}\n")
    else:
        with open(filename, 'w') as file:
            for key, value in arguments_dict:
                file.write('%s: %s\n' % (key, value))


class MovingAverage:
    postfix = "avg"

    def __init__(self):
        self._sum = 0.0
        self._count = 0

    def add_value(self, sigma, addcount=1):
        self._sum += sigma
        self._count += addcount

    def add_average(self, avg, addcount):
        self._sum += avg*addcount
        self._count += addcount

    def mean(self):
        return self._sum / self._count


class ExponentialMovingAverage:
    postfix = "ema"

    def __init__(self, alpha=0.7):
        self._weighted_sum = 0.0
        self._weighted_count = 0
        self._alpha = alpha

    def add_value(self, sigma, addcount=1):
        self._weighted_sum = sigma + (1.0 - self._alpha)*self._weighted_sum
        self._weighted_count = 1 + (1.0 - self._alpha)*self._weighted_count

    def add_average(self, avg, addcount):
        self._weighted_sum = avg*addcount + (1.0 - self._alpha)*self._weighted_sum
        self._weighted_count = addcount + (1.0 - self._alpha)*self._weighted_count

    def mean(self):
        return self._weighted_sum / self._weighted_count


# -----------------------------------------------------------------
# Subclass tqdm to achieve two things:
#   1) Output the progress bar into the logbook.
#   2) Remove the comma before {postfix} because it's annoying.
# -----------------------------------------------------------------
class TqdmToLogger(tqdm.tqdm):
    def __init__(self, iterable=None, desc=None, total=None, leave=True,
                 file=None, ncols=None, mininterval=0.1,
                 maxinterval=10.0, miniters=None, ascii=None, disable=False,
                 unit='it', unit_scale=False, dynamic_ncols=False,
                 smoothing=0.3, bar_format=None, initial=0, position=None,
                 postfix=None,
                 logging_on_close=True,
                 logging_on_update=False):

        super(TqdmToLogger, self).__init__(
            iterable=iterable, desc=desc, total=total, leave=leave,
            file=file, ncols=ncols, mininterval=mininterval,
            maxinterval=maxinterval, miniters=miniters, ascii=ascii, disable=disable,
            unit=unit, unit_scale=unit_scale, dynamic_ncols=dynamic_ncols,
            smoothing=smoothing, bar_format=bar_format, initial=initial, position=position,
            postfix=postfix)

        self._logging_on_close = logging_on_close
        self._logging_on_update = logging_on_update
        self._closed = False

    @staticmethod
    def format_meter(n, total, elapsed, ncols=None, prefix='', ascii=False,
                     unit='it', unit_scale=False, rate=None, bar_format=None,
                     postfix=None, unit_divisor=1000):

        meter = tqdm.tqdm.format_meter(
            n=n, total=total, elapsed=elapsed, ncols=ncols, prefix=prefix, ascii=ascii,
            unit=unit, unit_scale=unit_scale, rate=rate, bar_format=bar_format,
            postfix=postfix, unit_divisor=unit_divisor)

        # get rid of that stupid comma before the postfix
        if postfix is not None:
            postfix_with_comma = ", %s" % postfix
            meter = meter.replace(postfix_with_comma, postfix)

        return meter

    def update(self, n=1):
        if self._logging_on_update:
            msg = self.__repr__()
            logging.logbook(msg)
        return super(TqdmToLogger, self).update(n=n)

    def close(self):
        if self._logging_on_close and not self._closed:
            msg = self.__repr__()
            logging.logbook(msg)
            self._closed = True
        return super(TqdmToLogger, self).close()


def tqdm_with_logging(iterable=None, desc=None, total=None, leave=True,
                      ncols=None, mininterval=0.1,
                      maxinterval=10.0, miniters=None, ascii=None, disable=False,
                      unit="it", unit_scale=False, dynamic_ncols=False,
                      smoothing=0.3, bar_format=None, initial=0, position=None,
                      postfix=None,
                      logging_on_close=True,
                      logging_on_update=False):

    return TqdmToLogger(
        iterable=iterable, desc=desc, total=total, leave=leave,
        ncols=ncols, mininterval=mininterval,
        maxinterval=maxinterval, miniters=miniters, ascii=ascii, disable=disable,
        unit=unit, unit_scale=unit_scale, dynamic_ncols=dynamic_ncols,
        smoothing=smoothing, bar_format=bar_format, initial=initial, position=position,
        postfix=postfix,
        logging_on_close=logging_on_close,
        logging_on_update=logging_on_update)


def cd_dotdot(path_or_filename):
    return os.path.abspath(os.path.join(os.path.dirname(path_or_filename), ".."))


def cd_dotdotdot(path_or_filename):
    return os.path.abspath(os.path.join(os.path.dirname(path_or_filename), "../.."))


def cd_dotdotdotdot(path_or_filename):
    return os.path.abspath(os.path.join(os.path.dirname(path_or_filename), "../../.."))


def tensor2numpy(tensor):
    if isinstance(tensor, np.ndarray):
        return tensor
    else:
        if isinstance(tensor, torch.autograd.Variable):
            tensor = tensor.data
        if tensor.dim() == 3:
            return tensor.cpu().numpy().transpose([1,2,0])
        else:
            return tensor.cpu().numpy().transpose([0,2,3,1])


================================================
FILE: utils/__init__.py
================================================


================================================
FILE: utils/flow.py
================================================
from __future__ import absolute_import, division, print_function

import numpy as np
import png
import matplotlib.colors as cl

TAG_CHAR = np.array([202021.25], np.float32)
UNKNOWN_FLOW_THRESH = 1e7


def write_flow(filename, uv, v=None):
    nBands = 2

    if v is None:
        assert (uv.ndim == 3)
        assert (uv.shape[2] == 2)
        u = uv[:, :, 0]
        v = uv[:, :, 1]
    else:
        u = uv

    assert (u.shape == v.shape)
    height, width = u.shape
    f = open(filename, 'wb')
    # write the header
    f.write(TAG_CHAR)
    np.array(width).astype(np.int32).tofile(f)
    np.array(height).astype(np.int32).tofile(f)
    # arrange into matrix form
    tmp = np.zeros((height, width * nBands))
    tmp[:, np.arange(width) * 2] = u
    tmp[:, np.arange(width) * 2 + 1] = v
    tmp.astype(np.float32).tofile(f)
    f.close()


def write_flow_png(filename, uv, v=None, mask=None):

    if v is None:
        assert (uv.ndim == 3)
        assert (uv.shape[2] == 2)
        u = uv[:, :, 0]
        v = uv[:, :, 1]
    else:
        u = uv

    assert (u.shape == v.shape)

    height_img, width_img = u.shape
    if mask is None:
        valid_mask = np.ones([height_img, width_img])
    else:
        valid_mask = mask

    flow_u = np.clip((u * 64 + 2 ** 15), 0.0, 65535.0).astype(np.uint16)
    flow_v = np.clip((v * 64 + 2 ** 15), 0.0, 65535.0).astype(np.uint16)
    
    output = np.stack((flow_u, flow_v, valid_mask), axis=-1)

    with open(filename, 'wb') as f:
        writer = png.Writer(width=width_img, height=height_img, bitdepth=16)
        writer.write(f, np.reshape(output, (-1, width_img*3)))


def flow_to_png(flow_map, max_value=None):
    _, h, w = flow_map.shape    
    rgb_map = np.ones((h, w, 3)).astype(np.float32)
    if max_value is not None:
        normalized_flow_map = flow_map / max_value
    else:
        normalized_flow_map = flow_map / (np.abs(flow_map).max())
    rgb_map[:, :, 0] += normalized_flow_map[0]
    rgb_map[:, :, 1] -= 0.5 * (normalized_flow_map[0] + normalized_flow_map[1])
    rgb_map[:, :, 2] += normalized_flow_map[1]
    return rgb_map.clip(0, 1)


def compute_color(u, v):
    """
    compute optical flow color map
    :param u: optical flow horizontal map
    :param v: optical flow vertical map
    :return: optical flow in color code
    """
    [h, w] = u.shape
    img = np.zeros([h, w, 3])
    nanIdx = np.isnan(u) | np.isnan(v)
    u[nanIdx] = 0
    v[nanIdx] = 0

    colorwheel = make_color_wheel()
    ncols = np.size(colorwheel, 0)

    rad = np.sqrt(u ** 2 + v ** 2)

    a = np.arctan2(-v, -u) / np.pi

    fk = (a + 1) / 2 * (ncols - 1) + 1

    k0 = np.floor(fk).astype(int)

    k1 = k0 + 1
    k1[k1 == ncols + 1] = 1
    f = fk - k0

    for i in range(0, np.size(colorwheel, 1)):
        tmp = colorwheel[:, i]
        col0 = tmp[k0 - 1] / 255
        col1 = tmp[k1 - 1] / 255
        col = (1 - f) * col0 + f * col1

        idx = rad <= 1
        col[idx] = 1 - rad[idx] * (1 - col[idx])
        notidx = np.logical_not(idx)

        col[notidx] *= 0.75
        img[:, :, i] = np.uint8(np.floor(255 * col * (1 - nanIdx)))

    return img


def make_color_wheel():
    """
    Generate color wheel according Middlebury color code
    :return: Color wheel
    """
    RY = 15
    YG = 6
    GC = 4
    CB = 11
    BM = 13
    MR = 6

    ncols = RY + YG + GC + CB + BM + MR

    colorwheel = np.zeros([ncols, 3])

    col = 0

    # RY
    colorwheel[0:RY, 0] = 255
    colorwheel[0:RY, 1] = np.transpose(np.floor(255 * np.arange(0, RY) / RY))
    col += RY

    # YG
    colorwheel[col:col + YG, 0] = 255 - np.transpose(np.floor(255 * np.arange(0, YG) / YG))
    colorwheel[col:col + YG, 1] = 255
    col += YG

    # GC
    colorwheel[col:col + GC, 1] = 255
    colorwheel[col:col + GC, 2] = np.transpose(np.floor(255 * np.arange(0, GC) / GC))
    col += GC

    # CB
    colorwheel[col:col + CB, 1] = 255 - np.transpose(np.floor(255 * np.arange(0, CB) / CB))
    colorwheel[col:col + CB, 2] = 255
    col += CB

    # BM
    colorwheel[col:col + BM, 2] = 255
    colorwheel[col:col + BM, 0] = np.transpose(np.floor(255 * np.arange(0, BM) / BM))
    col += + BM

    # MR
    colorwheel[col:col + MR, 2] = 255 - np.transpose(np.floor(255 * np.arange(0, MR) / MR))
    colorwheel[col:col + MR, 0] = 255

    return colorwheel


def flow_to_png_middlebury(flow):
    """
    Convert flow into middlebury color code image
    :param flow: optical flow map
    :return: optical flow image in middlebury color
    """

    flow = flow.transpose([1, 2, 0])
    u = flow[:, :, 0]
    v = flow[:, :, 1]

    maxu = -999.
    maxv = -999.
    minu = 999.
    minv = 999.

    idxUnknow = (abs(u) > UNKNOWN_FLOW_THRESH) | (abs(v) > UNKNOWN_FLOW_THRESH)
    u[idxUnknow] = 0
    v[idxUnknow] = 0

    maxu = max(maxu, np.max(u))
    minu = min(minu, np.min(u))

    maxv = max(maxv, np.max(v))
    minv = min(minv, np.min(v))

    rad = np.sqrt(u ** 2 + v ** 2)
    maxrad = max(-1, np.max(rad))

    u = u / (maxrad + np.finfo(float).eps)
    v = v / (maxrad + np.finfo(float).eps)

    img = compute_color(u, v)

    idx = np.repeat(idxUnknow[:, :, np.newaxis], 3, axis=2)
    img[idx] = 0

    return np.uint8(img)

================================================
FILE: utils/interpolation.py
================================================
## Portions of Code from, copyright 2018 Jochen Gast

from __future__ import absolute_import, division, print_function

import torch
from torch import nn
import torch.nn.functional as tf


def _bchw2bhwc(tensor):
    return tensor.transpose(1,2).transpose(2,3)


def _bhwc2bchw(tensor):
    return tensor.transpose(2,3).transpose(1,2)


class Meshgrid(nn.Module):
    def __init__(self):
        super(Meshgrid, self).__init__()
        self.width = 0
        self.height = 0
        self.xx = None
        self.yy = None

    def _compute_meshgrid(self, width, height):
        rangex = torch.arange(0, width)
        rangey = torch.arange(0, height)
        self.xx = rangex.repeat(height, 1).contiguous()
        self.yy = rangey.repeat(width, 1).t().contiguous()
        
    def forward(self, width, height, device=None, dtype=None):
        if self.width != width or self.height != height:
            self._compute_meshgrid(width=width, height=height)
            self.width = width
            self.height = height
        self.xx = self.xx.to(device=device, dtype=dtype)
        self.yy = self.yy.to(device=device, dtype=dtype)
        return self.xx, self.yy


class BatchSub2Ind(nn.Module):
    def __init__(self):
        super(BatchSub2Ind, self).__init__()
        self.register_buffer("_offsets", torch.LongTensor())

    def forward(self, shape, row_sub, col_sub, out=None):
        batch_size = row_sub.size(0)
        height, width = shape
        ind = row_sub*width + col_sub
        torch.arange(batch_size, out=self._offsets)
        self._offsets *= (height*width)

        if out is None:
            return torch.add(ind, self._offsets.view(-1,1,1))
        else:
            torch.add(ind, self._offsets.view(-1,1,1), out=out)


class Interp2(nn.Module):
    def __init__(self, clamp=False):
        super(Interp2, self).__init__()
        self._clamp = clamp
        self._batch_sub2ind = BatchSub2Ind()
        self.register_buffer("_x0", torch.LongTensor())
        self.register_buffer("_x1", torch.LongTensor())
        self.register_buffer("_y0", torch.LongTensor())
        self.register_buffer("_y1", torch.LongTensor())
        self.register_buffer("_i00", torch.LongTensor())
        self.register_buffer("_i01", torch.LongTensor())
        self.register_buffer("_i10", torch.LongTensor())
        self.register_buffer("_i11", torch.LongTensor())
        self.register_buffer("_v00", torch.FloatTensor())
        self.register_buffer("_v01", torch.FloatTensor())
        self.register_buffer("_v10", torch.FloatTensor())
        self.register_buffer("_v11", torch.FloatTensor())
        self.register_buffer("_x", torch.FloatTensor())
        self.register_buffer("_y", torch.FloatTensor())

    def forward(self, v, xq, yq):
        batch_size, channels, height, width = v.size()

        # clamp if wanted
        if self._clamp:
            xq.clamp_(0, width - 1)
            yq.clamp_(0, height - 1)

        # ------------------------------------------------------------------
        # Find neighbors
        #
        # x0 = torch.floor(xq).long(),          x0.clamp_(0, width - 1)
        # x1 = x0 + 1,                          x1.clamp_(0, width - 1)
        # y0 = torch.floor(yq).long(),          y0.clamp_(0, height - 1)
        # y1 = y0 + 1,                          y1.clamp_(0, height - 1)
        #
        # ------------------------------------------------------------------
        self._x0 = torch.floor(xq).long().clamp(0, width - 1)
        self._y0 = torch.floor(yq).long().clamp(0, height - 1)

        self._x1 = torch.add(self._x0, 1).clamp(0, width - 1)
        self._y1 = torch.add(self._y0, 1).clamp(0, height - 1)

        # batch_sub2ind
        self._batch_sub2ind([height, width], self._y0, self._x0, out=self._i00)
        self._batch_sub2ind([height, width], self._y0, self._x1, out=self._i01)
        self._batch_sub2ind([height, width], self._y1, self._x0, out=self._i10)
        self._batch_sub2ind([height, width], self._y1, self._x1, out=self._i11)

        # reshape
        v_flat = _bchw2bhwc(v).contiguous().view(-1, channels)
        torch.index_select(v_flat, dim=0, index=self._i00.view(-1), out=self._v00)
        torch.index_select(v_flat, dim=0, index=self._i01.view(-1), out=self._v01)
        torch.index_select(v_flat, dim=0, index=self._i10.view(-1), out=self._v10)
        torch.index_select(v_flat, dim=0, index=self._i11.view(-1), out=self._v11)

        # local_coords
        torch.add(xq, - self._x0.float(), out=self._x)
        torch.add(yq, - self._y0.float(), out=self._y)

        # weights
        w00 = torch.unsqueeze((1.0 - self._y) * (1.0 - self._x), dim=1)
        w01 = torch.unsqueeze((1.0 - self._y) * self._x, dim=1)
        w10 = torch.unsqueeze(self._y * (1.0 - self._x), dim=1)
        w11 = torch.unsqueeze(self._y * self._x, dim=1)

        def _reshape(u):
            return _bhwc2bchw(u.view(batch_size, height, width, channels))

        # values
        values = _reshape(self._v00)*w00 + _reshape(self._v01)*w01 \
            + _reshape(self._v10)*w10 + _reshape(self._v11)*w11

        if self._clamp:
            return values
        else:
            #  find_invalid
            invalid = ((xq < 0) | (xq >= width) | (yq < 0) | (yq >= height)).unsqueeze(dim=1).float()
            # maskout invalid
            transformed = invalid * torch.zeros_like(values) + (1.0 - invalid)*values

        return transformed


class Interp2MaskBinary(nn.Module):
    def __init__(self, clamp=False):
        super(Interp2MaskBinary, self).__init__()
        self._clamp = clamp
        self._batch_sub2ind = BatchSub2Ind()
        self.register_buffer("_x0", torch.LongTensor())
        self.register_buffer("_x1", torch.LongTensor())
        self.register_buffer("_y0", torch.LongTensor())
        self.register_buffer("_y1", torch.LongTensor())
        self.register_buffer("_i00", torch.LongTensor())
        self.register_buffer("_i01", torch.LongTensor())
        self.register_buffer("_i10", torch.LongTensor())
        self.register_buffer("_i11", torch.LongTensor())
        self.register_buffer("_v00", torch.FloatTensor())
        self.register_buffer("_v01", torch.FloatTensor())
        self.register_buffer("_v10", torch.FloatTensor())
        self.register_buffer("_v11", torch.FloatTensor())
        self.register_buffer("_m00", torch.FloatTensor())
        self.register_buffer("_m01", torch.FloatTensor())
        self.register_buffer("_m10", torch.FloatTensor())
        self.register_buffer("_m11", torch.FloatTensor())
        self.register_buffer("_x", torch.FloatTensor())
        self.register_buffer("_y", torch.FloatTensor())

    def forward(self, v, xq, yq, mask):
        batch_size, channels, height, width = v.size()
        _, channels_mask, _, _ = mask.size()

        if channels_mask != channels:
            mask = mask.repeat(1, int(channels/channels_mask), 1, 1)

        # clamp if wanted
        if self._clamp:
            xq.clamp_(0, width - 1)
            yq.clamp_(0, height - 1)

        # ------------------------------------------------------------------
        # Find neighbors
        #
        # x0 = torch.floor(xq).long(),          x0.clamp_(0, width - 1)
        # x1 = x0 + 1,                          x1.clamp_(0, width - 1)
        # y0 = torch.floor(yq).long(),          y0.clamp_(0, height - 1)
        # y1 = y0 + 1,                          y1.clamp_(0, height - 1)
        #
        # ------------------------------------------------------------------
        self._x0 = torch.floor(xq).long().clamp(0, width - 1)
        self._y0 = torch.floor(yq).long().clamp(0, height - 1)

        self._x1 = torch.add(self._x0, 1).clamp(0, width - 1)
        self._y1 = torch.add(self._y0, 1).clamp(0, height - 1)

        # batch_sub2ind
        self._batch_sub2ind([height, width], self._y0, self._x0, out=self._i00)
        self._batch_sub2ind([height, width], self._y0, self._x1, out=self._i01)
        self._batch_sub2ind([height, width], self._y1, self._x0, out=self._i10)
        self._batch_sub2ind([height, width], self._y1, self._x1, out=self._i11)

        # reshape
        v_flat = _bchw2bhwc(v).contiguous().view(-1, channels)
        torch.index_select(v_flat, dim=0, index=self._i00.view(-1), out=self._v00)
        torch.index_select(v_flat, dim=0, index=self._i01.view(-1), out=self._v01)
        torch.index_select(v_flat, dim=0, index=self._i10.view(-1), out=self._v10)
        torch.index_select(v_flat, dim=0, index=self._i11.view(-1), out=self._v11)

        # reshape
        m_flat = _bchw2bhwc(mask).contiguous().view(-1, channels)
        torch.index_select(m_flat, dim=0, index=self._i00.view(-1), out=self._m00)
        torch.index_select(m_flat, dim=0, index=self._i01.view(-1), out=self._m01)
        torch.index_select(m_flat, dim=0, index=self._i10.view(-1), out=self._m10)
        torch.index_select(m_flat, dim=0, index=self._i11.view(-1), out=self._m11)

        # local_coords
        torch.add(xq, - self._x0.float(), out=self._x)
        torch.add(yq, - self._y0.float(), out=self._y)

        # weights
        w00 = torch.unsqueeze((1.0 - self._y) * (1.0 - self._x), dim=1)
        w01 = torch.unsqueeze((1.0 - self._y) * self._x, dim=1)
        w10 = torch.unsqueeze(self._y * (1.0 - self._x), dim=1)
        w11 = torch.unsqueeze(self._y * self._x, dim=1)

        def _reshape(u):
            return _bhwc2bchw(u.view(batch_size, height, width, channels))

        # values
        values = _reshape(self._m00) * _reshape(self._v00) * w00 + _reshape(self._m01) * _reshape(
            self._v01) * w01 + _reshape(self._m10) * _reshape(self._v10) * w10 + _reshape(self._m11) * _reshape(
            self._v11) * w11
        m_weights = _reshape(self._m00) * w00 + _reshape(self._m01) * w01 + _reshape(self._m10) * w10 + _reshape(
            self._m11) * w11
        values = values / (m_weights + 1e-12)
        invalid_mask = (((1 - m_weights) / (m_weights + 1e-12)) > 0.5)[:, 0:1, :, :]

        if self._clamp:
            return values
        else:
            #  find_invalid
            invalid = ((xq < 0) | (xq >= width) | (yq < 0) | (yq >= height) | invalid_mask.squeeze(dim=1)).unsqueeze(dim=1).float()
            transformed = invalid * torch.zeros_like(values) + (1.0 - invalid) * values

        return transformed, (1 - invalid_mask).float()


def resize2D(inputs, size_targets, mode="bilinear"):
    size_inputs = [inputs.size(2), inputs.size(3)]

    if all([size_inputs == size_targets]):
        return inputs  # nothing to do
    elif any([size_targets < size_inputs]):
        resized = tf.adaptive_avg_pool2d(inputs, size_targets)  # downscaling
    else:
        resized = tf.upsample(inputs, size=size_targets, mode=mode)  # upsampling

    # correct scaling
    return resized


def resize2D_as(inputs, output_as, mode="bilinear"):
    size_targets = [output_as.size(2), output_as.size(3)]
    return resize2D(inputs, size_targets, mode=mode)