Repository: googleinterns/deep-stabilization
Branch: master
Commit: 7159c09d21ae
Files: 65
Total size: 42.8 MB

Directory structure:
gitextract__lkvtuhi/

├── .gitignore
├── LICENSE
├── README.md
├── docs/
│   ├── code-of-conduct.md
│   └── contributing.md
└── dvs/
    ├── checkpoint/
    │   └── stabilzation/
    │       └── stabilzation_last.checkpoint
    ├── conf/
    │   ├── stabilzation.yaml
    │   └── stabilzation_train.yaml
    ├── dataset.py
    ├── flownet2/
    │   ├── LICENSE
    │   ├── README.md
    │   ├── __init__.py
    │   ├── convert.py
    │   ├── datasets.py
    │   ├── install.sh
    │   ├── losses.py
    │   ├── main.py
    │   ├── models.py
    │   ├── networks/
    │   │   ├── FlowNetC.py
    │   │   ├── FlowNetFusion.py
    │   │   ├── FlowNetS.py
    │   │   ├── FlowNetSD.py
    │   │   ├── __init__.py
    │   │   ├── channelnorm_package/
    │   │   │   ├── __init__.py
    │   │   │   ├── channelnorm.py
    │   │   │   ├── channelnorm_cuda.cc
    │   │   │   ├── channelnorm_kernel.cu
    │   │   │   ├── channelnorm_kernel.cuh
    │   │   │   └── setup.py
    │   │   ├── correlation_package/
    │   │   │   ├── __init__.py
    │   │   │   ├── correlation.py
    │   │   │   ├── correlation_cuda.cc
    │   │   │   ├── correlation_cuda_kernel.cu
    │   │   │   ├── correlation_cuda_kernel.cuh
    │   │   │   └── setup.py
    │   │   ├── resample2d_package/
    │   │   │   ├── __init__.py
    │   │   │   ├── resample2d.py
    │   │   │   ├── resample2d_cuda.cc
    │   │   │   ├── resample2d_kernel.cu
    │   │   │   ├── resample2d_kernel.cuh
    │   │   │   └── setup.py
    │   │   └── submodules.py
    │   ├── run.sh
    │   ├── run_release.sh
    │   └── utils/
    │       ├── __init__.py
    │       ├── flow_utils.py
    │       ├── frame_utils.py
    │       ├── param_utils.py
    │       └── tools.py
    ├── gyro/
    │   ├── __init__.py
    │   ├── gyro_function.py
    │   └── gyro_io.py
    ├── inference.py
    ├── load_frame_sensor_data.py
    ├── loss.py
    ├── metrics.py
    ├── model.py
    ├── printer.py
    ├── requirements.txt
    ├── train.py
    ├── util.py
    └── warp/
        ├── __init__.py
        ├── rasterizer.py
        ├── read_write.py
        └── warping.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
*.pyc
.torch
_ext
*.o
_ext/
*.png
*.jpg
*.tar
log/*


================================================
FILE: LICENSE
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# Deep Online Fused Video Stabilization

[[Paper]](https://openaccess.thecvf.com/content/WACV2022/papers/Shi_Deep_Online_Fused_Video_Stabilization_WACV_2022_paper.pdf)[[Supplementary]](https://zhmeishi.github.io/dvs/paper/dvs_supp.pdf)  [[Project Page]](https://zhmeishi.github.io/dvs/) [[Dataset]](https://storage.googleapis.com/dataset_release/all.zip) [[Our Result]](https://storage.googleapis.com/dataset_release/inference_result_release.zip) [[More Results]](https://zhmeishi.github.io/dvs/supp/results.html) 

This repository contains the PyTorch implementation of the method described in the paper "Deep Online Fused Video Stabilization".

## Environment Setting
- Python version >= 3.6
- PyTorch >= 1.0.0 with CUDA support (installation guide is [here](https://pytorch.org/get-started/locally/))

Install the other required packages:
```
cd dvs
pip install -r requirements.txt --ignore-installed
```

## Data Preparation
Download the sample video [here](https://drive.google.com/file/d/1PpF3-6BbQKy9fldjIfwa5AlbtQflx3sG/view?usp=sharing).
Uncompress it so that the *video* folder sits directly under the *dvs* folder, then run:
```
python load_frame_sensor_data.py 
```
Demo of curve visualization:
The **gyro/OIS curve visualization** can be found at *dvs/video/s_114_outdoor_running_trail_daytime/ControlCam_20200930_104820_real.jpg*.


## FlowNet2 Preparation
Note: we provide the optical flow results for one test video as part of Data Preparation. If you would like to generate them for all test videos, please follow the [FlowNet2 official repository](https://github.com/NVIDIA/flownet2-pytorch) and the guide below. Otherwise, you can skip this section.

Note: FlowNet2 installation is tricky. Please use Python 3.6 and PyTorch 1.0.0. More details are [here](https://github.com/NVIDIA/flownet2-pytorch/issues/156); feel free to contact us with any questions.

Download FlowNet2 model *FlowNet2_checkpoint.pth.tar* [here](https://drive.google.com/file/d/1hF8vS6YeHkx3j2pfCeQqqZGwA_PJq_Da/view).  Move it under folder *dvs/flownet2*.
```
python warp/read_write.py # video2frames
cd flownet2
bash install.sh # install package
bash run.sh # generate optical flow file for dataset
``` 

## Running Inference 
```
python inference.py
python metrics.py
``` 
The loss and metric information will be printed in the terminal. The metric numbers can differ slightly depending on the OpenCV/PyTorch versions in use.  

The result is under *dvs/test/stabilzation*.   
In *s_114_outdoor_running_trail_daytime.jpg*, the blue curve is the output of our models, and the green curve is the input.   
*s_114_outdoor_running_trail_daytime_stab.mp4* is the uncropped stabilized video.  
*s_114_outdoor_running_trail_daytime_stab_crop.mp4* is the cropped stabilized video. Note that the cropped video is only generated after running the metrics code.   

## Training
Download the dataset for training and testing [here](https://storage.googleapis.com/dataset_release/all.zip).
Uncompress *all.zip* and move the *dataset_release* folder under the *dvs* folder.

Follow the FlowNet2 Preparation section above, then run:
```
python warp/read_write.py --dir_path ./dataset_release # video2frames
cd flownet2
bash run_release.sh # generate optical flow file for dataset
``` 

Run training code.
```
python train.py
``` 
The model is saved in *checkpoint/stabilzation_train*.

## Citation 
If you use this code or dataset for your research, please cite our paper.
```
@inproceedings{shi2022deep,
  title={Deep Online Fused Video Stabilization},
  author={Shi, Zhenmei and Shi, Fuhao and Lai, Wei-Sheng and Liang, Chia-Kai and Liang, Yingyu},
  booktitle={Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision},
  pages={1250--1258},
  year={2022}
}
```


================================================
FILE: docs/code-of-conduct.md
================================================
# Google Open Source Community Guidelines

At Google, we recognize and celebrate the creativity and collaboration of open
source contributors and the diversity of skills, experiences, cultures, and
opinions they bring to the projects and communities they participate in.

Every one of Google's open source projects and communities are inclusive
environments, based on treating all individuals respectfully, regardless of
gender identity and expression, sexual orientation, disabilities,
neurodiversity, physical appearance, body size, ethnicity, nationality, race,
age, religion, or similar personal characteristic.

We value diverse opinions, but we value respectful behavior more.

Respectful behavior includes:

* Being considerate, kind, constructive, and helpful.
* Not engaging in demeaning, discriminatory, harassing, hateful, sexualized, or
  physically threatening behavior, speech, and imagery.
* Not engaging in unwanted physical contact.

Some Google open source projects [may adopt][] an explicit project code of
conduct, which may have additional detailed expectations for participants. Most
of those projects will use our [modified Contributor Covenant][].

[may adopt]: https://opensource.google/docs/releasing/preparing/#conduct
[modified Contributor Covenant]: https://opensource.google/docs/releasing/template/CODE_OF_CONDUCT/

## Resolve peacefully

We do not believe that all conflict is necessarily bad; healthy debate and
disagreement often yields positive results. However, it is never okay to be
disrespectful.

If you see someone behaving disrespectfully, you are encouraged to address the
behavior directly with those involved. Many issues can be resolved quickly and
easily, and this gives people more control over the outcome of their dispute.
If you are unable to resolve the matter for any reason, or if the behavior is
threatening or harassing, report it. We are dedicated to providing an
environment where participants feel welcome and safe.

## Reporting problems

Some Google open source projects may adopt a project-specific code of conduct.
In those cases, a Google employee will be identified as the Project Steward,
who will receive and handle reports of code of conduct violations. In the event
that a project hasn’t identified a Project Steward, you can report problems by
emailing opensource@google.com.

We will investigate every complaint, but you may not receive a direct response.
We will use our discretion in determining when and how to follow up on reported
incidents, which may range from not taking action to permanent expulsion from
the project and project-sponsored spaces. We will notify the accused of the
report and provide them an opportunity to discuss it before any action is
taken. The identity of the reporter will be omitted from the details of the
report supplied to the accused. In potentially harmful situations, such as
ongoing harassment or threats to anyone's safety, we may take action without
notice.

*This document was adapted from the [IndieWeb Code of Conduct][] and can also
be found at <https://opensource.google/conduct/>.*

[IndieWeb Code of Conduct]: https://indieweb.org/code-of-conduct


================================================
FILE: docs/contributing.md
================================================
# How to Contribute

We'd love to accept your patches and contributions to this project. There are
just a few small guidelines you need to follow.

## Contributor License Agreement

Contributions to this project must be accompanied by a Contributor License
Agreement. You (or your employer) retain the copyright to your contribution;
this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to <https://cla.developers.google.com/> to see
your current agreements on file or to sign a new one.

You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.

## Code reviews

All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.

## Community Guidelines

This project follows [Google's Open Source Community
Guidelines](https://opensource.google/conduct/).


================================================
FILE: dvs/checkpoint/stabilzation/stabilzation_last.checkpoint
================================================
[File too large to display: 42.5 MB]

================================================
FILE: dvs/conf/stabilzation.yaml
================================================
data:
  exp: 'stabilzation'
  checkpoints_dir: './checkpoint'
  log: './log'
  data_dir: './video'           
  use_cuda: true
  batch_size: 16
  resize_ratio: 0.25
  number_real: 10
  number_virtual: 2
  time_train: 2000  # ms
  sample_freq: 40   # ms
  channel_size: 1
  num_workers: 16                    # num_workers for data_loader
model:
  load_model:  null
  cnn:
    activate_function: relu         # sigmoid, relu, tanh, quadratic
    batch_norm: true
    gap: false
    layers:
  rnn:
    layers:  
    - - 512                        
      - true  
    - - 512                        
      - true    
  fc:
    activate_function: relu
    batch_norm: false               # (batch_norm and drop_out) is False
    layers:  
    - - 256                        
      - true  
    - - 4                         # last layer should be equal to nr_class
      - true
    drop_out: 0
train:
  optimizer: "adam"                  # adam or sgd
  momentum: 0.9                     # for sgd
  decay_epoch: null
  epoch: 400
  snapshot: 2
  init_lr: 0.0001
  lr_decay: 0.5
  lr_step: 200                       # if > 0 decay_epoch should be null
  seed: 1
  weight_decay: 0.0001
  clip_norm: False
  init: "xavier_uniform"            # xavier_uniform or xavier_normal
loss:
  follow: 10
  angle: 1
  smooth: 10 #10
  c2_smooth: 200 #20
  undefine: 2.0
  opt: 0.1
  stay: 0

================================================
FILE: dvs/conf/stabilzation_train.yaml
================================================
data:
  exp: 'stabilzation_train'
  checkpoints_dir: './checkpoint'
  log: './log'
  data_dir: './dataset_release'           
  use_cuda: true
  batch_size: 16
  resize_ratio: 0.25
  number_real: 10
  number_virtual: 2
  time_train: 2000  # ms
  sample_freq: 40   # ms
  channel_size: 1
  num_workers: 16                    # num_workers for data_loader
model:
  load_model:  null
  cnn:
    activate_function: relu         # sigmoid, relu, tanh, quadratic
    batch_norm: true
    gap: false
    layers:
  rnn:
    layers:  
    - - 512                        
      - true  
    - - 512                        
      - true    
  fc:
    activate_function: relu
    batch_norm: false               # (batch_norm and drop_out) is False
    layers:  
    - - 256                        
      - true  
    - - 4                         # last layer should be equal to nr_class
      - true
    drop_out: 0
train:
  optimizer: "adam"                  # adam or sgd
  momentum: 0.9                     # for sgd
  decay_epoch: null
  epoch: 400
  snapshot: 2
  init_lr: 0.0001
  lr_decay: 0.5
  lr_step: 200                       # if > 0 decay_epoch should be null
  seed: 1
  weight_decay: 0.0001
  clip_norm: False
  init: "xavier_uniform"            # xavier_uniform or xavier_normal
loss:
  follow: 10
  angle: 1
  smooth: 10 #10
  c2_smooth: 200 #20
  undefine: 2.0
  opt: 0.1
  stay: 0

================================================
FILE: dvs/dataset.py
================================================
from torch.utils.data import Dataset
import os
import collections
from gyro import (
    LoadGyroData, 
    LoadOISData, 
    LoadFrameData, 
    GetGyroAtTimeStamp, 
    get_static, 
    GetMetadata, 
    GetProjections, 
    train_GetGyroAtTimeStamp,
    QuaternionProduct,
    QuaternionReciprocal,
    FindOISAtTimeStamp,
    norm_quat
    )
import random
import numpy as np
import torchvision.transforms as transforms
import torch
from flownet2 import flow_utils
from scipy import ndimage, misc
from numpy import linalg as LA

def get_data_loader(cf, no_flo = False):
    """Build the training and test ``DataLoader`` objects described by ``cf``.

    Args:
        cf: Configuration mapping. ``cf["data"]["batch_size"]`` and
            ``cf["data"]["num_workers"]`` are read here; the whole mapping is
            also forwarded to ``get_dataset``.
        no_flo: Forwarded unchanged to ``get_dataset``.

    Returns:
        Tuple ``(trainloader, testloader)``. Only the training loader shuffles
        its samples; both loaders pin memory and share the same batch size and
        worker count.
    """
    data_cf = cf["data"]
    # Options common to both loaders; only ``shuffle`` differs between them.
    loader_options = {
        "batch_size": data_cf["batch_size"],
        "pin_memory": True,
        "num_workers": data_cf["num_workers"],
    }
    train_data, test_data = get_dataset(cf, no_flo)
    trainloader = torch.utils.data.DataLoader(train_data, shuffle=True, **loader_options)
    testloader = torch.utils.data.DataLoader(test_data, shuffle=False, **loader_options)
    return trainloader, testloader

def get_dataset(cf, no_flo = False):
    """Create the training and test ``Dataset_Gyro`` instances.

    The data directory is expected to contain ``training`` and ``test``
    sub-directories. When one of them does not exist, the data directory
    itself is used for that split instead.

    Args:
        cf: Configuration mapping; the ``"data"`` section supplies
            ``data_dir``, ``resize_ratio``, ``sample_freq``, ``number_real``
            and ``time_train``.
        no_flo: Forwarded to ``Dataset_Gyro`` as ``no_flo``.

    Returns:
        Tuple ``(train_data, test_data)``.
    """
    data_cf = cf["data"]
    resize_ratio = data_cf["resize_ratio"]
    train_transform, test_transform = _data_transforms()

    root = data_cf["data_dir"]

    def _split_dir(split_name):
        # Use <data_dir>/<split_name> when present, otherwise data_dir itself.
        candidate = os.path.join(root, split_name)
        return candidate if os.path.exists(candidate) else root

    # Resolve both directories before any dataset is constructed.
    train_path = _split_dir("training")
    test_path = _split_dir("test")

    # Config times are scaled by 1e6 (the YAML annotates them as ms and the
    # Dataset_Gyro docstring as ns).
    scale = 1000000
    shared = {
        "sample_freq": data_cf["sample_freq"] * scale,
        "number_real": data_cf["number_real"],
        "time_train": data_cf["time_train"] * scale,
        "resize_ratio": resize_ratio,
        "no_flo": no_flo,
    }

    train_data = Dataset_Gyro(train_path, transform=train_transform, **shared)
    test_data = Dataset_Gyro(test_path, transform=test_transform, **shared)
    return train_data, test_data

def get_inference_data_loader(cf, data_path, no_flo = False):
    """Return a ``DataLoader`` over the inference dataset for ``data_path``.

    Args:
        cf: Configuration mapping, forwarded to ``get_inference_dataset``.
        data_path: Path forwarded to ``get_inference_dataset``.
        no_flo: Forwarded unchanged to ``get_inference_dataset``.

    Returns:
        A non-shuffling loader with batch size 1, one worker and pinned memory.
    """
    return torch.utils.data.DataLoader(
        get_inference_dataset(cf, data_path, no_flo),
        batch_size=1,
        shuffle=False,
        pin_memory=True,
        num_workers=1,
    )

def get_inference_dataset(cf, data_path, no_flo = False):
    """Create a ``Dataset_Gyro`` in inference-only mode for ``data_path``.

    Args:
        cf: Configuration mapping; the ``"data"`` section supplies
            ``resize_ratio``, ``sample_freq``, ``number_real`` and
            ``time_train``.
        data_path: Path handed to ``Dataset_Gyro`` as its ``path`` argument.
        no_flo: Forwarded to ``Dataset_Gyro`` as ``no_flo``.

    Returns:
        The constructed ``Dataset_Gyro`` (built with ``inference_only=True``).
    """
    data_cf = cf["data"]
    # Config times are scaled by 1e6 (annotated as ms in the YAML, ns in the
    # Dataset_Gyro docstring).
    scale = 1000000
    _, test_transform = _data_transforms()
    return Dataset_Gyro(
        data_path,
        sample_freq=data_cf["sample_freq"] * scale,
        number_real=data_cf["number_real"],
        time_train=data_cf["time_train"] * scale,
        transform=test_transform,
        resize_ratio=data_cf["resize_ratio"],
        inference_only=True,
        no_flo=no_flo,
    )

def _data_transforms():
    """Return the ``(train_transform, test_transform)`` pair.

    Both are separate ``transforms.Compose`` pipelines that currently contain
    a single ``transforms.ToTensor()`` step.
    """
    train_transform = transforms.Compose([transforms.ToTensor()])
    test_transform = transforms.Compose([transforms.ToTensor()])
    return train_transform, test_transform

class DVS_data:
    """Plain container for the data loaded for one video.

    Every field starts out empty and is expected to be filled in by the
    loading code (``Dataset_Gyro.process_one_video`` assigns ``frame`` and
    ``gyro``).
    """

    def __init__(self):
        # Sensor / frame data; nothing loaded yet.
        self.gyro = self.ois = self.frame = None
        # Starts at zero; Dataset_Gyro reads it as ``number_train`` in
        # inference mode.
        self.length = 0
        # Optical-flow related fields (presumably file paths and array shape;
        # confirm against the loading code).
        self.flo_path = self.flo_shape = self.flo_back_path = None

class Dataset_Gyro(Dataset):
    """Dataset over recorded videos with gyro, OIS and frame metadata.

    Each item is a window of `number_train` consecutive frames from one video,
    described by gyro-derived quaternion features around every frame, the
    frame timestamps, per-frame projections and (optionally) optical flow.
    See `__getitem__` for the exact tuple that is returned.
    """
    def __init__(self, path, sample_freq = 33*1000000, number_real = 10, time_train = 2000*1000000, \
        transform = None, inference_only = False, no_flo = False, resize_ratio = 1): 
        r"""
        Arguments:
            path: with inference_only, the directory of a single video;
                otherwise a directory whose entries are each one video directory.
            sample_freq: real quaternions [t-sample_freq*number_real, t+sample_freq*number_real] ns
            number_real: real gyro num in half time_interval
            time_train: time for a batch ns
            transform: accepted but not stored or used by this class.
            inference_only: load exactly one video and use all of its frames
                as a single window.
            no_flo: when True, __getitem__ does not load optical flow.
            resize_ratio: stored on the instance; not read by any method here.
        """
        self.sample_freq = sample_freq
        self.number_real = number_real
        self.no_flo = no_flo
        self.resize_ratio = resize_ratio
        self.static_options = get_static()
        self.inference_only = inference_only

        # Divisor applied to raw OIS readings in get_ois_at_timestamp:
        # (crop window size / full size) per axis, scaled by 0.01.
        self.ois_ratio = np.array([self.static_options["crop_window_width"] / self.static_options["width"], \
            self.static_options["crop_window_height"] / self.static_options["height"]]) * 0.01
        # Size of one feature vector in generate_quaternions (last axis of
        # sample_data), i.e. one 4-component quaternion.
        self.unit_size = 4

        if inference_only:
            # Single video; the window spans every usable frame of it.
            # Note: time_train / data_name are not set on this path.
            self.length = 1
            self.data = [self.process_one_video(path)]
            self.number_train = self.data[0].length  
            return

        self.time_train = time_train
        # Number of frames per training window.
        self.number_train = time_train//self.sample_freq
        
        # One DVS_data per entry of `path`, in sorted name order.
        self.data_name = sorted(os.listdir(path))
        self.length = len(self.data_name)
        self.data = []
        for i in range(self.length):
            self.data.append(self.process_one_video(os.path.join(path,self.data_name[i])))
    
    def process_one_video(self, path):
        """Load every recognised file in the video directory `path`.

        Files are matched by name: frame metadata ("frame" and "txt"), gyro
        ("gyro"), OIS ("ois" and "txt"), and the "flo" / "flo_back" entries.
        Progress is printed to stdout.

        Returns:
            A DVS_data whose `length` is the frame count minus one, further
            limited to the number of forward-flow files when those exist.
        """
        dvs_data = DVS_data()
        files = sorted(os.listdir(path))
        print(path)
        for f in files:
            file_path = os.path.join(path,f)
            # NOTE(review): the check is on the full path, so a "gimbal"
            # anywhere in the directory path skips every file -- confirm intent.
            if "gimbal" in file_path.lower():
                continue
            if "frame" in f and "txt" in f:
                dvs_data.frame = LoadFrameData(file_path)
                print("frame:", dvs_data.frame.shape, end="    ")
            elif "gyro" in f:
                dvs_data.gyro = LoadGyroData(file_path)
                # Prepend mirrored samples before the first gyro reading.
                dvs_data.gyro = preprocess_gyro(dvs_data.gyro) 
                print("gyro:", dvs_data.gyro.shape, end="    ")
            elif "ois" in f and "txt" in f:
                dvs_data.ois = LoadOISData(file_path)
                print("ois:", dvs_data.ois.shape, end="    ")
            elif f == "flo":
                dvs_data.flo_path, dvs_data.flo_shape = LoadFlow(file_path)
                print("flo_path:", len(dvs_data.flo_path), end="    ")
                print("flo_shape:", dvs_data.flo_shape, end="    ")
            elif f == "flo_back":
                # Shape is taken from the forward flow; only the paths are kept.
                dvs_data.flo_back_path, _ = LoadFlow(file_path)
            
        print()
        # Raises AttributeError here if no frame file was found (frame is None).
        if dvs_data.flo_path is not None:
            dvs_data.length = min(dvs_data.frame.shape[0] - 1, len(dvs_data.flo_path))
        else:
            dvs_data.length = dvs_data.frame.shape[0] - 1 
        return dvs_data

    def generate_quaternions(self, dvs_data):
        """Sample one window of `number_train` frames from `dvs_data`.

        Returns:
            sample_data: float32, shape (number_train, (2*number_real+1)*unit_size);
                for each frame, the quaternion differences at timestamps
                t + sample_freq*j, j in [-number_real, number_real], relative
                to that frame's entry of real_postion.
            sample_time: float32, shape (number_train+1,); timestamp of the
                frame before the window followed by each frame's timestamp.
            first_id: index of the first frame of the window (>= 1).
            real_postion: float32, shape (number_train, 4); gyro value at
                (frame timestamp - sample_freq).
            sample_ois: float32, shape (number_train, 2); scaled OIS value at
                each frame timestamp.
        """
        # randint is inclusive on both ends, so first_id lies in
        # [1, length - number_train + 1]; in inference mode it is always 1.
        first_id = random.randint(0, dvs_data.length - self.number_train) + 1 # skip the first frame

        sample_data = np.zeros((self.number_train, 2 * self.number_real + 1, self.unit_size), dtype=np.float32)
        sample_ois = np.zeros((self.number_train, 2), dtype=np.float32)

        # NOTE(review): timestamps are stored as float32; if they are
        # nanosecond-scale values this rounds them noticeably -- confirm intended.
        sample_time = np.zeros((self.number_train+1), dtype=np.float32)
        sample_time[0] = get_timestamp(dvs_data.frame, first_id - 1)

        real_postion = np.zeros((self.number_train, 4), dtype=np.float32)

        # Not used further in this method.
        time_start = sample_time[0]

        for i in range(self.number_train):
            sample_time[i+1] = get_timestamp(dvs_data.frame, first_id + i)
            # Reference orientation: gyro one sample period before the frame.
            real_postion[i] = GetGyroAtTimeStamp(dvs_data.gyro, sample_time[i+1] - self.sample_freq)
            sample_ois[i] = self.get_ois_at_timestamp(dvs_data.ois, sample_time[i+1])
            for j in range(-self.number_real, self.number_real+1):
                index = j + self.number_real
                time_stamp = sample_time[i+1] + self.sample_freq * j 
                sample_data[i, index] = self.get_data_at_timestamp(dvs_data.gyro, dvs_data.ois, time_stamp, real_postion[i])
                
        # Flatten the (samples, unit_size) axes into one feature axis per frame.
        sample_data = np.reshape(sample_data, (self.number_train, (2*self.number_real+1) * self.unit_size))
        return sample_data, sample_time, first_id, real_postion, sample_ois

    def load_flo(self, idx, first_id):
        """Read forward and backward flow for the window starting at `first_id`.

        Uses the flow files at positions first_id-1 ... first_id+number_train-2
        of video `idx`. Requires both flo_path and flo_back_path to be set.

        Returns:
            (flo, flo_back), each of shape (number_train, h, w, 2) with NumPy's
            default float dtype (the float32 reads are stored into it).
        """
        shape = self.data[idx].flo_shape
        h, w = shape[0], shape[1]
        flo = np.zeros((self.number_train, h, w, 2))
        flo_back = np.zeros((self.number_train, h, w, 2))

        for i in range(self.number_train):
            frame_id = i + first_id
            f = flow_utils.readFlow(self.data[idx].flo_path[frame_id-1]).astype(np.float32) 
            flo[i] = f

            f_b = flow_utils.readFlow(self.data[idx].flo_back_path[frame_id-1]).astype(np.float32) 
            flo_back[i] = f_b

        return flo, flo_back

    def load_real_projections(self, idx, first_id):
        """Compute projections for frames first_id-1 ... first_id+number_train-1.

        Returns:
            Array of shape (number_train + 1, num_grid_rows, 3, 3).
        """
        real_projections = np.zeros((self.number_train + 1, self.static_options["num_grid_rows"], 3, 3))
        for i in range(self.number_train + 1):
            frame_id = i + first_id
            metadata = GetMetadata(self.data[idx].frame, frame_id - 1)
            # An all-zero array of the OIS table's shape is passed instead of
            # the recorded OIS data, with no_shutter=True.
            real_projections[i] = np.array(GetProjections(self.static_options, metadata, self.data[idx].gyro, np.zeros(self.data[idx].ois.shape), no_shutter = True))
        return real_projections

    def __getitem__(self, idx):
        """Return one randomly positioned window from video `idx`.

        Returns the tuple
        (inputs, times, flo, flo_back, real_projections, real_postion, ois, idx);
        flo and flo_back are the integer 0 when `no_flo` is set.
        """
        inputs, times, first_id, real_postion, ois = self.generate_quaternions(self.data[idx]) 
        real_projections = self.load_real_projections(idx, first_id)
        if self.no_flo:
            flo, flo_back = 0, 0
        else:
            flo, flo_back = self.load_flo(idx, first_id)
        return inputs, times, flo, flo_back, real_projections, real_postion, ois, idx

    def __len__(self):
        """Number of videos held by the dataset."""
        return self.length

    def get_virtual_data(self, virtual_queue, real_queue_idx, pre_times, cur_times, time_start, batch_size, number_virtual, quat_t_1):
        """Gather past virtual-camera quaternions for every batch element.

        For element i, `number_virtual` timestamps are taken at
        cur_times[i] - sample_freq * k for k = number_virtual ... 1, and each
        is looked up with get_virtual_at_timestamp relative to quat_t_1[i].
        In addition the absolute value at pre_times[i] is looked up.

        Arguments:
            quat_t_1: object with a `.numpy()` method (presumably a torch
                tensor of per-element reference quaternions -- confirm).

        Returns:
            (virtual_data, vt_1) as float torch tensors of shapes
            (batch_size, number_virtual * 4) and (batch_size, 4).
        """
        # virtual_queue: [batch_size, num, 5 (timestamp, quats)]
        # Euler angle,
        # delta R, angular velocity [Q't-1, Q't-2]
        # output virtual angular velocity, x, x*dtime => delta Qt
        virtual_data = np.zeros((batch_size, number_virtual, 4), dtype=np.float32)
        vt_1 = np.zeros((batch_size, 4), dtype=np.float32)
        quat_t_1 = quat_t_1.numpy()
        for i in range(batch_size):
            sample_time = cur_times[i]
            for j in range(number_virtual):
                time_stamp = sample_time - self.sample_freq * (number_virtual - j) 
                virtual_data[i, j] = get_virtual_at_timestamp(virtual_queue[i], self.data[real_queue_idx[i]].gyro, time_stamp, time_start[i], quat_t_1[i])
            # No reference quaternion here, so the absolute value is returned.
            vt_1[i] = get_virtual_at_timestamp(virtual_queue[i], self.data[real_queue_idx[i]].gyro, pre_times[i], time_start[i], None)
        virtual_data = np.reshape(virtual_data, (batch_size, number_virtual * 4))
        return torch.tensor(virtual_data, dtype=torch.float), torch.tensor(vt_1, dtype=torch.float)

    def update_virtual_queue(self, batch_size, virtual_queue, out, times):
        """Append one (timestamp, quaternion) entry per batch element.

        Returns the new queue of shape (batch_size, n + 1, 5); the input queue
        is not modified in place.
        """
        virtual_data = np.zeros((batch_size, 5))
        virtual_data[:,0] = times
        virtual_data[:, 1:] = out
        virtual_data = np.expand_dims(virtual_data, axis = 1)

        # A queue that still contains None (presumably the caller's
        # "not initialised yet" marker -- confirm) is replaced outright.
        if None in virtual_queue:
            virtual_queue = virtual_data
        else:
            virtual_queue = np.concatenate((virtual_queue, virtual_data), axis = 1)
        return virtual_queue

    def random_init_virtual_queue(self, batch_size, real_postion, times):
        """Create a 3-entry queue seeded with a randomly perturbed orientation.

        The three entries carry timestamps times - 2.1, 1.1 and 0.1 sample
        periods and, per batch element, the same quaternion: real_postion[i]
        multiplied by a small random rotation.

        Returns:
            Array of shape (batch_size, 3, 5) laid out as (timestamp, quaternion).
        """
        virtual_queue = np.zeros((batch_size, 3, 5))
        virtual_queue[:, 2, 0] = times - 0.1 * self.sample_freq
        virtual_queue[:, 1, 0] = times - 1.1 * self.sample_freq
        virtual_queue[:, 0, 0] = times - 2.1 * self.sample_freq
        for i in range(batch_size):
            # Random first three components, last component fixed to 1, then
            # normalised to unit length.
            quat = np.random.uniform(low=-0.06, high= 0.06, size=4) # transfer to angle # 0.05
            quat[3] = 1
            quat = quat / LA.norm(quat)
            quat = norm_quat(QuaternionProduct(real_postion[i], quat))
            virtual_queue[i, 2, 1:] = quat
            virtual_queue[i, 1, 1:] = quat
            virtual_queue[i, 0, 1:] = quat
        return virtual_queue

    def get_data_at_timestamp(self, gyro_data, ois_data, time_stamp, quat_t_1):
        """Return the gyro quaternion at `time_stamp` relative to `quat_t_1`.

        Computed as QuaternionProduct(q(time_stamp), QuaternionReciprocal(quat_t_1)).
        `ois_data` is accepted but not used.
        """
        quat_t = GetGyroAtTimeStamp(gyro_data, time_stamp)
        quat_dif = QuaternionProduct(quat_t, QuaternionReciprocal(quat_t_1))  
        return quat_dif

    def get_ois_at_timestamp(self, ois_data, time_stamp):
        """Return the OIS reading at `time_stamp` divided by `self.ois_ratio`."""
        ois_t = FindOISAtTimeStamp(ois_data, time_stamp)
        ois_t = np.array(ois_t) / self.ois_ratio
        return ois_t

def get_timestamp(frame_data, idx):
    """Return the reference timestamp of frame `idx`.

    The value is the frame's "timestamp_ns" plus half of its "rs_time_ns",
    both taken from `GetMetadata(frame_data, idx)` (presumably the start of
    readout plus half the rolling-shutter readout time -- confirm).

    Arguments:
        frame_data: frame table accepted by `GetMetadata`.
        idx: index of the frame.
    """
    # The original also read frame_data[idx, 0] into a local that was never
    # used; that dead lookup has been removed.
    metadata = GetMetadata(frame_data, idx)
    return metadata["timestamp_ns"] + metadata["rs_time_ns"] * 0.5

def preprocess_gyro(gyro, extend = 200):
    """Prepend time-mirrored samples to a gyro recording.

    Rows 1..extend of `gyro` are reflected about row 0 and placed in front of
    the recording, so that lookups slightly before the first real sample still
    find data. For a reflected row, column 0 becomes
    `t0 - (t - t0)` (with t0 = gyro[0, 0]), columns 1:4 are negated and
    column 4 is copied (presumably timestamp + (x, y, z, w) quaternion, which
    makes the reflected value the conjugate rotation -- confirm).

    Arguments:
        gyro: 2-D NumPy array with 5 columns.
        extend: number of rows to prepend. If the recording has fewer than
            `extend + 1` rows, only the rows that can be mirrored are added
            (the original raised IndexError in that case).

    Returns:
        A new array of shape (n_pad + len(gyro), 5); `gyro` is not modified.
    """
    # Row 0 is the mirror axis, so at most len(gyro) - 1 rows can be reflected.
    n_pad = max(0, min(extend, gyro.shape[0] - 1))
    fake_gyro = np.zeros((n_pad, 5))
    if n_pad > 0:
        time_start = gyro[0, 0]
        # Rows n_pad, n_pad - 1, ..., 1: the farthest sample comes first so
        # the padded timestamps stay in increasing order.
        mirrored = gyro[n_pad:0:-1]
        fake_gyro[:, 0] = time_start - (mirrored[:, 0] - time_start)
        fake_gyro[:, 1:4] = -mirrored[:, 1:4]
        fake_gyro[:, 4] = mirrored[:, 4]

    return np.concatenate((fake_gyro, gyro), axis = 0)

def LoadFlow(path):
    """List the flow files in `path` and report the shape of the first one.

    Returns:
        (file_path, shape): the directory entries joined onto `path`, in
        sorted name order, and the `.shape` of the array that
        `flow_utils.readFlow` returns for the first entry.

    Raises:
        IndexError: if the directory is empty.
    """
    file_path = [os.path.join(path, name) for name in sorted(os.listdir(path))]
    first_flow = flow_utils.readFlow(file_path[0])
    return file_path, first_flow.shape

def get_virtual_at_timestamp(virtual_queue, real_queue, time_stamp, time_start, quat_t_1 = None, sample_freq = None):
    """Look up the quaternion at `time_stamp`, preferring the virtual queue.

    The virtual queue is queried with `train_GetGyroAtTimeStamp`; when there is
    no virtual queue, or that lookup yields None, the real gyro data is queried
    with `GetGyroAtTimeStamp` instead.

    Arguments:
        virtual_queue: virtual-camera queue, or None.
        real_queue: real gyro data used as fallback.
        time_stamp: timestamp to look up.
        time_start, sample_freq: accepted but not used.
        quat_t_1: optional reference quaternion.

    Returns:
        The quaternion itself when `quat_t_1` is None, otherwise
        QuaternionProduct(quaternion, QuaternionReciprocal(quat_t_1)).
    """
    quat_t = None
    if virtual_queue is not None:
        quat_t = train_GetGyroAtTimeStamp(virtual_queue, time_stamp)
    if quat_t is None:
        # No virtual queue, or the timestamp is not covered by it.
        quat_t = GetGyroAtTimeStamp(real_queue, time_stamp)

    if quat_t_1 is None:
        return quat_t
    return QuaternionProduct(quat_t, QuaternionReciprocal(quat_t_1))


================================================
FILE: dvs/flownet2/LICENSE
================================================
Copyright 2017 NVIDIA CORPORATION

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

================================================
FILE: dvs/flownet2/README.md
================================================
# flownet2-pytorch 

Pytorch implementation of [FlowNet 2.0: Evolution of Optical Flow Estimation with Deep Networks](https://arxiv.org/abs/1612.01925). 

Multiple GPU training is supported, and the code provides examples for training or inference on [MPI-Sintel](http://sintel.is.tue.mpg.de/) clean and final datasets. The same commands can be used for training or inference with other datasets. See below for more detail.

Inference using fp16 (half-precision) is also supported.

For more help, type <br />
    
    python main.py --help

## Network architectures
Below are the different flownet neural network architectures that are provided. <br />
A batchnorm version for each network is also available.

 - **FlowNet2S**
 - **FlowNet2C**
 - **FlowNet2CS**
 - **FlowNet2CSS**
 - **FlowNet2SD**
 - **FlowNet2**

## Custom layers

`FlowNet2` or `FlowNet2C*` architectures rely on the custom layers `Resample2d` and `Correlation`. <br />
PyTorch implementations of these layers, with CUDA kernels, are available at [./networks](./networks). <br />
Note : Currently, half precision kernels are not available for these layers.

## Data Loaders

Dataloaders for FlyingChairs, FlyingThings, ChairsSDHom and ImagesFromFolder are available in [datasets.py](./datasets.py). <br />

## Loss Functions

L1 and L2 losses with multi-scale support are available in [losses.py](./losses.py). <br />

## Installation 

    # get flownet2-pytorch source
    git clone https://github.com/NVIDIA/flownet2-pytorch.git
    cd flownet2-pytorch

    # install custom layers
    bash install.sh
    
### Python requirements 
Currently, the code supports python 3
* numpy 
* PyTorch ( == 0.4.1, for <= 0.4.0 see branch [python36-PyTorch0.4](https://github.com/NVIDIA/flownet2-pytorch/tree/python36-PyTorch0.4))
* scipy 
* scikit-image
* tensorboardX
* colorama, tqdm, setproctitle 

## Converted Caffe Pre-trained Models
We've included caffe pre-trained models. Should you use these pre-trained weights, please adhere to the [license agreements](https://drive.google.com/file/d/1TVv0BnNFh3rpHZvD-easMb9jYrPE2Eqd/view?usp=sharing). 

* [FlowNet2](https://drive.google.com/file/d/1hF8vS6YeHkx3j2pfCeQqqZGwA_PJq_Da/view?usp=sharing)[620MB]
* [FlowNet2-C](https://drive.google.com/file/d/1BFT6b7KgKJC8rA59RmOVAXRM_S7aSfKE/view?usp=sharing)[149MB]
* [FlowNet2-CS](https://drive.google.com/file/d/1iBJ1_o7PloaINpa8m7u_7TsLCX0Dt_jS/view?usp=sharing)[297MB]
* [FlowNet2-CSS](https://drive.google.com/file/d/157zuzVf4YMN6ABAQgZc8rRmR5cgWzSu8/view?usp=sharing)[445MB]
* [FlowNet2-CSS-ft-sd](https://drive.google.com/file/d/1R5xafCIzJCXc8ia4TGfC65irmTNiMg6u/view?usp=sharing)[445MB]
* [FlowNet2-S](https://drive.google.com/file/d/1V61dZjFomwlynwlYklJHC-TLfdFom3Lg/view?usp=sharing)[148MB]
* [FlowNet2-SD](https://drive.google.com/file/d/1QW03eyYG_vD-dT-Mx4wopYvtPu_msTKn/view?usp=sharing)[173MB]
    
## Inference
    # Example on MPISintel Clean   
    python main.py --inference --model FlowNet2 --save_flow --inference_dataset MpiSintelClean \
    --inference_dataset_root /path/to/mpi-sintel/clean/dataset \
    --resume /path/to/checkpoints 
    
## Training and validation

    # Example on MPISintel Final and Clean, with L1Loss on FlowNet2 model
    python main.py --batch_size 8 --model FlowNet2 --loss=L1Loss --optimizer=Adam --optimizer_lr=1e-4 \
    --training_dataset MpiSintelFinal --training_dataset_root /path/to/mpi-sintel/final/dataset  \
    --validation_dataset MpiSintelClean --validation_dataset_root /path/to/mpi-sintel/clean/dataset

    # Example on MPISintel Final and Clean, with MultiScale loss on FlowNet2C model 
    python main.py --batch_size 8 --model FlowNet2C --optimizer=Adam --optimizer_lr=1e-4 --loss=MultiScale --loss_norm=L1 \
    --loss_numScales=5 --loss_startScale=4 --optimizer_lr=1e-4 --crop_size 384 512 \
    --training_dataset FlyingChairs --training_dataset_root /path/to/flying-chairs/dataset  \
    --validation_dataset MpiSintelClean --validation_dataset_root /path/to/mpi-sintel/clean/dataset
    
## Results on MPI-Sintel
[![Predicted flows on MPI-Sintel](./image.png)](https://www.youtube.com/watch?v=HtBmabY8aeU "Predicted flows on MPI-Sintel")

## Reference 
If you find this implementation useful in your work, please acknowledge it appropriately and cite the paper:
````
@InProceedings{IMKDB17,
  author       = "E. Ilg and N. Mayer and T. Saikia and M. Keuper and A. Dosovitskiy and T. Brox",
  title        = "FlowNet 2.0: Evolution of Optical Flow Estimation with Deep Networks",
  booktitle    = "IEEE Conference on Computer Vision and Pattern Recognition (CVPR)",
  month        = "Jul",
  year         = "2017",
  url          = "http://lmb.informatik.uni-freiburg.de//Publications/2017/IMKDB17"
}
````
```
@misc{flownet2-pytorch,
  author = {Fitsum Reda and Robert Pottorff and Jon Barker and Bryan Catanzaro},
  title = {flownet2-pytorch: Pytorch implementation of FlowNet 2.0: Evolution of Optical Flow Estimation with Deep Networks},
  year = {2017},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/NVIDIA/flownet2-pytorch}}
}
```
## Related Optical Flow Work from Nvidia 
Code (in Caffe and Pytorch): [PWC-Net](https://github.com/NVlabs/PWC-Net) <br />
Paper : [PWC-Net: CNNs for Optical Flow Using Pyramid, Warping, and Cost Volume](https://arxiv.org/abs/1709.02371). 

## Acknowledgments
Parts of this code were derived, as noted in the code, from [ClementPinard/FlowNetPytorch](https://github.com/ClementPinard/FlowNetPytorch).


================================================
FILE: dvs/flownet2/__init__.py
================================================
from .utils import flow_utils, tools

================================================
FILE: dvs/flownet2/convert.py
================================================
#!/usr/bin/env python2.7

import caffe
from caffe.proto import caffe_pb2
import sys, os

import torch
import torch.nn as nn

import argparse, tempfile
import numpy as np

# Convert Caffe FlowNet2 weights into a flownet2-pytorch checkpoint.
# Arguments: the Caffe weights, the prototxt template describing the network,
# and the flownet2-pytorch checkout that provides `models` and `utils`.
parser = argparse.ArgumentParser()
parser.add_argument('caffe_model', help='input model in hdf5 or caffemodel format')
parser.add_argument('prototxt_template',help='prototxt template')
parser.add_argument('flownet2_pytorch', help='path to flownet2-pytorch')

args = parser.parse_args()

# Extra attributes set on `args` before it is handed to the model constructors
# below (presumably read there -- confirm against models.py).
args.rgb_max = 255
args.fp16 = False
args.grads = {}

# load models
# These imports must stay below the sys.path change: `models` and `utils`
# live in the checkout given on the command line.
sys.path.append(args.flownet2_pytorch)

import models
from utils.param_utils import *

width = 256
height = 256
# Values substituted for the $NAME$ placeholders of the prototxt template.
keys = {'TARGET_WIDTH': width, 
        'TARGET_HEIGHT': height,
        'ADAPTED_WIDTH':width,
        'ADAPTED_HEIGHT':height,
        'SCALE_WIDTH':1.,
        'SCALE_HEIGHT':1.,}

# Read the template verbatim. The previous np.loadtxt(..., delimiter='\n')
# call is rejected by NumPy >= 1.23 (a newline is no longer a valid
# delimiter) and, on older versions, dropped blank lines and any text
# following a '#'.
with open(args.prototxt_template) as template_file:
    template = template_file.read()
for k in keys:
    template = template.replace('$%s$'%(k),str(keys[k]))

# The filled-in prototxt only has to exist while caffe.Net is constructed;
# leaving the `with` block closes and deletes the temporary file.
with tempfile.NamedTemporaryFile(mode='w', delete=True) as prototxt:
    prototxt.write(template)
    prototxt.flush()

    net = caffe.Net(prototxt.name, args.caffe_model, caffe.TEST)

# Copy every layer's weight and bias blobs into plain NumPy arrays.
weights = {}
biases = {}

for k, v in list(net.params.items()):
    weights[k] = np.array(v[0].data).reshape(v[0].data.shape)
    biases[k] = np.array(v[1].data).reshape(v[1].data.shape)
    print((k, weights[k].shape, biases[k].shape))


def _save_checkpoint(model, name):
    """Save `model` as <flownet2_pytorch>/<name>_checkpoint.pth.tar."""
    state = {'epoch': 0,
             'state_dict': model.state_dict(),
             'best_EPE': 1e10}
    torch.save(state, os.path.join(args.flownet2_pytorch, name + '_checkpoint.pth.tar'))


# The model variant is recognised from a "<variant>/" directory component in
# the path of the Caffe model; the trailing slash keeps e.g. "FlowNet2/" from
# matching "FlowNet2-C/".
if 'FlowNet2/' in args.caffe_model:
    model = models.FlowNet2(args)

    parse_flownetc(model.flownetc.modules(), weights, biases)
    parse_flownets(model.flownets_1.modules(), weights, biases, param_prefix='net2_')
    parse_flownets(model.flownets_2.modules(), weights, biases, param_prefix='net3_')
    parse_flownetsd(model.flownets_d.modules(), weights, biases, param_prefix='netsd_')
    parse_flownetfusion(model.flownetfusion.modules(), weights, biases, param_prefix='fuse_')

    _save_checkpoint(model, 'FlowNet2')

elif 'FlowNet2-C/' in args.caffe_model:
    model = models.FlowNet2C(args)

    parse_flownetc(model.modules(), weights, biases)

    _save_checkpoint(model, 'FlowNet2-C')

elif 'FlowNet2-CS/' in args.caffe_model:
    model = models.FlowNet2CS(args)

    parse_flownetc(model.flownetc.modules(), weights, biases)
    parse_flownets(model.flownets_1.modules(), weights, biases, param_prefix='net2_')

    _save_checkpoint(model, 'FlowNet2-CS')

elif 'FlowNet2-CSS/' in args.caffe_model:
    model = models.FlowNet2CSS(args)

    parse_flownetc(model.flownetc.modules(), weights, biases)
    parse_flownets(model.flownets_1.modules(), weights, biases, param_prefix='net2_')
    parse_flownets(model.flownets_2.modules(), weights, biases, param_prefix='net3_')

    _save_checkpoint(model, 'FlowNet2-CSS')

elif 'FlowNet2-CSS-ft-sd/' in args.caffe_model:
    # Same architecture as FlowNet2-CSS; only the weights differ.
    model = models.FlowNet2CSS(args)

    parse_flownetc(model.flownetc.modules(), weights, biases)
    parse_flownets(model.flownets_1.modules(), weights, biases, param_prefix='net2_')
    parse_flownets(model.flownets_2.modules(), weights, biases, param_prefix='net3_')

    _save_checkpoint(model, 'FlowNet2-CSS-ft-sd')

elif 'FlowNet2-S/' in args.caffe_model:
    model = models.FlowNet2S(args)

    parse_flownetsonly(model.modules(), weights, biases, param_prefix='')

    _save_checkpoint(model, 'FlowNet2-S')

elif 'FlowNet2-SD/' in args.caffe_model:
    model = models.FlowNet2SD(args)

    parse_flownetsd(model.modules(), weights, biases, param_prefix='')

    _save_checkpoint(model, 'FlowNet2-SD')

else:
    print(('model type cound not be determined from input caffe model %s'%(args.caffe_model)))
    quit()
print(("done converting ", args.caffe_model))

================================================
FILE: dvs/flownet2/datasets.py
================================================
import torch
import torch.utils.data as data

import os, math, random
from os.path import *
import numpy as np

from glob import glob
import utils.frame_utils as frame_utils

from imageio import imread

class StaticRandomCrop(object):
    """Crop window whose random position is chosen once, at construction.

    Every array passed to the same instance is therefore cropped at the same
    location, which keeps an image pair and its flow field aligned.
    """
    def __init__(self, image_size, crop_size):
        """`image_size` and `crop_size` are (height, width) pairs."""
        self.th, self.tw = crop_size
        full_h, full_w = image_size
        # Row offset is drawn before the column offset; randint is inclusive,
        # so the window can touch the bottom/right border.
        self.h1 = random.randint(0, full_h - self.th)
        self.w1 = random.randint(0, full_w - self.tw)

    def __call__(self, img):
        """Return the (th, tw) window of `img`, keeping all channels."""
        rows = slice(self.h1, self.h1 + self.th)
        cols = slice(self.w1, self.w1 + self.tw)
        return img[rows, cols, :]

class StaticCenterCrop(object):
    """Crop a fixed-size window from the centre of an image."""
    def __init__(self, image_size, crop_size):
        """`image_size` and `crop_size` are (height, width) pairs."""
        self.th, self.tw = crop_size
        self.h, self.w = image_size

    def __call__(self, img):
        """Return the centred (th, tw) window of `img`, keeping all channels."""
        top, bottom = (self.h - self.th) // 2, (self.h + self.th) // 2
        left, right = (self.w - self.tw) // 2, (self.w + self.tw) // 2
        return img[top:bottom, left:right, :]

class Padding(object):
    """Zero-pad a 3-channel image at the bottom and right to a target size."""
    def __init__(self, image_size, pad_size):
        """`image_size` and `pad_size` are (height, width) pairs."""
        self.th, self.tw = pad_size
        self.h, self.w = image_size

    def __call__(self, img):
        """Return a new (th, tw, 3) float array with `img` in the top-left corner."""
        canvas = np.zeros((self.th, self.tw, 3))
        source_rows, source_cols = slice(None, self.h), slice(None, self.w)
        canvas[source_rows, source_cols, :] = img
        return canvas

class MpiSintel(data.Dataset):
    """MPI-Sintel image pairs together with their ground-truth optical flow.

    Flow files are discovered under `<root>/flow/*/*.flo`; the two frames that
    belong to each flow file are looked up under `<root>/<dstype>/`.
    """
    def __init__(self, args, is_cropped = False, root = '', dstype = 'clean', replicates = 1):
        """Index every flow file that has both of its frames on disk.

        `args` supplies `crop_size` and `inference_size`; `args.inference_size`
        is written back at the end (and its list is modified in place when the
        render size has to be derived from the frame size).
        """
        self.args = args
        self.is_cropped = is_cropped
        self.crop_size = args.crop_size
        self.render_size = args.inference_size
        self.replicates = replicates

        flow_root = join(root, 'flow')
        image_root = join(root, dstype)

        self.flow_list = []
        self.image_list = []

        for flow_path in sorted(glob(join(flow_root, '*/*.flo'))):
            if 'test' in flow_path:
                continue

            # Path relative to flow_root; its last 8 characters are a 4-digit
            # frame number followed by the ".flo" extension.
            relative = flow_path[len(flow_root)+1:]
            prefix, frame_no = relative[:-8], int(relative[-8:-4])

            first = join(image_root, prefix + "%04d" % frame_no + '.png')
            second = join(image_root, prefix + "%04d" % (frame_no + 1) + '.png')

            # Skip samples with a missing frame or flow file.
            if not all(isfile(p) for p in (first, second, flow_path)):
                continue

            self.image_list.append([first, second])
            self.flow_list.append(flow_path)

        self.size = len(self.image_list)

        self.frame_size = frame_utils.read_gen(self.image_list[0][0]).shape

        # Fall back to the largest multiple of 64 that fits in the frame when
        # no render size was requested or the frame is not 64-aligned.
        frame_h, frame_w = self.frame_size[0], self.frame_size[1]
        unspecified = self.render_size[0] < 0 or self.render_size[1] < 0
        if unspecified or frame_h % 64 or frame_w % 64:
            self.render_size[0] = (frame_h // 64) * 64
            self.render_size[1] = (frame_w // 64) * 64

        args.inference_size = self.render_size

        assert (len(self.image_list) == len(self.flow_list))

    def __getitem__(self, index):
        """Return `([images], [flow])` for sample `index` (taken modulo size).

        `images` is a float32 tensor laid out (channel, frame, row, col) and
        `flow` a float32 tensor laid out (component, row, col).
        """
        index %= self.size

        first_path, second_path = self.image_list[index][0], self.image_list[index][1]
        img1 = frame_utils.read_gen(first_path)
        img2 = frame_utils.read_gen(second_path)
        flow = frame_utils.read_gen(self.flow_list[index])

        image_size = img1.shape[:2]
        # One cropper instance is shared so frames and flow stay aligned.
        if self.is_cropped:
            cropper = StaticRandomCrop(image_size, self.crop_size)
        else:
            cropper = StaticCenterCrop(image_size, self.render_size)
        cropped = [cropper(frame) for frame in (img1, img2)]
        flow = cropper(flow)

        image_block = np.array(cropped).transpose(3,0,1,2).astype(np.float32)
        flow_block = flow.transpose(2,0,1).astype(np.float32)

        return [torch.from_numpy(image_block)], [torch.from_numpy(flow_block)]

    def __len__(self):
        """Number of samples, counting every replicate."""
        return self.size * self.replicates

class MpiSintelClean(MpiSintel):
    """`MpiSintel` with `dstype` fixed to 'clean'."""
    def __init__(self, args, is_cropped = False, root = '', replicates = 1):
        super(MpiSintelClean, self).__init__(
            args,
            is_cropped = is_cropped,
            root = root,
            dstype = 'clean',
            replicates = replicates,
        )

class MpiSintelFinal(MpiSintel):
    """`MpiSintel` with `dstype` fixed to 'final'."""
    def __init__(self, args, is_cropped = False, root = '', replicates = 1):
        super(MpiSintelFinal, self).__init__(
            args,
            is_cropped = is_cropped,
            root = root,
            dstype = 'final',
            replicates = replicates,
        )

class FlyingChairs(data.Dataset):
    """FlyingChairs image pairs (`*.ppm`) with their flow files (`*.flo`).

    All files live directly in `root`; in sorted order, images 2*i and 2*i+1
    form the pair that belongs to flow file i.
    """
    def __init__(self, args, is_cropped, root = '/path/to/FlyingChairs_release/data', replicates = 1):
        """Index the dataset found in `root`.

        `args` supplies `crop_size` and `inference_size`; `args.inference_size`
        is written back at the end (and its list is modified in place when the
        render size has to be derived from the frame size).
        """
        self.args = args
        self.is_cropped = is_cropped
        self.crop_size = args.crop_size
        self.render_size = args.inference_size
        self.replicates = replicates

        images = sorted(glob(join(root, '*.ppm')))

        self.flow_list = sorted(glob(join(root, '*.flo')))

        assert (len(images)//2 == len(self.flow_list))

        self.image_list = [[images[2*i], images[2*i + 1]]
                           for i in range(len(self.flow_list))]

        assert len(self.image_list) == len(self.flow_list)

        self.size = len(self.image_list)

        self.frame_size = frame_utils.read_gen(self.image_list[0][0]).shape

        # Fall back to the largest multiple of 64 that fits in the frame when
        # no render size was requested or the frame is not 64-aligned.
        frame_h, frame_w = self.frame_size[0], self.frame_size[1]
        unspecified = self.render_size[0] < 0 or self.render_size[1] < 0
        if unspecified or frame_h % 64 or frame_w % 64:
            self.render_size[0] = (frame_h // 64) * 64
            self.render_size[1] = (frame_w // 64) * 64

        args.inference_size = self.render_size

    def __getitem__(self, index):
        """Return `([images], [flow])` for sample `index` (taken modulo size).

        `images` is a float32 tensor laid out (channel, frame, row, col) and
        `flow` a float32 tensor laid out (component, row, col).
        """
        index %= self.size

        first_path, second_path = self.image_list[index][0], self.image_list[index][1]
        img1 = frame_utils.read_gen(first_path)
        img2 = frame_utils.read_gen(second_path)
        flow = frame_utils.read_gen(self.flow_list[index])

        image_size = img1.shape[:2]
        # One cropper instance is shared so frames and flow stay aligned.
        if self.is_cropped:
            cropper = StaticRandomCrop(image_size, self.crop_size)
        else:
            cropper = StaticCenterCrop(image_size, self.render_size)
        cropped = [cropper(frame) for frame in (img1, img2)]
        flow = cropper(flow)

        image_block = np.array(cropped).transpose(3,0,1,2).astype(np.float32)
        flow_block = flow.transpose(2,0,1).astype(np.float32)

        return [torch.from_numpy(image_block)], [torch.from_numpy(flow_block)]

    def __len__(self):
        """Number of samples, counting every replicate."""
        return self.size * self.replicates

class FlyingThings(data.Dataset):
    """FlyingThings3D training frames with forward ("into_future") flow.

    Image sequences come from `<root>/<dstype>/TRAIN/*/*/{left,right}` and
    flow from `<root>/optical_flow_flo_format/TRAIN/*/*/into_future/{left,right}`;
    the two sorted directory lists are paired position by position.
    """
    def __init__(self, args, is_cropped, root = '/path/to/flyingthings3d', dstype = 'frames_cleanpass', replicates = 1):
        """Index the dataset found in `root`.

        `args` supplies `crop_size` and `inference_size`; `args.inference_size`
        is written back at the end (and its list is modified in place when the
        render size has to be derived from the frame size).
        """
        self.args = args
        self.is_cropped = is_cropped
        self.crop_size = args.crop_size
        self.render_size = args.inference_size
        self.replicates = replicates

        scene_dirs = sorted(glob(join(root, dstype, 'TRAIN/*/*')))
        image_dirs = sorted([join(d, 'left') for d in scene_dirs] + [join(d, 'right') for d in scene_dirs])

        flow_scene_dirs = sorted(glob(join(root, 'optical_flow_flo_format/TRAIN/*/*')))
        flow_dirs = sorted([join(d, 'into_future/left') for d in flow_scene_dirs] + [join(d, 'into_future/right') for d in flow_scene_dirs])

        assert (len(image_dirs) == len(flow_dirs))

        self.image_list = []
        self.flow_list = []

        for idir, fdir in zip(image_dirs, flow_dirs):
            images = sorted(glob(join(idir, '*.png')))
            flows = sorted(glob(join(fdir, '*.flo')))
            # Flow i maps frame i onto frame i + 1 of the same sequence.
            for i, flow_path in enumerate(flows):
                self.image_list.append([images[i], images[i+1]])
                self.flow_list.append(flow_path)

        assert len(self.image_list) == len(self.flow_list)

        self.size = len(self.image_list)

        self.frame_size = frame_utils.read_gen(self.image_list[0][0]).shape

        # Fall back to the largest multiple of 64 that fits in the frame when
        # no render size was requested or the frame is not 64-aligned.
        frame_h, frame_w = self.frame_size[0], self.frame_size[1]
        unspecified = self.render_size[0] < 0 or self.render_size[1] < 0
        if unspecified or frame_h % 64 or frame_w % 64:
            self.render_size[0] = (frame_h // 64) * 64
            self.render_size[1] = (frame_w // 64) * 64

        args.inference_size = self.render_size

    def __getitem__(self, index):
        """Return `([images], [flow])` for sample `index` (taken modulo size).

        `images` is a float32 tensor laid out (channel, frame, row, col) and
        `flow` a float32 tensor laid out (component, row, col).
        """
        index %= self.size

        first_path, second_path = self.image_list[index][0], self.image_list[index][1]
        img1 = frame_utils.read_gen(first_path)
        img2 = frame_utils.read_gen(second_path)
        flow = frame_utils.read_gen(self.flow_list[index])

        image_size = img1.shape[:2]
        # One cropper instance is shared so frames and flow stay aligned.
        if self.is_cropped:
            cropper = StaticRandomCrop(image_size, self.crop_size)
        else:
            cropper = StaticCenterCrop(image_size, self.render_size)
        cropped = [cropper(frame) for frame in (img1, img2)]
        flow = cropper(flow)

        image_block = np.array(cropped).transpose(3,0,1,2).astype(np.float32)
        flow_block = flow.transpose(2,0,1).astype(np.float32)

        return [torch.from_numpy(image_block)], [torch.from_numpy(flow_block)]

    def __len__(self):
        """Number of samples, counting every replicate."""
        return self.size * self.replicates

class FlyingThingsClean(FlyingThings):
    """`FlyingThings` with `dstype` fixed to 'frames_cleanpass'."""
    def __init__(self, args, is_cropped = False, root = '', replicates = 1):
        super(FlyingThingsClean, self).__init__(
            args,
            is_cropped = is_cropped,
            root = root,
            dstype = 'frames_cleanpass',
            replicates = replicates,
        )

class FlyingThingsFinal(FlyingThings):
    """`FlyingThings` with `dstype` fixed to 'frames_finalpass'."""
    def __init__(self, args, is_cropped = False, root = '', replicates = 1):
        super(FlyingThingsFinal, self).__init__(
            args,
            is_cropped = is_cropped,
            root = root,
            dstype = 'frames_finalpass',
            replicates = replicates,
        )

class ChairsSDHom(data.Dataset):
    """ChairsSDHom image pairs with their flow files.

    For a split `dstype`, first frames are read from `<root>/<dstype>/t0`,
    second frames from `t1` and flow from `flow`; the sorted file lists are
    paired position by position.
    """
    def __init__(self, args, is_cropped, root = '/path/to/chairssdhom/data', dstype = 'train', replicates = 1):
        """Index the dataset found in `root`.

        `args` supplies `crop_size` and `inference_size`; `args.inference_size`
        is written back at the end (and its list is modified in place when the
        render size has to be derived from the frame size).
        """
        self.args = args
        self.is_cropped = is_cropped
        self.crop_size = args.crop_size
        self.render_size = args.inference_size
        self.replicates = replicates

        first_frames = sorted(glob(join(root, dstype, 't0/*.png')))
        second_frames = sorted(glob(join(root, dstype, 't1/*.png')))
        self.flow_list = sorted(glob(join(root, dstype, 'flow/*.flo')))

        assert (len(first_frames) == len(self.flow_list))

        self.image_list = [[first_frames[i], second_frames[i]]
                           for i in range(len(self.flow_list))]

        assert len(self.image_list) == len(self.flow_list)

        self.size = len(self.image_list)

        self.frame_size = frame_utils.read_gen(self.image_list[0][0]).shape

        # Fall back to the largest multiple of 64 that fits in the frame when
        # no render size was requested or the frame is not 64-aligned.
        frame_h, frame_w = self.frame_size[0], self.frame_size[1]
        unspecified = self.render_size[0] < 0 or self.render_size[1] < 0
        if unspecified or frame_h % 64 or frame_w % 64:
            self.render_size[0] = (frame_h // 64) * 64
            self.render_size[1] = (frame_w // 64) * 64

        args.inference_size = self.render_size

    def __getitem__(self, index):
        """Return `([images], [flow])` for sample `index` (taken modulo size).

        `images` is a float32 tensor laid out (channel, frame, row, col) and
        `flow` a float32 tensor laid out (component, row, col).
        """
        index %= self.size

        first_path, second_path = self.image_list[index][0], self.image_list[index][1]
        img1 = frame_utils.read_gen(first_path)
        img2 = frame_utils.read_gen(second_path)

        # The flow field is reversed along its first (row) axis before
        # cropping; the images are left as read.
        flow = frame_utils.read_gen(self.flow_list[index])[::-1,:,:]

        image_size = img1.shape[:2]
        # One cropper instance is shared so frames and flow stay aligned.
        if self.is_cropped:
            cropper = StaticRandomCrop(image_size, self.crop_size)
        else:
            cropper = StaticCenterCrop(image_size, self.render_size)
        cropped = [cropper(frame) for frame in (img1, img2)]
        flow = cropper(flow)

        image_block = np.array(cropped).transpose(3,0,1,2).astype(np.float32)
        flow_block = flow.transpose(2,0,1).astype(np.float32)

        return [torch.from_numpy(image_block)], [torch.from_numpy(flow_block)]

    def __len__(self):
        """Number of samples, counting every replicate."""
        return self.size * self.replicates

class ChairsSDHomTrain(ChairsSDHom):
    """ChairsSDHom bound to the ``train`` sub-folder of ``root``."""

    def __init__(self, args, is_cropped = False, root = '', replicates = 1):
        # Everything except the split name is forwarded unchanged.
        parent_kwargs = dict(is_cropped=is_cropped, root=root, dstype='train', replicates=replicates)
        super(ChairsSDHomTrain, self).__init__(args, **parent_kwargs)

class ChairsSDHomTest(ChairsSDHom):
    """ChairsSDHom bound to the ``test`` sub-folder of ``root``."""

    def __init__(self, args, is_cropped = False, root = '', replicates = 1):
        # Everything except the split name is forwarded unchanged.
        parent_kwargs = dict(is_cropped=is_cropped, root=root, dstype='test', replicates=replicates)
        super(ChairsSDHomTest, self).__init__(args, **parent_kwargs)

class ImagesFromFolder(data.Dataset):
    """Consecutive image pairs taken from a flat folder of frames.

    The ``*.<iext>`` files directly under ``root`` are sorted by name and each
    file is paired with its successor. No ground-truth flow exists, so an
    all-zero flow tensor is returned as the target.
    """

    def __init__(self, args, is_cropped, root = '/path/to/frames/only/folder', iext = 'png', replicates = 1):
        """Index the frames and settle the render (inference) size.

        Args:
            args: namespace providing ``crop_size`` and ``inference_size``.
                ``args.inference_size`` is overwritten with the size that is
                actually used by this dataset.
            is_cropped: when true, samples are randomly cropped to
                ``crop_size``; otherwise they are center-cropped to the
                render size.
            root: folder containing the frames.
            iext: image file extension (without the dot).
            replicates: multiplier applied to the reported dataset length.

        Raises:
            FileNotFoundError: if fewer than two matching images are found.
        """
        self.args = args
        self.is_cropped = is_cropped
        self.crop_size = args.crop_size
        # Work on a copy: the size is adjusted in place below, and the list in
        # ``args`` may be shared with other datasets built from the same args.
        self.render_size = list(args.inference_size)
        self.replicates = replicates

        images = sorted(glob(join(root, '*.' + iext)))
        # Pair every frame with the one that follows it.
        self.image_list = [[im1, im2] for im1, im2 in zip(images, images[1:])]

        self.size = len(self.image_list)
        if not self.image_list:
            raise FileNotFoundError(
                'Need at least two *.{} images in {} to form a frame pair'.format(iext, root))

        self.frame_size = frame_utils.read_gen(self.image_list[0][0]).shape

        # Fall back to the largest multiple of 64 that fits inside the frame
        # when no size was requested or the frame is not a multiple of 64.
        if (self.render_size[0] < 0) or (self.render_size[1] < 0) or (self.frame_size[0]%64) or (self.frame_size[1]%64):
            self.render_size[0] = ( (self.frame_size[0])//64 ) * 64
            self.render_size[1] = ( (self.frame_size[1])//64 ) * 64

        # Publish the effective size; a copy keeps this instance decoupled
        # from later in-place edits of ``args.inference_size``.
        args.inference_size = list(self.render_size)

    def __getitem__(self, index):
        """Return ``([images], [zero_flow])`` for pair ``index`` (wrapped modulo the size)."""
        index = index % self.size

        img1 = frame_utils.read_gen(self.image_list[index][0])
        img2 = frame_utils.read_gen(self.image_list[index][1])

        images = [img1, img2]
        image_size = img1.shape[:2]
        if self.is_cropped:
            cropper = StaticRandomCrop(image_size, self.crop_size)
        else:
            cropper = StaticCenterCrop(image_size, self.render_size)
        images = list(map(cropper, images))

        # Stack both frames, then move the last (channel) axis to the front.
        images = np.array(images).transpose(3,0,1,2)
        images = torch.from_numpy(images.astype(np.float32))

        # Placeholder target: first dim of ``images``, 2 channels, same spatial size.
        return [images], [torch.zeros(images.size()[0:1] + (2,) + images.size()[-2:])]

    def __len__(self):
        """Number of pairs, counting each one ``replicates`` times."""
        return self.size * self.replicates


class Google(data.Dataset):
    """Consecutive frame pairs from the ``*.png`` files of one video folder.

    Frames are read from ``root/dstype``, sorted by name, and each frame is
    paired with its successor. Only the image pair is returned (no flow target).
    """

    def __init__(self, args, is_cropped = False, root = '', dstype = 'frames', replicates = 1):
        """Index the frames and settle the render (inference) size.

        Args:
            args: namespace providing ``crop_size`` and ``inference_size``.
                ``args.inference_size`` is overwritten with the size that is
                actually used by this dataset, so a dataset built afterwards
                from the same ``args`` starts from that value.
            is_cropped: when true, samples are randomly cropped to
                ``crop_size``; otherwise they are padded to the render size.
            root: video folder.
            dstype: sub-folder of ``root`` holding the frames.
            replicates: multiplier applied to the reported dataset length.

        Raises:
            FileNotFoundError: if no frame pair is found.
        """
        self.args = args
        self.is_cropped = is_cropped
        self.crop_size = args.crop_size
        # Work on a copy: the size is adjusted in place below, and several
        # datasets are typically built from one ``args`` (one per video).
        # Sharing the list would let a later dataset silently change the
        # render size of an earlier one.
        self.render_size = list(args.inference_size)
        self.replicates = replicates

        image_root = join(root, dstype)

        file_list = sorted(glob(join(image_root, '*.png')))

        # Pair every frame with its successor, skipping pairs where either
        # path is not a regular file.
        self.image_list = [
            [img1, img2]
            for img1, img2 in zip(file_list, file_list[1:])
            if isfile(img1) and isfile(img2)
        ]

        self.size = len(self.image_list)
        if not self.image_list:
            raise FileNotFoundError(
                'Need at least two *.png frames in {} to form a frame pair'.format(image_root))

        self.frame_size = frame_utils.read_gen(self.image_list[0][0]).shape

        # Unlike the crop-based datasets, round UP to the next multiple of 64:
        # frames are padded to this size rather than cropped.
        if (self.render_size[0] < 0) or (self.render_size[1] < 0) or (self.frame_size[0]%64) or (self.frame_size[1]%64):
            self.render_size[0] = ( math.ceil(self.frame_size[0]/64) ) * 64
            self.render_size[1] = ( math.ceil(self.frame_size[1]/64) ) * 64

        # Publish the effective size; a copy keeps this instance decoupled
        # from later in-place edits of ``args.inference_size``.
        args.inference_size = list(self.render_size)

    def __getitem__(self, index):
        """Return ``[images]`` for pair ``index`` (wrapped modulo the size)."""
        index = index % self.size

        img1 = frame_utils.read_gen(self.image_list[index][0])
        img2 = frame_utils.read_gen(self.image_list[index][1])

        images = [img1, img2]
        image_size = img1.shape[:2]

        if self.is_cropped:
            cropper = StaticRandomCrop(image_size, self.crop_size)
        else:
            cropper = Padding(image_size, self.render_size)
        images = list(map(cropper, images))

        # Stack both frames, then move the last (channel) axis to the front.
        images = np.array(images).transpose(3,0,1,2)

        images = torch.from_numpy(images.astype(np.float32))

        return [images]

    def __len__(self):
        """Number of pairs, counting each one ``replicates`` times."""
        return self.size * self.replicates

'''
import argparse
import sys, os
import importlib
from scipy.misc import imsave
import numpy as np

import datasets
reload(datasets)

parser = argparse.ArgumentParser()
args = parser.parse_args()
args.inference_size = [1080, 1920]
args.crop_size = [384, 512]
args.effective_batch_size = 1

index = 500
v_dataset = datasets.MpiSintelClean(args, True, root='../MPI-Sintel/flow/training')
a, b = v_dataset[index]
im1 = a[0].numpy()[:,0,:,:].transpose(1,2,0)
im2 = a[0].numpy()[:,1,:,:].transpose(1,2,0)
imsave('./img1.png', im1)
imsave('./img2.png', im2)
flow_utils.writeFlow('./flow.flo', b[0].numpy().transpose(1,2,0))

'''


================================================
FILE: dvs/flownet2/install.sh
================================================
#!/bin/bash
# Build and install the three extension packages under ./networks
# (correlation, resample2d, channelnorm) for the current user.
# All paths are relative, so run this script from the flownet2 directory.

# Every `cd` is checked: if it failed, the following `rm -rf` would delete
# `build`, `dist`, ... from whatever directory the script happens to be in.
cd ./networks/correlation_package || exit 1
rm -rf *_cuda.egg-info build dist __pycache__
python3 setup.py install --user

cd ../resample2d_package || exit 1
rm -rf *_cuda.egg-info build dist __pycache__
python3 setup.py install --user

cd ../channelnorm_package || exit 1
rm -rf *_cuda.egg-info build dist __pycache__
python3 setup.py install --user

cd .. || exit 1


================================================
FILE: dvs/flownet2/losses.py
================================================
'''
Portions of this code copyright 2017, Clement Pinard
'''

# freda (todo) : adversarial loss 

import torch
import torch.nn as nn
import math

def EPE(input_flow, target_flow):
    """Mean end-point error between two flow tensors.

    Takes the L2 norm of ``target_flow - input_flow`` along dim 1 and averages
    the result over all remaining elements.
    """
    flow_error = target_flow - input_flow
    per_element_norm = flow_error.norm(p=2, dim=1)
    return per_element_norm.mean()

class L1(nn.Module):
    """Mean absolute difference between ``output`` and ``target``."""

    def __init__(self):
        super(L1, self).__init__()

    def forward(self, output, target):
        residual = output - target
        return residual.abs().mean()

class L2(nn.Module):
    """Mean of the L2 norm (taken along dim 1) of ``output - target``."""

    def __init__(self):
        super(L2, self).__init__()

    def forward(self, output, target):
        residual = output - target
        return residual.norm(p=2, dim=1).mean()

class L1Loss(nn.Module):
    """L1 training loss that also reports the end-point error.

    ``forward`` returns ``[l1, epe]``, in the order given by ``loss_labels``.
    """

    def __init__(self, args):
        super(L1Loss, self).__init__()
        self.loss_labels = ['L1', 'EPE']
        self.loss = L1()
        self.args = args

    def forward(self, output, target):
        # List elements are evaluated left to right: loss first, then EPE.
        return [self.loss(output, target), EPE(output, target)]

class L2Loss(nn.Module):
    """L2 training loss that also reports the end-point error.

    ``forward`` returns ``[l2, epe]``, in the order given by ``loss_labels``.
    """

    def __init__(self, args):
        super(L2Loss, self).__init__()
        self.loss_labels = ['L2', 'EPE']
        self.loss = L2()
        self.args = args

    def forward(self, output, target):
        # List elements are evaluated left to right: loss first, then EPE.
        return [self.loss(output, target), EPE(output, target)]

class MultiScale(nn.Module):
    """Weighted multi-scale flow loss.

    When the network returns a sequence of flow predictions (one per scale),
    the target is scaled by ``div_flow``, average-pooled to each prediction's
    resolution and compared scale by scale; scale ``i`` is weighted by
    ``l_weight / 2**i``. When the network returns a single tensor, the plain
    loss and EPE against the unscaled target are returned instead.

    ``forward`` returns ``[loss, epe]``, in the order given by ``loss_labels``.
    """

    def __init__(self, args, startScale = 4, numScales = 5, l_weight= 0.32, norm= 'L1'):
        """
        Args:
            args: stored on the instance; not otherwise read here.
            startScale: pooling factor applied to the target at the first scale;
                it doubles for every following scale.
            numScales: number of scales for which weights and pools are built.
            l_weight: weight of the first scale; halved for every following scale.
            norm: ``'L1'`` selects the L1 loss, anything else selects L2.
        """
        super(MultiScale,self).__init__()

        self.startScale = startScale
        self.numScales = numScales
        self.loss_weights = torch.FloatTensor([(l_weight / 2 ** scale) for scale in range(self.numScales)])
        self.args = args
        self.l_type = norm
        self.div_flow = 0.05
        assert(len(self.loss_weights) == self.numScales)

        if self.l_type == 'L1':
            self.loss = L1()
        else:
            self.loss = L2()

        self.multiScales = [nn.AvgPool2d(self.startScale * (2**scale), self.startScale * (2**scale)) for scale in range(self.numScales)]
        # A plain list of labels, like L1Loss/L2Loss. (A stray trailing comma
        # used to turn this into a 1-tuple wrapping the list.)
        self.loss_labels = ['MultiScale-'+self.l_type, 'EPE']

    def forward(self, output, target):
        lossvalue = 0
        epevalue = 0

        if isinstance(output, (tuple, list)):
            # Multi-scale predictions: compare each against a pooled, scaled target.
            target = self.div_flow * target
            for i, output_ in enumerate(output):
                target_ = self.multiScales[i](target)
                epevalue += self.loss_weights[i]*EPE(output_, target_)
                lossvalue += self.loss_weights[i]*self.loss(output_, target_)
            return [lossvalue, epevalue]

        # Single full-resolution prediction.
        epevalue += EPE(output, target)
        lossvalue += self.loss(output, target)
        return [lossvalue, epevalue]


================================================
FILE: dvs/flownet2/main.py
================================================
#!/usr/bin/env python
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.autograd import Variable
from tensorboardX import SummaryWriter

import argparse, os, sys, subprocess
import colorama
import numpy as np
from tqdm import tqdm
from glob import glob
from os.path import *

import models, datasets
from utils import flow_utils, tools
import time

    # Reusable function for inference
def inference(args, epoch, data_path, data_loader, model, offset=0):
    """Estimate forward and backward flow for every batch of ``data_loader``.

    For each batch the model is run on the frame pair as given (forward flow)
    and on the pair with its two frames swapped (backward flow). Results are
    written as ``%06d.flo`` files, numbered by batch index, into
    ``<data_path>/flo`` and ``<data_path>/flo_back``. A direction whose output
    file already exists is skipped.

    Args:
        args: namespace; reads ``save_flow``, ``inference_visualize``,
            ``inference_n_batches``, ``cuda`` and, if present,
            ``render_validation``. ``inference_n_batches`` is replaced by
            ``np.inf`` when negative.
        epoch: unused; kept for interface compatibility.
        data_path: folder that receives the output sub-folders.
        data_loader: loader whose batches are sequences with the image tensor
            first and whose ``dataset`` exposes ``frame_size``.
        model: network called on the image tensor.
        offset: ``position`` of the tqdm progress bar.

    Returns:
        None.
    """
    model.eval()

    # ``render_validation`` is not necessarily defined on ``args``, so it is
    # read defensively instead of raising AttributeError.
    save_outputs = args.save_flow or getattr(args, 'render_validation', False)

    # The folders are always resolved because the per-batch paths below are
    # derived from them; they are only created when something is written.
    flow_folder = "{}/flo".format(data_path)
    flow_back_folder = "{}/flo_back".format(data_path)
    if save_outputs:
        os.makedirs(flow_folder, exist_ok=True)
        os.makedirs(flow_back_folder, exist_ok=True)

    # visualization folders
    flow_vis_folder = "{}/flo_vis".format(data_path)
    flow_back_vis_folder = "{}/flo_back_vis".format(data_path)
    if args.inference_visualize:
        os.makedirs(flow_vis_folder, exist_ok=True)
        os.makedirs(flow_back_vis_folder, exist_ok=True)

    args.inference_n_batches = np.inf if args.inference_n_batches < 0 else args.inference_n_batches

    progress = tqdm(data_loader, ncols=100, total=np.minimum(len(data_loader), args.inference_n_batches), desc='Inferencing ',
        leave=True, position=offset)

    frame_size = data_loader.dataset.frame_size

    # Iterating the tqdm object already advances the bar once per batch, so
    # no manual ``progress.update`` is issued inside the loop.
    for batch_idx, data in enumerate(progress):
        data = data[0]
        data_forward = data
        # Swap the two frames along dim 2 to obtain the backward pair.
        data_back = torch.cat((data[:,:,1:,:,:], data[:,:,:1,:,:]), dim = 2)
        if args.cuda:
            data_forward = data_forward.cuda(non_blocking=True)
            data_back = data_back.cuda(non_blocking=True)

        flo_path = join(flow_folder, '%06d.flo'%(batch_idx))
        flo_back_path = join(flow_back_folder, '%06d.flo'%(batch_idx))

        if not os.path.exists(flo_path):
            _infer_one_direction(args, model, data_forward, frame_size,
                                 flo_path, flow_vis_folder, save_outputs)

        if not os.path.exists(flo_back_path):
            _infer_one_direction(args, model, data_back, frame_size,
                                 flo_back_path, flow_back_vis_folder, save_outputs)

        if batch_idx == (args.inference_n_batches - 1):
            break
    progress.close()
    return


def _infer_one_direction(args, model, batch, frame_size, flo_path, vis_folder, save_outputs):
    """Run the model on one batch and optionally write/visualize its flow."""
    with torch.no_grad():
        # Slice the prediction back to the first two entries of ``frame_size``.
        output = model(batch)[:,:,:frame_size[0], :frame_size[1]]
    if save_outputs:
        # Only the first sample of the batch is written.
        _pflow = output[0].data.cpu().numpy().transpose(1, 2, 0)
        flow_utils.writeFlow(flo_path, _pflow)
        if args.inference_visualize:
            flow_utils.visulize_flow_file(flo_path, vis_folder)

# Script entry point: parse the command line, build one inference DataLoader
# per entry of the dataset root directory, build (and optionally load) the
# model, then run `inference` on every loader.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fp16', action='store_true', help='Run model in pseudo-fp16 mode (fp16 storage fp32 math).')
    parser.add_argument('--fp16_scale', type=float, default=1024., help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    parser.add_argument('--start_epoch', type=int, default=1)
    parser.add_argument('--batch_size', '-b', type=int, default=8, help="Batch size")
    parser.add_argument('--crop_size', type=int, nargs='+', default = [256, 256], help="Spatial dimension to crop training samples for training")
    parser.add_argument("--rgb_max", type=float, default = 255.)

    parser.add_argument('--number_workers', '-nw', '--num_workers', type=int, default=8)
    parser.add_argument('--number_gpus', '-ng', type=int, default=-1, help='number of GPUs to use')
    parser.add_argument('--no_cuda', action='store_true')

    parser.add_argument('--save', '-s', default='./Google', type=str, help='directory for saving')

    parser.add_argument('--inference', action='store_true')
    parser.add_argument('--inference_visualize', action='store_true',
                        help="visualize the optical flow during inference")
    parser.add_argument('--inference_size', type=int, nargs='+', default = [-1,-1], help='spatial size divisible by 64. default (-1,-1) - largest possible valid size would be used')
    parser.add_argument('--inference_batch_size', type=int, default=1)
    parser.add_argument('--inference_n_batches', type=int, default=-1)
    parser.add_argument('--save_flow', action='store_true', help='save predicted flows to file')

    parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)')
    parser.add_argument('--log_frequency', '--summ_iter', type=int, default=1, help="Log every n batches")

    # Register the model / dataset selection arguments via the project helper.
    tools.add_arguments_for_module(parser, models, argument_for_class='model', default='FlowNet2')
    
    tools.add_arguments_for_module(parser, datasets, argument_for_class='inference_dataset', default='Google', 
                                    skip_params=['is_cropped'],
                                    parameter_defaults={'root': './Google/train',
                                                        'replicates': 1})

    # Switch the working directory to the folder containing this file, so
    # relative paths used from here on are resolved against it.
    main_dir = os.path.dirname(os.path.realpath(__file__))
    os.chdir(main_dir)

    # Parse the official arguments
    with tools.TimerBlock("Parsing Arguments") as block:
        args = parser.parse_args()
        # A negative GPU count means "use every visible device".
        if args.number_gpus < 0 : args.number_gpus = torch.cuda.device_count()

        # Get argument defaults (hashtag #thisisahack): re-parse with only a
        # throwaway flag so that every other argument takes its default value.
        parser.add_argument('--IGNORE',  action='store_true')
        defaults = vars(parser.parse_args(['--IGNORE']))

        # Print all arguments, color the non-defaults
        for argument, value in sorted(vars(args).items()):
            reset = colorama.Style.RESET_ALL
            color = reset if value == defaults[argument] else colorama.Fore.MAGENTA
            block.log('{}{}: {}{}'.format(color, argument, value, reset))

        # Resolve the class objects named on the command line.
        args.model_class = tools.module_to_dict(models)[args.model]

        args.inference_dataset_class = tools.module_to_dict(datasets)[args.inference_dataset]

        args.cuda = not args.no_cuda and torch.cuda.is_available()
        # args.current_hash = subprocess.check_output(["git", "rev-parse", "HEAD"]).rstrip()
        args.log_file = join(args.save, 'args.txt')

        # dict to collect activation gradients (for training debug purpose)
        args.grads = {}

        # With the default start_epoch of 1 this yields a single pass per dataset.
        args.total_epochs = 1
        args.inference_dir = "{}/inference".format(args.save)

    print('Source Code')
    # print(('  Current Git Hash: {}\n'.format(args.current_hash)))

    # Dynamically load the dataset class with parameters passed in via "--argument_[param]=[value]" arguments
    with tools.TimerBlock("Initializing Datasets") as block:
        args.effective_batch_size = args.batch_size * args.number_gpus
        args.effective_inference_batch_size = args.inference_batch_size * args.number_gpus
        args.effective_number_workers = args.number_workers * args.number_gpus
        # Loader options are only set when CUDA is used; note drop_last=True.
        gpuargs = {'num_workers': args.effective_number_workers, 
                   'pin_memory': True, 
                   'drop_last' : True} if args.cuda else {}
        inf_gpuargs = gpuargs.copy()
        inf_gpuargs['num_workers'] = args.number_workers

        block.log('Inference Dataset: {}'.format(args.inference_dataset))

        # Every entry of the dataset root is treated as its own dataset.
        dataset_root = args.inference_dataset_root 
        data_name = sorted(os.listdir(dataset_root))

        block.log(data_name)
        inference_loaders = {}
        for i in range(len(data_name)):
            dataset_path = os.path.join(dataset_root, data_name[i])
            # The dataset kwargs are read back from `args`, so the root is
            # overwritten with the current entry before each construction.
            args.inference_dataset_root  = dataset_path
            inference_dataset = args.inference_dataset_class(args, False, **tools.kwargs_from_args(args, 'inference_dataset'))
            inference_loaders[dataset_path] = DataLoader(inference_dataset, batch_size=args.effective_inference_batch_size, shuffle=False, **inf_gpuargs)
            block.log('Inference Input: {}'.format(' '.join([str([d for d in x.size()]) for x in inference_dataset[0][0]])))

    # Dynamically load model and loss class with parameters passed in via "--model_[param]=[value]" or "--loss_[param]=[value]" arguments
    with tools.TimerBlock("Building {} model".format(args.model)) as block:
        # Thin wrapper that holds the selected network as `self.model`.
        class Model(nn.Module):
            def __init__(self, args):
                super(Model, self).__init__()
                kwargs = tools.kwargs_from_args(args, 'model')
                self.model = args.model_class(args, **kwargs)
                
            def forward(self, data):
                output = self.model(data)
                return output

        model = Model(args)

        block.log('Effective Batch Size: {}'.format(args.effective_batch_size))
        block.log('Number of parameters: {}'.format(sum([p.data.nelement() if p.requires_grad else 0 for p in model.parameters()])))

        if args.cuda and args.number_gpus > 0:
            block.log('Initializing CUDA')
            model = model.cuda()
            block.log('Parallelizing')
            model = nn.parallel.DataParallel(model, device_ids=list(range(args.number_gpus)))

        # Load weights if needed, otherwise randomly initialize
        if args.resume and os.path.isfile(args.resume):
            block.log("Loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            # NOTE(review): `model.module` is the attribute added by the
            # DataParallel wrapper above, which is only applied on the CUDA
            # path - confirm whether a CPU-only run is expected to work here.
            model.module.model.load_state_dict(checkpoint['state_dict'])
            block.log("Loaded checkpoint '{}' (at epoch {})".format(args.resume, checkpoint['epoch']))

        elif args.resume and args.inference:
            block.log("No checkpoint found at '{}'".format(args.resume))
            quit()

        else:
            block.log("Random initialization")

        block.log("Initializing save directory: {}".format(args.save))
        if not os.path.exists(args.save):
            os.makedirs(args.save)

    # Log all arguments to file
    # (`block` is still bound to the TimerBlock of the last `with` statement.)
    for argument, value in sorted(vars(args).items()):
        block.log2file(args.log_file, '{}: {}'.format(argument, value))

    # Run inference once per dataset; outputs are written below `data_path`.
    for data_path in inference_loaders:
        # Primary epoch loop
        progress = tqdm(list(range(args.start_epoch, args.total_epochs + 1)), miniters=1, ncols=100, desc='Overall Progress', leave=True, position=0)
        offset = 1

        for epoch in progress:
            # `inference` returns None; `stats` is kept only as a placeholder.
            stats = inference(args=args, epoch=epoch - 1, data_path = data_path, data_loader=inference_loaders[data_path], model=model, offset=offset)
            offset += 1
        print("\n")

================================================
FILE: dvs/flownet2/models.py
================================================
import torch
import torch.nn as nn
from torch.nn import init

import math
import numpy as np

try:
    from networks.resample2d_package.resample2d import Resample2d
    from networks.channelnorm_package.channelnorm import ChannelNorm

    from networks import FlowNetC
    from networks import FlowNetS
    from networks import FlowNetSD
    from networks import FlowNetFusion

    from networks.submodules import *
except:
    from .networks.resample2d_package.resample2d import Resample2d
    from .networks.channelnorm_package.channelnorm import ChannelNorm

    from .networks import FlowNetC
    from .networks import FlowNetS
    from .networks import FlowNetSD
    from .networks import FlowNetFusion

    from .networks.submodules import *
'Parameter count = 162,518,834'

class FlowNet2(nn.Module):
    """Full FlowNet2 stack.

    A FlowNetC followed by two FlowNetS refinement stages runs alongside a
    FlowNetSD branch; a FlowNetFusion network merges the two flow estimates.
    """

    def __init__(self, args, batchNorm=False, div_flow = 20.):
        """
        Args:
            args: namespace; reads ``rgb_max`` and ``fp16`` and is passed on to
                every sub-network.
            batchNorm: forwarded to every sub-network.
            div_flow: scale factor applied to / removed from intermediate flows.
        """
        super(FlowNet2,self).__init__()
        self.batchNorm = batchNorm
        self.div_flow = div_flow
        self.rgb_max = args.rgb_max
        self.args = args

        self.channelnorm = ChannelNorm()

        # First Block (FlowNetC)
        self.flownetc = FlowNetC.FlowNetC(args, batchNorm=self.batchNorm)
        self.upsample1 = nn.Upsample(scale_factor=4, mode='bilinear')
        self.resample1 = self._build_resample(args)

        # Block (FlowNetS1)
        self.flownets_1 = FlowNetS.FlowNetS(args, batchNorm=self.batchNorm)
        self.upsample2 = nn.Upsample(scale_factor=4, mode='bilinear')
        self.resample2 = self._build_resample(args)

        # Block (FlowNetS2)
        self.flownets_2 = FlowNetS.FlowNetS(args, batchNorm=self.batchNorm)

        # Block (FlowNetSD)
        self.flownets_d = FlowNetSD.FlowNetSD(args, batchNorm=self.batchNorm)
        self.upsample3 = nn.Upsample(scale_factor=4, mode='nearest')
        self.upsample4 = nn.Upsample(scale_factor=4, mode='nearest')

        self.resample3 = self._build_resample(args)
        self.resample4 = self._build_resample(args)

        # Block (FLowNetFusion)
        self.flownetfusion = FlowNetFusion.FlowNetFusion(args, batchNorm=self.batchNorm)

        # Same initialisation for both (mutually exclusive) layer types:
        # uniform bias first, then Xavier-uniform weight.
        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
                if m.bias is not None:
                    init.uniform_(m.bias)
                init.xavier_uniform_(m.weight)

    @staticmethod
    def _build_resample(args):
        """Return a Resample2d layer, wrapped in fp32/fp16 casts in fp16 mode."""
        if args.fp16:
            return nn.Sequential(
                            tofp32(),
                            Resample2d(),
                            tofp16())
        return Resample2d()

    def init_deconv_bilinear(self, weight):
        """Fill ``weight`` in place with a bilinear kernel on its channel diagonal.

        The tensor is zeroed, then ``weight[i, i]`` is set to the same
        ``(height, width)`` kernel for ``i < min(weight.size(0), weight.size(1))``.
        The kernel's scale and centre are derived from the width only.
        """
        f_shape = weight.size()
        heigh, width = f_shape[-2], f_shape[-1]
        f = np.ceil(width/2.0)
        c = (2 * f - 1 - f % 2) / (2.0 * f)
        bilinear = np.zeros([heigh, width])
        for x in range(width):
            for y in range(heigh):
                value = (1 - abs(x / f - c)) * (1 - abs(y / f - c))
                # Rows index height, columns index width. (The indices used to
                # be swapped, which raised IndexError for non-square kernels.)
                bilinear[y, x] = value
        min_dim = min(f_shape[0], f_shape[1])
        weight.data.fill_(0.)
        for i in range(min_dim):
            weight.data[i,i,:,:] = torch.from_numpy(bilinear)
        return

    def forward(self, inputs):
        """Return the fused flow for a batch of frame pairs.

        ``inputs`` is indexed as a 5-D tensor whose dim 2 selects the frame
        (0 or 1) and whose dim 1 holds the colour channels.
        """
        # Mean over both frames and all pixels, separately for each of the
        # first two dims, reshaped for broadcasting against ``inputs``.
        rgb_mean = inputs.contiguous().view(inputs.size()[:2]+(-1,)).mean(dim=-1).view(inputs.size()[:2] + (1,1,1,))

        x = (inputs - rgb_mean) / self.rgb_max
        x1 = x[:,:,0,:,:]
        x2 = x[:,:,1,:,:]
        # Both frames stacked along the channel dim: channels [:3] are img0,
        # channels [3:] are img1.
        x = torch.cat((x1,x2), dim = 1)

        # flownetc
        flownetc_flow2 = self.flownetc(x)[0]
        flownetc_flow = self.upsample1(flownetc_flow2*self.div_flow)

        # warp img1 to img0; magnitude of diff between img0 and warped_img1
        resampled_img1 = self.resample1(x[:,3:,:,:], flownetc_flow)
        diff_img0 = x[:,:3,:,:] - resampled_img1
        norm_diff_img0 = self.channelnorm(diff_img0)

        # concat img0, img1, img1->img0, flow, diff-mag
        concat1 = torch.cat((x, resampled_img1, flownetc_flow/self.div_flow, norm_diff_img0), dim=1)

        # flownets1
        flownets1_flow2 = self.flownets_1(concat1)[0]
        flownets1_flow = self.upsample2(flownets1_flow2*self.div_flow)

        # warp img1 to img0 using flownets1; magnitude of diff between img0 and warped_img1
        resampled_img1 = self.resample2(x[:,3:,:,:], flownets1_flow)
        diff_img0 = x[:,:3,:,:] - resampled_img1
        norm_diff_img0 = self.channelnorm(diff_img0)

        # concat img0, img1, img1->img0, flow, diff-mag
        concat2 = torch.cat((x, resampled_img1, flownets1_flow/self.div_flow, norm_diff_img0), dim=1)

        # flownets2
        flownets2_flow2 = self.flownets_2(concat2)[0]
        flownets2_flow = self.upsample4(flownets2_flow2 * self.div_flow)
        norm_flownets2_flow = self.channelnorm(flownets2_flow)

        diff_flownets2_flow = self.resample4(x[:,3:,:,:], flownets2_flow)
        diff_flownets2_img1 = self.channelnorm((x[:,:3,:,:]-diff_flownets2_flow))

        # flownetsd
        flownetsd_flow2 = self.flownets_d(x)[0]
        # NOTE(review): this branch divides by div_flow where the other
        # branches multiply - presumably what the released weights expect;
        # confirm before changing.
        flownetsd_flow = self.upsample3(flownetsd_flow2 / self.div_flow)
        norm_flownetsd_flow = self.channelnorm(flownetsd_flow)

        diff_flownetsd_flow = self.resample3(x[:,3:,:,:], flownetsd_flow)
        diff_flownetsd_img1 = self.channelnorm((x[:,:3,:,:]-diff_flownetsd_flow))

        # concat img0, flownetsd, flownets2, norm_flownetsd, norm_flownets2, diff_flownetsd_img1, diff_flownets2_img1
        concat3 = torch.cat((x[:,:3,:,:], flownetsd_flow, flownets2_flow, norm_flownetsd_flow, norm_flownets2_flow, diff_flownetsd_img1, diff_flownets2_img1), dim=1)
        flownetfusion_flow = self.flownetfusion(concat3)

        return flownetfusion_flow

class FlowNet2C(FlowNetC.FlowNetC):
    """Stand-alone FlowNetC that takes raw frame pairs.

    Adds input normalisation (mean subtraction, division by ``rgb_max``) in
    front of the FlowNetC layers and, outside training, upsamples the finest
    flow prediction after scaling it by ``div_flow``.
    """

    def __init__(self, args, batchNorm=False, div_flow=20):
        # Forward the caller's div_flow. (It used to be ignored in favour of a
        # hard-coded 20, which is still the default.)
        super(FlowNet2C,self).__init__(args, batchNorm=batchNorm, div_flow=div_flow)
        self.rgb_max = args.rgb_max

    def forward(self, inputs):
        """Return the five per-scale flows while training, else the upsampled flow.

        ``inputs`` is indexed as a 5-D tensor whose dim 2 selects the frame.
        """
        # Mean over both frames and all pixels, separately for each of the
        # first two dims, reshaped for broadcasting against ``inputs``.
        rgb_mean = inputs.contiguous().view(inputs.size()[:2]+(-1,)).mean(dim=-1).view(inputs.size()[:2] + (1,1,1,))

        x = (inputs - rgb_mean) / self.rgb_max
        x1 = x[:,:,0,:,:]
        x2 = x[:,:,1,:,:]

        # FlownetC top input stream
        out_conv1a = self.conv1(x1)
        out_conv2a = self.conv2(out_conv1a)
        out_conv3a = self.conv3(out_conv2a)

        # FlownetC bottom input stream (same layers, second frame)
        out_conv1b = self.conv1(x2)

        out_conv2b = self.conv2(out_conv1b)
        out_conv3b = self.conv3(out_conv2b)

        # Merge streams
        out_corr = self.corr(out_conv3a, out_conv3b) # False
        out_corr = self.corr_activation(out_corr)

        # Redirect top input stream and concatenate
        out_conv_redir = self.conv_redir(out_conv3a)

        in_conv3_1 = torch.cat((out_conv_redir, out_corr), 1)

        # Merged conv layers
        out_conv3_1 = self.conv3_1(in_conv3_1)

        out_conv4 = self.conv4_1(self.conv4(out_conv3_1))

        out_conv5 = self.conv5_1(self.conv5(out_conv4))
        out_conv6 = self.conv6_1(self.conv6(out_conv5))

        # Decoder: at each level predict a flow, upsample it, and concatenate
        # it with the encoder features and the deconvolved features.
        flow6       = self.predict_flow6(out_conv6)
        flow6_up    = self.upsampled_flow6_to_5(flow6)
        out_deconv5 = self.deconv5(out_conv6)

        concat5 = torch.cat((out_conv5,out_deconv5,flow6_up),1)

        flow5       = self.predict_flow5(concat5)
        flow5_up    = self.upsampled_flow5_to_4(flow5)
        out_deconv4 = self.deconv4(concat5)
        concat4 = torch.cat((out_conv4,out_deconv4,flow5_up),1)

        flow4       = self.predict_flow4(concat4)
        flow4_up    = self.upsampled_flow4_to_3(flow4)
        out_deconv3 = self.deconv3(concat4)
        concat3 = torch.cat((out_conv3_1,out_deconv3,flow4_up),1)

        flow3       = self.predict_flow3(concat3)
        flow3_up    = self.upsampled_flow3_to_2(flow3)
        out_deconv2 = self.deconv2(concat3)
        concat2 = torch.cat((out_conv2a,out_deconv2,flow3_up),1)

        flow2 = self.predict_flow2(concat2)

        if self.training:
            return flow2,flow3,flow4,flow5,flow6
        else:
            return self.upsample1(flow2*self.div_flow)

class FlowNet2S(FlowNetS.FlowNetS):
    """Stand-alone FlowNetS that takes raw frame pairs.

    Normalises the input (mean subtraction, division by ``rgb_max``), stacks
    both frames along the channel dim and runs the FlowNetS layers. Outside
    training the finest flow is scaled by ``div_flow`` and upsampled.
    """

    def __init__(self, args, batchNorm=False, div_flow=20):
        super(FlowNet2S, self).__init__(args, input_channels=6, batchNorm=batchNorm)
        self.div_flow = div_flow
        self.rgb_max = args.rgb_max

    def forward(self, inputs):
        """Return the five per-scale flows while training, else the upsampled flow."""
        lead_dims = inputs.size()[:2]
        # Mean over everything but the first two dims, kept broadcastable.
        flattened = inputs.contiguous().view(lead_dims + (-1,))
        rgb_mean = flattened.mean(dim=-1).view(lead_dims + (1, 1, 1,))
        normalized = (inputs - rgb_mean) / self.rgb_max
        frame0 = normalized[:, :, 0, :, :]
        frame1 = normalized[:, :, 1, :, :]
        stacked = torch.cat((frame0, frame1), dim=1)

        # Encoder.
        enc1 = self.conv1(stacked)
        enc2 = self.conv2(enc1)
        enc3 = self.conv3_1(self.conv3(enc2))
        enc4 = self.conv4_1(self.conv4(enc3))
        enc5 = self.conv5_1(self.conv5(enc4))
        enc6 = self.conv6_1(self.conv6(enc5))

        # Decoder level 6 -> 5.
        flow6 = self.predict_flow6(enc6)
        up_flow6 = self.upsampled_flow6_to_5(flow6)
        dec5 = self.deconv5(enc6)
        merged5 = torch.cat((enc5, dec5, up_flow6), 1)

        # Decoder level 5 -> 4.
        flow5 = self.predict_flow5(merged5)
        up_flow5 = self.upsampled_flow5_to_4(flow5)
        dec4 = self.deconv4(merged5)
        merged4 = torch.cat((enc4, dec4, up_flow5), 1)

        # Decoder level 4 -> 3.
        flow4 = self.predict_flow4(merged4)
        up_flow4 = self.upsampled_flow4_to_3(flow4)
        dec3 = self.deconv3(merged4)
        merged3 = torch.cat((enc3, dec3, up_flow4), 1)

        # Decoder level 3 -> 2.
        flow3 = self.predict_flow3(merged3)
        up_flow3 = self.upsampled_flow3_to_2(flow3)
        dec2 = self.deconv2(merged3)
        merged2 = torch.cat((enc2, dec2, up_flow3), 1)

        flow2 = self.predict_flow2(merged2)

        if not self.training:
            return self.upsample1(flow2 * self.div_flow)
        return flow2, flow3, flow4, flow5, flow6

class FlowNet2SD(FlowNetSD.FlowNetSD):
    """FlowNetSD with built-in input normalisation and upsampled inference output.

    All layers come from ``FlowNetSD.FlowNetSD``; this subclass only adds the
    pre- and post-processing around the inherited network.
    """

    def __init__(self, args, batchNorm=False, div_flow=20):
        super(FlowNet2SD, self).__init__(args, batchNorm=batchNorm)
        self.rgb_max = args.rgb_max
        self.div_flow = div_flow

    def forward(self, inputs):
        """Normalise a frame pair and run the inherited FlowNetSD network.

        ``inputs`` is indexed as a 5-D tensor whose dim 2 holds the two frames.

        Returns:
            In training mode the tuple ``(flow2, flow3, flow4, flow5, flow6)``;
            otherwise ``flow2`` scaled by ``div_flow`` and passed through
            ``upsample1``.
        """
        lead_dims = inputs.size()[:2]
        # One mean per (dim 0, dim 1) entry, taken over every remaining element.
        flat_mean = inputs.contiguous().view(lead_dims + (-1,)).mean(dim=-1)
        centered = (inputs - flat_mean.view(lead_dims + (1, 1, 1,))) / self.rgb_max
        # Stack the two frames along the channel axis.
        stacked = torch.cat((centered[:, :, 0, :, :], centered[:, :, 1, :, :]), dim=1)

        # The inherited forward runs the same layer sequence and yields the
        # full flow pyramid when training and the 1-tuple (flow2,) otherwise.
        flows = super(FlowNet2SD, self).forward(stacked)
        if self.training:
            return flows
        return self.upsample1(flows[0] * self.div_flow)

class FlowNet2CS(nn.Module):
    """Two-stage stack: a FlowNetC estimate refined by one FlowNetS.

    The FlowNetC flow is used to warp the second image onto the first; the
    warped image, the flow and the per-pixel warp error are appended to the
    image pair and fed to the FlowNetS stage.
    """

    def __init__(self, args, batchNorm=False, div_flow = 20.):
        super(FlowNet2CS,self).__init__()
        self.batchNorm = batchNorm
        self.div_flow = div_flow
        self.rgb_max = args.rgb_max
        self.args = args

        self.channelnorm = ChannelNorm()

        # First Block (FlowNetC)
        self.flownetc = FlowNetC.FlowNetC(args, batchNorm=self.batchNorm)
        self.upsample1 = nn.Upsample(scale_factor=4, mode='bilinear')

        # NOTE(review): forward() calls resample1 with two tensors, while
        # nn.Sequential.forward accepts a single input -- confirm the fp16
        # wrapper path actually works before enabling args.fp16.
        if args.fp16:
            self.resample1 = nn.Sequential(
                            tofp32(),
                            Resample2d(),
                            tofp16())
        else:
            self.resample1 = Resample2d()

        # Block (FlowNetS1)
        self.flownets_1 = FlowNetS.FlowNetS(args, batchNorm=self.batchNorm)
        self.upsample2 = nn.Upsample(scale_factor=4, mode='bilinear')

        # Re-initialise every (transposed) convolution. The in-place
        # initialisers (trailing underscore) replace the deprecated
        # init.uniform / init.xavier_uniform aliases and match the
        # sub-network modules.
        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
                if m.bias is not None:
                    init.uniform_(m.bias)
                init.xavier_uniform_(m.weight)

    def forward(self, inputs):
        """Estimate flow for a frame pair.

        ``inputs`` is indexed as a 5-D tensor whose dim 2 holds the two frames.

        Returns:
            The FlowNetS-stage flow, scaled by ``div_flow`` and passed through
            ``upsample2`` (same output in training and evaluation mode).
        """
        # One mean per (dim 0, dim 1) entry, taken over every remaining element.
        rgb_mean = inputs.contiguous().view(inputs.size()[:2]+(-1,)).mean(dim=-1).view(inputs.size()[:2] + (1,1,1,))

        x = (inputs - rgb_mean) / self.rgb_max
        x1 = x[:,:,0,:,:]
        x2 = x[:,:,1,:,:]
        x = torch.cat((x1,x2), dim = 1)

        # flownetc: element 0 of the returned tuple is flow2
        flownetc_flow2 = self.flownetc(x)[0]
        flownetc_flow = self.upsample1(flownetc_flow2*self.div_flow)

        # warp img1 to img0; magnitude of diff between img0 and warped_img1
        resampled_img1 = self.resample1(x[:,3:,:,:], flownetc_flow)
        diff_img0 = x[:,:3,:,:] - resampled_img1
        norm_diff_img0 = self.channelnorm(diff_img0)

        # concat img0, img1, img1->img0, flow, diff-mag
        concat1 = torch.cat((x, resampled_img1, flownetc_flow/self.div_flow, norm_diff_img0), dim=1)

        # flownets1
        flownets1_flow2 = self.flownets_1(concat1)[0]
        flownets1_flow = self.upsample2(flownets1_flow2*self.div_flow)

        return flownets1_flow

class FlowNet2CSS(nn.Module):
    """Three-stage stack: a FlowNetC estimate refined by two FlowNetS stages.

    After each stage the current flow is used to warp the second image onto
    the first; the warped image, the flow and the per-pixel warp error are
    appended to the image pair and fed to the next stage.
    """

    def __init__(self, args, batchNorm=False, div_flow = 20.):
        super(FlowNet2CSS,self).__init__()
        self.batchNorm = batchNorm
        self.div_flow = div_flow
        self.rgb_max = args.rgb_max
        self.args = args

        self.channelnorm = ChannelNorm()

        # First Block (FlowNetC)
        self.flownetc = FlowNetC.FlowNetC(args, batchNorm=self.batchNorm)
        self.upsample1 = nn.Upsample(scale_factor=4, mode='bilinear')

        # NOTE(review): forward() calls resample1/resample2 with two tensors,
        # while nn.Sequential.forward accepts a single input -- confirm the
        # fp16 wrapper path actually works before enabling args.fp16.
        if args.fp16:
            self.resample1 = nn.Sequential(
                            tofp32(),
                            Resample2d(),
                            tofp16())
        else:
            self.resample1 = Resample2d()

        # Block (FlowNetS1)
        self.flownets_1 = FlowNetS.FlowNetS(args, batchNorm=self.batchNorm)
        self.upsample2 = nn.Upsample(scale_factor=4, mode='bilinear')
        if args.fp16:
            self.resample2 = nn.Sequential(
                            tofp32(),
                            Resample2d(),
                            tofp16())
        else:
            self.resample2 = Resample2d()

        # Block (FlowNetS2)
        self.flownets_2 = FlowNetS.FlowNetS(args, batchNorm=self.batchNorm)
        self.upsample3 = nn.Upsample(scale_factor=4, mode='nearest')

        # Re-initialise every (transposed) convolution. The in-place
        # initialisers (trailing underscore) replace the deprecated
        # init.uniform / init.xavier_uniform aliases and match the
        # sub-network modules.
        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
                if m.bias is not None:
                    init.uniform_(m.bias)
                init.xavier_uniform_(m.weight)

    def forward(self, inputs):
        """Estimate flow for a frame pair.

        ``inputs`` is indexed as a 5-D tensor whose dim 2 holds the two frames.

        Returns:
            The second FlowNetS-stage flow, scaled by ``div_flow`` and passed
            through ``upsample3`` (same output in training and evaluation
            mode).
        """
        # One mean per (dim 0, dim 1) entry, taken over every remaining element.
        rgb_mean = inputs.contiguous().view(inputs.size()[:2]+(-1,)).mean(dim=-1).view(inputs.size()[:2] + (1,1,1,))

        x = (inputs - rgb_mean) / self.rgb_max
        x1 = x[:,:,0,:,:]
        x2 = x[:,:,1,:,:]
        x = torch.cat((x1,x2), dim = 1)

        # flownetc: element 0 of the returned tuple is flow2
        flownetc_flow2 = self.flownetc(x)[0]
        flownetc_flow = self.upsample1(flownetc_flow2*self.div_flow)

        # warp img1 to img0; magnitude of diff between img0 and warped_img1
        resampled_img1 = self.resample1(x[:,3:,:,:], flownetc_flow)
        diff_img0 = x[:,:3,:,:] - resampled_img1
        norm_diff_img0 = self.channelnorm(diff_img0)

        # concat img0, img1, img1->img0, flow, diff-mag
        concat1 = torch.cat((x, resampled_img1, flownetc_flow/self.div_flow, norm_diff_img0), dim=1)

        # flownets1
        flownets1_flow2 = self.flownets_1(concat1)[0]
        flownets1_flow = self.upsample2(flownets1_flow2*self.div_flow)

        # warp img1 to img0 using flownets1; magnitude of diff between img0 and warped_img1
        resampled_img1 = self.resample2(x[:,3:,:,:], flownets1_flow)
        diff_img0 = x[:,:3,:,:] - resampled_img1
        norm_diff_img0 = self.channelnorm(diff_img0)

        # concat img0, img1, img1->img0, flow, diff-mag
        concat2 = torch.cat((x, resampled_img1, flownets1_flow/self.div_flow, norm_diff_img0), dim=1)

        # flownets2
        flownets2_flow2 = self.flownets_2(concat2)[0]
        flownets2_flow = self.upsample3(flownets2_flow2 * self.div_flow)

        return flownets2_flow


================================================
FILE: dvs/flownet2/networks/FlowNetC.py
================================================
import torch
import torch.nn as nn
from torch.nn import init

import math
import numpy as np

from .correlation_package.correlation import Correlation

from .submodules import *
'Parameter count , 39,175,298 '

class FlowNetC(nn.Module):
    """Correlation-based flow network.

    Both images go through the same three convolutions, the resulting feature
    maps are merged by a correlation layer, and an encoder/decoder with skip
    connections predicts flow at five scales (flow6 ... flow2).
    """

    def __init__(self,args, batchNorm=True, div_flow = 20):
        super(FlowNetC, self).__init__()

        self.batchNorm = batchNorm
        self.div_flow = div_flow
        bn = self.batchNorm

        # Feature extractor shared by the two input images.
        self.conv1 = conv(bn, 3, 64, kernel_size=7, stride=2)
        self.conv2 = conv(bn, 64, 128, kernel_size=5, stride=2)
        self.conv3 = conv(bn, 128, 256, kernel_size=5, stride=2)
        self.conv_redir = conv(bn, 256, 32, kernel_size=1, stride=1)

        corr_kwargs = dict(pad_size=20, kernel_size=1, max_displacement=20,
                           stride1=1, stride2=2, corr_multiply=1)
        if args.fp16:
            self.corr = nn.Sequential(tofp32(), Correlation(**corr_kwargs), tofp16())
        else:
            self.corr = Correlation(**corr_kwargs)
        self.corr_activation = nn.LeakyReLU(0.1, inplace=True)

        # Encoder on the merged features; conv3_1 consumes the 32 redirected
        # channels concatenated with the correlation output (473 in total).
        self.conv3_1 = conv(bn, 473, 256)
        self.conv4 = conv(bn, 256, 512, stride=2)
        self.conv4_1 = conv(bn, 512, 512)
        self.conv5 = conv(bn, 512, 512, stride=2)
        self.conv5_1 = conv(bn, 512, 512)
        self.conv6 = conv(bn, 512, 1024, stride=2)
        self.conv6_1 = conv(bn, 1024, 1024)

        # Decoder: inputs are skip features + deconv output + 2 flow channels.
        self.deconv5 = deconv(1024, 512)
        self.deconv4 = deconv(1026, 256)
        self.deconv3 = deconv(770, 128)
        self.deconv2 = deconv(386, 64)

        self.predict_flow6 = predict_flow(1024)
        self.predict_flow5 = predict_flow(1026)
        self.predict_flow4 = predict_flow(770)
        self.predict_flow3 = predict_flow(386)
        self.predict_flow2 = predict_flow(194)

        self.upsampled_flow6_to_5 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=True)
        self.upsampled_flow5_to_4 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=True)
        self.upsampled_flow4_to_3 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=True)
        self.upsampled_flow3_to_2 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=True)

        # Same initialisation for plain and transposed convolutions.
        for layer in self.modules():
            if not isinstance(layer, (nn.Conv2d, nn.ConvTranspose2d)):
                continue
            if layer.bias is not None:
                init.uniform_(layer.bias)
            init.xavier_uniform_(layer.weight)
        self.upsample1 = nn.Upsample(scale_factor=4, mode='bilinear')

    def forward(self, x):
        """Predict flow from two images stacked along the channel axis.

        Channels 0-2 of ``x`` are treated as the first image and the remaining
        channels as the second.

        Returns:
            ``(flow2, flow3, flow4, flow5, flow6)`` in training mode, otherwise
            the 1-tuple ``(flow2,)``.
        """
        first_img = x[:, :3]
        second_img = x[:, 3:]

        # Top input stream
        feat1_a = self.conv1(first_img)
        feat2_a = self.conv2(feat1_a)
        feat3_a = self.conv3(feat2_a)

        # Bottom input stream (same weights)
        feat3_b = self.conv3(self.conv2(self.conv1(second_img)))

        # Merge the streams through the correlation layer
        correlation = self.corr_activation(self.corr(feat3_a, feat3_b))

        # Redirect top-stream features and concatenate with the correlation
        redirected = self.conv_redir(feat3_a)
        merged = torch.cat((redirected, correlation), 1)

        # Encoder on the merged features
        enc3 = self.conv3_1(merged)
        enc4 = self.conv4_1(self.conv4(enc3))
        enc5 = self.conv5_1(self.conv5(enc4))
        enc6 = self.conv6_1(self.conv6(enc5))

        # Decoder: predict flow, upsample it, and fuse with skip features
        flow6 = self.predict_flow6(enc6)
        cat5 = torch.cat((enc5, self.deconv5(enc6), self.upsampled_flow6_to_5(flow6)), 1)

        flow5 = self.predict_flow5(cat5)
        cat4 = torch.cat((enc4, self.deconv4(cat5), self.upsampled_flow5_to_4(flow5)), 1)

        flow4 = self.predict_flow4(cat4)
        cat3 = torch.cat((enc3, self.deconv3(cat4), self.upsampled_flow4_to_3(flow4)), 1)

        flow3 = self.predict_flow3(cat3)
        cat2 = torch.cat((feat2_a, self.deconv2(cat3), self.upsampled_flow3_to_2(flow3)), 1)

        flow2 = self.predict_flow2(cat2)

        if not self.training:
            return flow2,
        return flow2, flow3, flow4, flow5, flow6


================================================
FILE: dvs/flownet2/networks/FlowNetFusion.py
================================================
import torch
import torch.nn as nn
from torch.nn import init

import math
import numpy as np

from .submodules import *
'Parameter count = 581,226'

class FlowNetFusion(nn.Module):
    """Small encoder/decoder that maps an 11-channel input to a 2-channel flow.

    Two stride-2 convolution stages are followed by two deconvolution stages
    with skip connections; the flow predicted by the last stage is returned.
    """

    def __init__(self,args, batchNorm=True):
        super(FlowNetFusion, self).__init__()

        self.batchNorm = batchNorm
        bn = self.batchNorm

        # Encoder
        self.conv0 = conv(bn, 11, 64)
        self.conv1 = conv(bn, 64, 64, stride=2)
        self.conv1_1 = conv(bn, 64, 128)
        self.conv2 = conv(bn, 128, 128, stride=2)
        self.conv2_1 = conv(bn, 128, 128)

        # Decoder; 162 = 128 (skip) + 32 (deconv) + 2 (flow)
        self.deconv1 = deconv(128, 32)
        self.deconv0 = deconv(162, 16)

        # 82 = 64 (skip) + 16 (deconv) + 2 (flow)
        self.inter_conv1 = i_conv(bn, 162, 32)
        self.inter_conv0 = i_conv(bn, 82, 16)

        self.predict_flow2 = predict_flow(128)
        self.predict_flow1 = predict_flow(32)
        self.predict_flow0 = predict_flow(16)

        self.upsampled_flow2_to_1 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1)
        self.upsampled_flow1_to_0 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1)

        # Same initialisation for plain and transposed convolutions.
        for layer in self.modules():
            if not isinstance(layer, (nn.Conv2d, nn.ConvTranspose2d)):
                continue
            if layer.bias is not None:
                init.uniform_(layer.bias)
            init.xavier_uniform_(layer.weight)

    def forward(self, x):
        """Return the flow predicted at the last decoder stage (``flow0``)."""
        enc0 = self.conv0(x)
        enc1 = self.conv1_1(self.conv1(enc0))
        enc2 = self.conv2_1(self.conv2(enc1))

        flow2 = self.predict_flow2(enc2)
        cat1 = torch.cat((enc1, self.deconv1(enc2), self.upsampled_flow2_to_1(flow2)), 1)

        flow1 = self.predict_flow1(self.inter_conv1(cat1))
        cat0 = torch.cat((enc0, self.deconv0(cat1), self.upsampled_flow1_to_0(flow1)), 1)

        return self.predict_flow0(self.inter_conv0(cat0))


================================================
FILE: dvs/flownet2/networks/FlowNetS.py
================================================
'''
Portions of this code copyright 2017, Clement Pinard
'''

import torch
import torch.nn as nn
from torch.nn import init

import math
import numpy as np

from .submodules import *
'Parameter count : 38,676,504 '

class FlowNetS(nn.Module):
    """Plain encoder/decoder flow network.

    A six-level convolutional encoder is followed by a decoder that predicts
    flow at five scales (flow6 ... flow2), fusing each upsampled flow with the
    matching encoder features.
    """

    def __init__(self, args, input_channels = 12, batchNorm=True):
        super(FlowNetS, self).__init__()

        self.batchNorm = batchNorm
        bn = self.batchNorm

        # Encoder
        self.conv1 = conv(bn, input_channels, 64, kernel_size=7, stride=2)
        self.conv2 = conv(bn, 64, 128, kernel_size=5, stride=2)
        self.conv3 = conv(bn, 128, 256, kernel_size=5, stride=2)
        self.conv3_1 = conv(bn, 256, 256)
        self.conv4 = conv(bn, 256, 512, stride=2)
        self.conv4_1 = conv(bn, 512, 512)
        self.conv5 = conv(bn, 512, 512, stride=2)
        self.conv5_1 = conv(bn, 512, 512)
        self.conv6 = conv(bn, 512, 1024, stride=2)
        self.conv6_1 = conv(bn, 1024, 1024)

        # Decoder: inputs are skip features + deconv output + 2 flow channels
        # (1026 = 512 + 512 + 2, 770 = 512 + 256 + 2, 386 = 256 + 128 + 2).
        self.deconv5 = deconv(1024, 512)
        self.deconv4 = deconv(1026, 256)
        self.deconv3 = deconv(770, 128)
        self.deconv2 = deconv(386, 64)

        # 194 = 128 + 64 + 2
        self.predict_flow6 = predict_flow(1024)
        self.predict_flow5 = predict_flow(1026)
        self.predict_flow4 = predict_flow(770)
        self.predict_flow3 = predict_flow(386)
        self.predict_flow2 = predict_flow(194)

        self.upsampled_flow6_to_5 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=False)
        self.upsampled_flow5_to_4 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=False)
        self.upsampled_flow4_to_3 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=False)
        self.upsampled_flow3_to_2 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=False)

        # Same initialisation for plain and transposed convolutions.
        for layer in self.modules():
            if not isinstance(layer, (nn.Conv2d, nn.ConvTranspose2d)):
                continue
            if layer.bias is not None:
                init.uniform_(layer.bias)
            init.xavier_uniform_(layer.weight)
        self.upsample1 = nn.Upsample(scale_factor=4, mode='bilinear')

    def forward(self, x):
        """Predict multi-scale flow from the channel-stacked input ``x``.

        Returns:
            ``(flow2, flow3, flow4, flow5, flow6)`` in training mode, otherwise
            the 1-tuple ``(flow2,)``.
        """
        # Encoder
        enc2 = self.conv2(self.conv1(x))
        enc3 = self.conv3_1(self.conv3(enc2))
        enc4 = self.conv4_1(self.conv4(enc3))
        enc5 = self.conv5_1(self.conv5(enc4))
        enc6 = self.conv6_1(self.conv6(enc5))

        # Decoder: predict flow, upsample it, and fuse with skip features
        flow6 = self.predict_flow6(enc6)
        cat5 = torch.cat((enc5, self.deconv5(enc6), self.upsampled_flow6_to_5(flow6)), 1)

        flow5 = self.predict_flow5(cat5)
        cat4 = torch.cat((enc4, self.deconv4(cat5), self.upsampled_flow5_to_4(flow5)), 1)

        flow4 = self.predict_flow4(cat4)
        cat3 = torch.cat((enc3, self.deconv3(cat4), self.upsampled_flow4_to_3(flow4)), 1)

        flow3 = self.predict_flow3(cat3)
        cat2 = torch.cat((enc2, self.deconv2(cat3), self.upsampled_flow3_to_2(flow3)), 1)

        flow2 = self.predict_flow2(cat2)

        if not self.training:
            return flow2,
        return flow2, flow3, flow4, flow5, flow6


================================================
FILE: dvs/flownet2/networks/FlowNetSD.py
================================================
import torch
import torch.nn as nn
from torch.nn import init

import math
import numpy as np

from .submodules import *
'Parameter count = 45,371,666'

class FlowNetSD(nn.Module):
    """Encoder/decoder flow network with intermediate decoder convolutions.

    Compared with the plain FlowNetS layout in this package, the encoder starts
    with an extra stride-1 convolution and every decoder level passes its
    fused features through an ``i_conv`` layer before predicting flow.
    """

    def __init__(self, args, batchNorm=True):
        super(FlowNetSD, self).__init__()

        self.batchNorm = batchNorm
        bn = self.batchNorm

        # Encoder
        self.conv0 = conv(bn, 6, 64)
        self.conv1 = conv(bn, 64, 64, stride=2)
        self.conv1_1 = conv(bn, 64, 128)
        self.conv2 = conv(bn, 128, 128, stride=2)
        self.conv2_1 = conv(bn, 128, 128)
        self.conv3 = conv(bn, 128, 256, stride=2)
        self.conv3_1 = conv(bn, 256, 256)
        self.conv4 = conv(bn, 256, 512, stride=2)
        self.conv4_1 = conv(bn, 512, 512)
        self.conv5 = conv(bn, 512, 512, stride=2)
        self.conv5_1 = conv(bn, 512, 512)
        self.conv6 = conv(bn, 512, 1024, stride=2)
        self.conv6_1 = conv(bn, 1024, 1024)

        # Decoder: inputs are skip features + deconv output + 2 flow channels.
        self.deconv5 = deconv(1024, 512)
        self.deconv4 = deconv(1026, 256)
        self.deconv3 = deconv(770, 128)
        self.deconv2 = deconv(386, 64)

        self.inter_conv5 = i_conv(bn, 1026, 512)
        self.inter_conv4 = i_conv(bn, 770, 256)
        self.inter_conv3 = i_conv(bn, 386, 128)
        self.inter_conv2 = i_conv(bn, 194, 64)

        self.predict_flow6 = predict_flow(1024)
        self.predict_flow5 = predict_flow(512)
        self.predict_flow4 = predict_flow(256)
        self.predict_flow3 = predict_flow(128)
        self.predict_flow2 = predict_flow(64)

        self.upsampled_flow6_to_5 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1)
        self.upsampled_flow5_to_4 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1)
        self.upsampled_flow4_to_3 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1)
        self.upsampled_flow3_to_2 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1)

        # Same initialisation for plain and transposed convolutions.
        for layer in self.modules():
            if not isinstance(layer, (nn.Conv2d, nn.ConvTranspose2d)):
                continue
            if layer.bias is not None:
                init.uniform_(layer.bias)
            init.xavier_uniform_(layer.weight)
        self.upsample1 = nn.Upsample(scale_factor=4, mode='bilinear')


    def forward(self, x):
        """Predict multi-scale flow from the channel-stacked input ``x``.

        Returns:
            ``(flow2, flow3, flow4, flow5, flow6)`` in training mode, otherwise
            the 1-tuple ``(flow2,)``.
        """
        # Encoder
        enc0 = self.conv0(x)
        enc1 = self.conv1_1(self.conv1(enc0))
        enc2 = self.conv2_1(self.conv2(enc1))
        enc3 = self.conv3_1(self.conv3(enc2))
        enc4 = self.conv4_1(self.conv4(enc3))
        enc5 = self.conv5_1(self.conv5(enc4))
        enc6 = self.conv6_1(self.conv6(enc5))

        # Decoder: each level fuses skip features, deconv output and the
        # upsampled flow, then refines the fusion before predicting flow.
        flow6 = self.predict_flow6(enc6)
        cat5 = torch.cat((enc5, self.deconv5(enc6), self.upsampled_flow6_to_5(flow6)), 1)

        flow5 = self.predict_flow5(self.inter_conv5(cat5))
        cat4 = torch.cat((enc4, self.deconv4(cat5), self.upsampled_flow5_to_4(flow5)), 1)

        flow4 = self.predict_flow4(self.inter_conv4(cat4))
        cat3 = torch.cat((enc3, self.deconv3(cat4), self.upsampled_flow4_to_3(flow4)), 1)

        flow3 = self.predict_flow3(self.inter_conv3(cat3))
        cat2 = torch.cat((enc2, self.deconv2(cat3), self.upsampled_flow3_to_2(flow3)), 1)

        flow2 = self.predict_flow2(self.inter_conv2(cat2))

        if not self.training:
            return flow2,
        return flow2, flow3, flow4, flow5, flow6


================================================
FILE: dvs/flownet2/networks/__init__.py
================================================


================================================
FILE: dvs/flownet2/networks/channelnorm_package/__init__.py
================================================


================================================
FILE: dvs/flownet2/networks/channelnorm_package/channelnorm.py
================================================
from torch.autograd import Function, Variable
from torch.nn.modules.module import Module
import channelnorm_cuda

class ChannelNormFunction(Function):
    """Autograd function computing a per-pixel norm over the channel axis.

    Both passes are delegated to the compiled ``channelnorm_cuda`` extension.
    """

    @staticmethod
    def forward(ctx, input1, norm_deg=2):
        """Reduce a contiguous 4-D ``input1`` over dim 1.

        Args:
            ctx: autograd context; stores ``input1``, the output and
                ``norm_deg`` for the backward pass.
            input1: contiguous 4-D tensor.
            norm_deg: forwarded unchanged to the extension.

        Returns:
            Tensor with the shape of ``input1`` except that dim 1 has size 1.
        """
        # The extension is handed the raw buffer, so a dense layout is required.
        assert input1.is_contiguous()
        b, _, h, w = input1.size()
        output = input1.new_zeros((b, 1, h, w))

        channelnorm_cuda.forward(input1, output, norm_deg)
        ctx.save_for_backward(input1, output)
        ctx.norm_deg = norm_deg

        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. ``input1`` (and ``None`` for ``norm_deg``)."""
        input1, output = ctx.saved_tensors

        # Autograd may hand over a strided view (for example a slice of a
        # larger gradient); the extension reads the buffer as dense memory,
        # so force a contiguous layout first.
        grad_output = grad_output.contiguous()
        grad_input1 = input1.new_zeros(input1.size())

        channelnorm_cuda.backward(input1, output, grad_output.data,
                                  grad_input1.data, ctx.norm_deg)

        return grad_input1, None


class ChannelNorm(Module):
    """Module wrapper that applies ``ChannelNormFunction`` to its input."""

    def __init__(self, norm_deg=2):
        super(ChannelNorm, self).__init__()
        # Passed through to ChannelNormFunction on every forward call.
        self.norm_deg = norm_deg

    def forward(self, input1):
        """Return ``ChannelNormFunction.apply(input1, self.norm_deg)``."""
        degree = self.norm_deg
        return ChannelNormFunction.apply(input1, degree)


================================================
FILE: dvs/flownet2/networks/channelnorm_package/channelnorm_cuda.cc
================================================
#include <torch/torch.h>
#include <ATen/ATen.h>

#include "channelnorm_kernel.cuh"

// Forward entry point exposed to Python: hands the tensors to the CUDA
// launcher, which writes its result into `output`. Always returns 1.
int channelnorm_cuda_forward(at::Tensor& input1, at::Tensor& output, int norm_deg) {
    channelnorm_kernel_forward(input1, output, norm_deg);

    return 1;
}


// Backward entry point exposed to Python: hands the tensors to the CUDA
// launcher, which writes the input gradient into `gradInput1`.
// Always returns 1.
int channelnorm_cuda_backward(at::Tensor& input1,
                              at::Tensor& output,
                              at::Tensor& gradOutput,
                              at::Tensor& gradInput1,
                              int norm_deg) {
    channelnorm_kernel_backward(input1, output, gradOutput, gradInput1, norm_deg);

    return 1;
}

// Python bindings: expose the two wrappers above as `forward` and `backward`
// on the extension module (its name is supplied by TORCH_EXTENSION_NAME).
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &channelnorm_cuda_forward, "Channel norm forward (CUDA)");
  m.def("backward", &channelnorm_cuda_backward, "Channel norm backward (CUDA)");
}


================================================
FILE: dvs/flownet2/networks/channelnorm_package/channelnorm_kernel.cu
================================================
#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <ATen/cuda/CUDAContext.h>

#include "channelnorm_kernel.cuh"

#define CUDA_NUM_THREADS 512 

#define DIM0(TENSOR) ((TENSOR).x)
#define DIM1(TENSOR) ((TENSOR).y)
#define DIM2(TENSOR) ((TENSOR).z)
#define DIM3(TENSOR) ((TENSOR).w)

#define DIM3_INDEX(TENSOR, xx, yy, zz, ww) ((TENSOR)[((xx) * (TENSOR##_stride.x)) + ((yy) * (TENSOR##_stride.y)) + ((zz) * (TENSOR##_stride.z)) + ((ww) * (TENSOR##_stride.w))])

using at::Half;

// Forward kernel: one thread per output element. Each thread accumulates the
// squared input values over all channels at its (batch, y, x) position and
// stores the square root.
//
// Indexing is done with dense row-major arithmetic derived from the sizes;
// the *_stride arguments and norm_deg are accepted but not used here, so the
// result is always the L2 norm.
template <typename scalar_t>
__global__ void kernel_channelnorm_update_output(
    const int n, 
    const scalar_t* __restrict__ input1,
    const long4 input1_size,
    const long4 input1_stride,
    scalar_t* __restrict__ output, 
    const long4 output_size,
    const long4 output_stride,
    int norm_deg) {

    int index = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the last output element have nothing to do.
    if (index >= n) {
        return;
    }

    int dim_b = DIM0(output_size);
    int dim_c = DIM1(output_size);
    int dim_h = DIM2(output_size);
    int dim_w = DIM3(output_size);
    int dim_chw = dim_c * dim_h * dim_w;

    // Decompose the flat output index into batch / row / column.
    int b = ( index / dim_chw ) % dim_b;
    int y = ( index / dim_w )   % dim_h;
    int x = ( index          )  % dim_w;

    int i1dim_c = DIM1(input1_size);
    int i1dim_h = DIM2(input1_size);
    int i1dim_w = DIM3(input1_size);
    int i1dim_chw = i1dim_c * i1dim_h * i1dim_w;
    int i1dim_hw  = i1dim_h * i1dim_w;

    float result = 0.0f;

    for (int c = 0; c < i1dim_c; ++c) {
        int i1Index = b * i1dim_chw + c * i1dim_hw + y * i1dim_w + x;
        // Promote to float BEFORE squaring: multiplying two scalar_t values
        // yields a scalar_t, so for half-precision input the square would be
        // rounded (and could overflow) before it reaches the accumulator.
        const float val = static_cast<float>(input1[i1Index]);
        result += val * val;
    }
    result = sqrt(result);
    output[index] = static_cast<scalar_t>(result);
}


// Backward kernel: one thread per element of gradInput. Each thread computes
//   gradInput[i] = gradOutput[o] * input1[i] / (output[o] + 1e-9)
// where o is the output element at the same (batch, y, x) position.
//
// Indexing uses dense row-major arithmetic derived from gradInput_size; the
// *_stride arguments and norm_deg are accepted but not used here.
template <typename scalar_t>
__global__ void kernel_channelnorm_backward_input1(
    const int n,
    const scalar_t* __restrict__ input1, const long4 input1_size, const long4 input1_stride,
    const scalar_t* __restrict__ output, const long4 output_size, const long4 output_stride, 
    const scalar_t* __restrict__ gradOutput, const long4 gradOutput_size, const long4 gradOutput_stride,
    scalar_t* __restrict__ gradInput, const long4 gradInput_size, const long4 gradInput_stride, 
    int norm_deg) {

    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) {
        return;  // surplus thread in the last block
    }

    const int num_b = DIM0(gradInput_size);
    const int num_c = DIM1(gradInput_size);
    const int num_h = DIM2(gradInput_size);
    const int num_w = DIM3(gradInput_size);
    const int chw = num_c * num_h * num_w;
    const int hw  = num_h * num_w;

    // Position of this element, ignoring its channel.
    const int b = (tid / chw) % num_b;
    const int y = (tid / num_w) % num_h;
    const int x = tid % num_w;

    // Matching element of the single-channel output / gradOutput.
    const int outIndex = b * hw + y * num_w + x;

    const float numerator =
        static_cast<float>(gradOutput[outIndex]) * static_cast<float>(input1[tid]);
    // The 1e-9 literal is a double, so the sum and the division below are
    // carried out in double precision before narrowing back to float.
    const float val = numerator / (static_cast<float>(output[outIndex]) + 1e-9);

    gradInput[tid] = static_cast<scalar_t>(val);
}

// Host launcher for the forward kernel: packs the 4-D size/stride metadata of
// both tensors, then starts one thread per element of `output` on the current
// CUDA stream. No launch-error check is performed afterwards.
void channelnorm_kernel_forward(
    at::Tensor& input1, 
    at::Tensor& output, 
    int norm_deg) {

    auto sizes_of = [](const at::Tensor& t) {
        return make_long4(t.size(0), t.size(1), t.size(2), t.size(3));
    };
    auto strides_of = [](const at::Tensor& t) {
        return make_long4(t.stride(0), t.stride(1), t.stride(2), t.stride(3));
    };

    const long4 input1_size = sizes_of(input1);
    const long4 input1_stride = strides_of(input1);

    const long4 output_size = sizes_of(output);
    const long4 output_stride = strides_of(output);

    int n = output.numel();
    // Enough blocks of CUDA_NUM_THREADS threads to cover all n elements.
    const int num_blocks = (n + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "channelnorm_forward", ([&] {

      kernel_channelnorm_update_output<scalar_t>
          <<< num_blocks, CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream() >>>(
              n,
              input1.data<scalar_t>(), input1_size, input1_stride,
              output.data<scalar_t>(), output_size, output_stride,
              norm_deg);

    }));

    // TODO: ATen-equivalent of the former THCudaCheck(cudaGetLastError()).
}

// Host launcher for the backward kernel: packs the 4-D size/stride metadata
// of all four tensors, then starts one thread per element of `gradInput1` on
// the current CUDA stream. No launch-error check is performed afterwards.
void channelnorm_kernel_backward(
    at::Tensor& input1, 
    at::Tensor& output,
    at::Tensor& gradOutput, 
    at::Tensor& gradInput1, 
    int norm_deg) {

    auto sizes_of = [](const at::Tensor& t) {
        return make_long4(t.size(0), t.size(1), t.size(2), t.size(3));
    };
    auto strides_of = [](const at::Tensor& t) {
        return make_long4(t.stride(0), t.stride(1), t.stride(2), t.stride(3));
    };

    const long4 input1_size = sizes_of(input1);
    const long4 input1_stride = strides_of(input1);

    const long4 output_size = sizes_of(output);
    const long4 output_stride = strides_of(output);

    const long4 gradOutput_size = sizes_of(gradOutput);
    const long4 gradOutput_stride = strides_of(gradOutput);

    const long4 gradInput1_size = sizes_of(gradInput1);
    const long4 gradInput1_stride = strides_of(gradInput1);

    int n = gradInput1.numel();
    // Enough blocks of CUDA_NUM_THREADS threads to cover all n elements.
    const int num_blocks = (n + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "channelnorm_backward_input1", ([&] {

      kernel_channelnorm_backward_input1<scalar_t>
          <<< num_blocks, CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream() >>>(
              n,
              input1.data<scalar_t>(), input1_size, input1_stride,
              output.data<scalar_t>(), output_size, output_stride,
              gradOutput.data<scalar_t>(), gradOutput_size, gradOutput_stride,
              gradInput1.data<scalar_t>(), gradInput1_size, gradInput1_stride,
              norm_deg);

    }));

    // TODO: ATen-equivalent of the former THCudaCheck(cudaGetLastError()).
}


================================================
FILE: dvs/flownet2/networks/channelnorm_package/channelnorm_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>

// Launches the channel-norm forward CUDA kernel.
//   input1   - source tensor; its scalar type selects the kernel instantiation.
//   output   - destination tensor handed to the kernel.
//   norm_deg - forwarded unchanged to the kernel.
void channelnorm_kernel_forward(
    at::Tensor& input1,
    at::Tensor& output, 
    int norm_deg);


// Launches the channel-norm backward CUDA kernel.
//   input1     - tensor whose scalar type selects the kernel instantiation.
//   output     - forwarded to the kernel.
//   gradOutput - forwarded to the kernel.
//   gradInput1 - forwarded to the kernel; its element count sizes the launch.
//   norm_deg   - forwarded unchanged to the kernel.
// All tensors must have at least four dimensions.
void channelnorm_kernel_backward(
    at::Tensor& input1,
    at::Tensor& output,
    at::Tensor& gradOutput,
    at::Tensor& gradInput1,
    int norm_deg);


================================================
FILE: dvs/flownet2/networks/channelnorm_package/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

# Flags handed to the host C++ compiler.
cxx_args = ['-std=c++11']

# (arch, code) pairs passed to nvcc, one ``-gencode`` option per pair.
_GENCODE_TARGETS = (
    ('compute_52', 'sm_52'),
    ('compute_60', 'sm_60'),
    ('compute_61', 'sm_61'),
    ('compute_70', 'sm_70'),
    ('compute_70', 'compute_70'),
)

# Flags handed to nvcc: "-gencode arch=<arch>,code=<code>" for every target.
nvcc_args = []
for _arch, _code in _GENCODE_TARGETS:
    nvcc_args.extend(['-gencode', 'arch=%s,code=%s' % (_arch, _code)])

# The extension is built from the pybind11 wrapper plus the CUDA kernels.
_channelnorm_extension = CUDAExtension(
    'channelnorm_cuda',
    ['channelnorm_cuda.cc', 'channelnorm_kernel.cu'],
    extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
)

setup(
    name='channelnorm_cuda',
    ext_modules=[_channelnorm_extension],
    cmdclass={'build_ext': BuildExtension},
)


================================================
FILE: dvs/flownet2/networks/correlation_package/__init__.py
================================================


================================================
FILE: dvs/flownet2/networks/correlation_package/correlation.py
================================================
import torch
from torch.nn.modules.module import Module
from torch.autograd import Function
import correlation_cuda

class CorrelationFunction(Function):
    """Autograd bridge to the compiled ``correlation_cuda`` extension.

    The extension's kernels address ``input1``, ``input2`` and the incoming
    gradient as densely packed memory and never consult their strides, so
    every tensor is made contiguous before it is handed over.
    ``Tensor.contiguous()`` returns the tensor itself when it is already
    contiguous, so the common case pays no extra copy.
    """

    @staticmethod
    def forward(ctx, input1, input2, pad_size=3, kernel_size=3, max_displacement=20, stride1=1, stride2=2, corr_multiply=1):
        """Compute the correlation volume of ``input1`` and ``input2``.

        Args:
            ctx: autograd context; receives the inputs and hyper-parameters
                needed by ``backward``.
            input1: first feature map (indexed by the extension as a 4-D
                tensor).
            input2: second feature map, same layout as ``input1``.
            pad_size: spatial padding applied to both inputs.
            kernel_size: side length of the correlation patch.
            max_displacement: largest displacement that is searched.
            stride1: stride over positions of ``input1``.
            stride2: stride over displacements.
            corr_multiply: forwarded unchanged to the extension.

        Returns:
            The output tensor allocated and filled by
            ``correlation_cuda.forward``.
        """
        ctx.save_for_backward(input1, input2)

        ctx.pad_size = pad_size
        ctx.kernel_size = kernel_size
        ctx.max_displacement = max_displacement
        ctx.stride1 = stride1
        ctx.stride2 = stride2
        ctx.corr_multiply = corr_multiply

        # The kernels ignore strides; a non-contiguous view would be read
        # with the wrong element order and give silently wrong results.
        input1 = input1.contiguous()
        input2 = input2.contiguous()

        with torch.cuda.device_of(input1):
            # Empty tensors of matching type/device; the extension resizes
            # and zero-fills them itself.
            rbot1 = input1.new()
            rbot2 = input2.new()
            output = input1.new()

            correlation_cuda.forward(input1, input2, rbot1, rbot2, output,
                ctx.pad_size, ctx.kernel_size, ctx.max_displacement, ctx.stride1, ctx.stride2, ctx.corr_multiply)

        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Back-propagate ``grad_output`` to both inputs.

        Returns:
            A tuple with the gradients for ``input1`` and ``input2`` followed
            by ``None`` for each of the six non-tensor arguments of
            ``forward``.
        """
        input1, input2 = ctx.saved_tensors

        # Same layout requirement as in forward; gradients arriving from
        # slicing/transposing ops are frequently non-contiguous.
        input1 = input1.contiguous()
        input2 = input2.contiguous()
        grad_output = grad_output.contiguous()

        with torch.cuda.device_of(input1):
            # Scratch buffers and result tensors are sized by the extension.
            rbot1 = input1.new()
            rbot2 = input2.new()

            grad_input1 = input1.new()
            grad_input2 = input2.new()

            correlation_cuda.backward(input1, input2, rbot1, rbot2, grad_output, grad_input1, grad_input2,
                ctx.pad_size, ctx.kernel_size, ctx.max_displacement, ctx.stride1, ctx.stride2, ctx.corr_multiply)

        return grad_input1, grad_input2, None, None, None, None, None, None


class Correlation(Module):
    """Module wrapper that stores the correlation hyper-parameters.

    The stored values are passed positionally to ``CorrelationFunction.apply``
    on every forward call; the module itself holds no learnable parameters.
    """

    def __init__(self, pad_size=0, kernel_size=0, max_displacement=0, stride1=1, stride2=2, corr_multiply=1):
        super().__init__()
        self.pad_size, self.kernel_size = pad_size, kernel_size
        self.max_displacement = max_displacement
        self.stride1, self.stride2 = stride1, stride2
        self.corr_multiply = corr_multiply

    def forward(self, input1, input2):
        """Return the correlation of ``input1`` and ``input2``."""
        return CorrelationFunction.apply(
            input1,
            input2,
            self.pad_size,
            self.kernel_size,
            self.max_displacement,
            self.stride1,
            self.stride2,
            self.corr_multiply,
        )


================================================
FILE: dvs/flownet2/networks/correlation_package/correlation_cuda.cc
================================================
#include <torch/torch.h>
#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <ATen/cuda/CUDAContext.h>
#include <stdio.h>
#include <iostream>

#include "correlation_cuda_kernel.cuh"

// Python-facing entry point for the correlation forward pass.
//
// Derives the padded input size and the output shape from the
// hyper-parameters, resizes and zero-fills the two scratch buffers and the
// output tensor, and then calls correlation_forward_cuda_kernel on the
// current CUDA stream.
//
// Parameters:
//   input1, input2   - feature maps, read as (batch, channels, height, width).
//   rInput1, rInput2 - scratch tensors; resized here to
//                      (batch, paddedHeight, paddedWidth, channels).
//   output           - result tensor; resized here to
//                      (batch, nOutputChannels, outputHeight, outputWidth).
//   pad_size, kernel_size, max_displacement, stride1, stride2,
//   corr_type_multiply - correlation hyper-parameters, forwarded to the
//                      kernel launcher.
//
// Returns 1 on success; raises through AT_ERROR when the launcher reports
// failure.
int correlation_forward_cuda(at::Tensor& input1, at::Tensor& input2, at::Tensor& rInput1, at::Tensor& rInput2, at::Tensor& output,
                       int pad_size,
                       int kernel_size,
                       int max_displacement,
                       int stride1,
                       int stride2,
                       int corr_type_multiply)
{

  int batchSize = input1.size(0);

  int nInputChannels = input1.size(1);
  int inputHeight = input1.size(2);
  int inputWidth = input1.size(3);

  // Margin that cannot be used as a patch centre: half the patch plus the
  // largest displacement.
  int kernel_radius = (kernel_size - 1) / 2;
  int border_radius = kernel_radius + max_displacement;

  int paddedInputHeight = inputHeight + 2 * pad_size;
  int paddedInputWidth = inputWidth + 2 * pad_size;

  // One output channel per displacement on a square grid of
  // (2 * (max_displacement / stride2) + 1) positions per axis.
  int nOutputChannels = ((max_displacement/stride2)*2 + 1) * ((max_displacement/stride2)*2 + 1);

  // Number of stride1 steps that fit inside the padded input minus the border.
  int outputHeight = ceil(static_cast<float>(paddedInputHeight - 2 * border_radius) / static_cast<float>(stride1));
  int outputwidth = ceil(static_cast<float>(paddedInputWidth - 2 * border_radius) / static_cast<float>(stride1));

  rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  output.resize_({batchSize, nOutputChannels, outputHeight, outputwidth});

  // Zero-fill so the padding border of the scratch buffers reads as zero.
  rInput1.fill_(0);
  rInput2.fill_(0);
  output.fill_(0);

  int success = correlation_forward_cuda_kernel(
    output,
    output.size(0), 
    output.size(1),
    output.size(2),
    output.size(3),
    output.stride(0),
    output.stride(1),
    output.stride(2),
    output.stride(3),
    input1,
    input1.size(1),
    input1.size(2),
    input1.size(3),
    input1.stride(0),
    input1.stride(1),
    input1.stride(2),
    input1.stride(3),
    input2,
    input2.size(1),
    input2.stride(0),
    input2.stride(1),
    input2.stride(2),
    input2.stride(3),
    rInput1,
    rInput2,
    pad_size,     
    kernel_size,
    max_displacement,
    stride1,
    stride2,
    corr_type_multiply,
	at::cuda::getCurrentCUDAStream()
    //at::globalContext().getCurrentCUDAStream()
  );

  //check for errors
  if (!success) {
    AT_ERROR("CUDA call failed");
  }

  return 1;

}

// Python-facing entry point for the correlation backward pass.
//
// Resizes and zero-fills the two scratch buffers and both gradient tensors,
// then calls correlation_backward_cuda_kernel on the current CUDA stream.
//
// Parameters:
//   input1, input2         - forward inputs, read as
//                            (batch, channels, height, width).
//   rInput1, rInput2       - scratch tensors; resized here to
//                            (batch, paddedHeight, paddedWidth, channels).
//   gradOutput             - gradient with respect to the forward output.
//   gradInput1, gradInput2 - results; resized here to the shape of input1.
//   pad_size, kernel_size, max_displacement, stride1, stride2,
//   corr_type_multiply     - correlation hyper-parameters, forwarded to the
//                            kernel launcher.
//
// Returns 1 on success; raises through AT_ERROR when the launcher reports
// failure.
int correlation_backward_cuda(at::Tensor& input1, at::Tensor& input2, at::Tensor& rInput1, at::Tensor& rInput2, at::Tensor& gradOutput, 
                       at::Tensor& gradInput1, at::Tensor& gradInput2,
                       int pad_size,
                       int kernel_size,
                       int max_displacement,
                       int stride1,
                       int stride2,
                       int corr_type_multiply)
{

  int batchSize = input1.size(0);
  int nInputChannels = input1.size(1);
  int paddedInputHeight = input1.size(2)+ 2 * pad_size;
  int paddedInputWidth = input1.size(3)+ 2 * pad_size;

  int height = input1.size(2);
  int width = input1.size(3);

  rInput1.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  rInput2.resize_({batchSize, paddedInputHeight, paddedInputWidth, nInputChannels});
  gradInput1.resize_({batchSize, nInputChannels, height, width});
  gradInput2.resize_({batchSize, nInputChannels, height, width});

  // The backward kernels skip positions that receive no contribution and
  // rely on the gradients already being zero there.
  rInput1.fill_(0);
  rInput2.fill_(0);
  gradInput1.fill_(0);
  gradInput2.fill_(0);

  int success = correlation_backward_cuda_kernel(gradOutput,
                                                gradOutput.size(0),
                                                gradOutput.size(1),
                                                gradOutput.size(2),
                                                gradOutput.size(3),
                                                gradOutput.stride(0),
                                                gradOutput.stride(1),
                                                gradOutput.stride(2),
                                                gradOutput.stride(3),
                                                input1,
                                                input1.size(1),
                                                input1.size(2),
                                                input1.size(3),
                                                input1.stride(0),
                                                input1.stride(1),
                                                input1.stride(2),
                                                input1.stride(3),
                                                input2,  
                                                input2.stride(0),
                                                input2.stride(1),
                                                input2.stride(2),
                                                input2.stride(3),
                                                gradInput1,
                                                gradInput1.stride(0),
                                                gradInput1.stride(1),
                                                gradInput1.stride(2),
                                                gradInput1.stride(3),
                                                gradInput2,
                                                gradInput2.size(1),
                                                gradInput2.stride(0),
                                                gradInput2.stride(1),
                                                gradInput2.stride(2),
                                                gradInput2.stride(3),
                                                rInput1,
                                                rInput2,
                                                pad_size,
                                                kernel_size,
                                                max_displacement,
                                                stride1, 
                                                stride2,
                                                corr_type_multiply,
												at::cuda::getCurrentCUDAStream()
                                                //at::globalContext().getCurrentCUDAStream()
                                               );

  if (!success) {
    AT_ERROR("CUDA call failed");
  }

  return 1;
}

// Python bindings: exposes the two host entry points above as the module
// functions "forward" and "backward". The module name comes from
// TORCH_EXTENSION_NAME.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &correlation_forward_cuda, "Correlation forward (CUDA)");
  m.def("backward", &correlation_backward_cuda, "Correlation backward (CUDA)");
}


================================================
FILE: dvs/flownet2/networks/correlation_package/correlation_cuda_kernel.cu
================================================
#include <stdio.h>

#include "correlation_cuda_kernel.cuh"

#define CUDA_NUM_THREADS 1024
#define THREADS_PER_BLOCK 32
#define FULL_MASK 0xffffffff

#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>

using at::Half;

// Adds up `val` over the lanes of a warp with shuffle-down exchanges at
// distances 16, 8, 4, 2 and 1. All lanes named by FULL_MASK must take part.
// Lane 0 ends up with the sum over the whole warp; the other lanes hold
// partial sums only.
template<typename scalar_t>
__forceinline__ __device__ scalar_t warpReduceSum(scalar_t val) {
        int delta = 16;
        while (delta > 0) {
                val += __shfl_down_sync(FULL_MASK, val, delta);
                delta >>= 1;
        }
        return val;
}

// Sums `val` over all threads of a one-dimensional thread block.
//
// Each warp first reduces its own values; lane 0 of every warp stores that
// partial sum in shared memory, and the first warp then reduces the stored
// partials. The total is produced by the first warp's reduction (see
// warpReduceSum for which lane holds it). The shared array has 32 slots, so
// at most 32 warps per block are supported.
//
// The function contains a __syncthreads(), so every thread of the block must
// call it.
template<typename scalar_t>
__forceinline__ __device__ scalar_t blockReduceSum(scalar_t val) {

        // One slot per warp of the block.
        static __shared__ scalar_t shared[32];
        int lane = threadIdx.x % warpSize;
        int wid = threadIdx.x / warpSize;

        // Stage 1: reduce inside each warp.
        val = warpReduceSum(val);

        if (lane == 0)
                shared[wid] = val;

        // Make every warp's partial sum visible before it is read back.
        __syncthreads();

        // Only as many threads as there are warps load a partial; the rest
        // contribute zero.
        val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;

        // Stage 2: the first warp reduces the per-warp partials.
        if (wid == 0)
                val = warpReduceSum(val);

        return val;
}


// Copies a densely packed (batch, channel, y, x) tensor into a padded buffer
// laid out as (batch, y + pad, x + pad, channel), i.e. with the channel as
// the innermost index. Only the interior is written; the padding border is
// left as the caller initialised it.
//
// Launch layout: blockIdx.x = batch item, blockIdx.y = row, blockIdx.z =
// column; the threads of a block step over the channels in strides of
// THREADS_PER_BLOCK.
template <typename scalar_t>
__global__ void channels_first(const scalar_t* __restrict__ input, scalar_t* rinput, int channels, int height, int width, int pad_size)
{
    const int batch = blockIdx.x;
    const int row = blockIdx.y;
    const int col = blockIdx.z;

    // Distance between two channels of the same pixel in the source.
    const int plane = height * width;

    const int padded_width = width + 2 * pad_size;
    const int padded_height = height + 2 * pad_size;
    const int padded_row_pitch = padded_width * channels;

    // Offsets of channel 0 of this pixel in the source and the destination.
    const int src_base = batch * (channels * height * width) + row * width + col;
    const int dst_base = batch * (channels * padded_height * padded_width)
                       + (row + pad_size) * padded_row_pitch
                       + (col + pad_size) * channels;

    for (int ch = threadIdx.x; ch < channels; ch += THREADS_PER_BLOCK) {
      rinput[dst_base + ch] = input[src_base + ch * plane];
    }
}


// Correlation forward kernel.
//
// Launch layout: blockIdx.x = batch item, blockIdx.y / blockIdx.z = output
// row / column; the threads of a block split the channel dimension between
// them in strides of blockDim.x.
//
// rInput1 and rInput2 are read as padded buffers indexed
// (batch, y, x, channel). For every displacement (tj, ti) the block
// accumulates the product of the two kernel_size x kernel_size patches over
// all channels, reduces the per-thread partial sums, and thread 0 writes the
// sum divided by kernel_size * kernel_size * nInputChannels to the output
// channel belonging to that displacement.
template<typename scalar_t>
__global__ void correlation_forward(scalar_t* __restrict__ output, const int nOutputChannels,
                const int outputHeight, const int outputWidth, const scalar_t* __restrict__ rInput1,
                const int nInputChannels, const int inputHeight, const int inputWidth,
                const scalar_t* __restrict__ rInput2, const int pad_size, const int kernel_size,
                const int max_displacement, const int stride1, const int stride2) {

        int32_t pInputWidth = inputWidth + 2 * pad_size;
        int32_t pInputHeight = inputHeight + 2 * pad_size;

        int32_t kernel_rad = (kernel_size - 1) / 2;

        // Number of displacement steps on each side of the centre.
        int32_t displacement_rad = max_displacement / stride2;

        int32_t displacement_size = 2 * displacement_rad + 1;

        // Patch centre in the padded first input for this block.
        int32_t n = blockIdx.x;
        int32_t y1 = blockIdx.y * stride1 + max_displacement;
        int32_t x1 = blockIdx.z * stride1 + max_displacement;
        int32_t c = threadIdx.x;

        // Pitches of the padded (batch, y, x, channel) buffers.
        int32_t pdimyxc = pInputHeight * pInputWidth * nInputChannels;

        int32_t pdimxc = pInputWidth * nInputChannels;

        int32_t pdimc = nInputChannels;

        // Pitches of the (batch, channel, y, x) output.
        int32_t tdimcyx = nOutputChannels * outputHeight * outputWidth;
        int32_t tdimyx = outputHeight * outputWidth;
        int32_t tdimx = outputWidth;

        // Normalisation factor: number of products summed per output value.
        int32_t nelems = kernel_size * kernel_size * pdimc;

        // element-wise product along channel axis
        for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) {
                for (int ti = -displacement_rad; ti <= displacement_rad; ++ti) {
                        // Patch centre in the padded second input.
                        int x2 = x1 + ti * stride2;
                        int y2 = y1 + tj * stride2;

                        // Accumulate in float regardless of scalar_t.
                        float acc0 = 0.0f;

                        for (int j = -kernel_rad; j <= kernel_rad; ++j) {
                                for (int i = -kernel_rad; i <= kernel_rad; ++i) {
                                        // THREADS_PER_BLOCK
                                        #pragma unroll
                                        for (int ch = c; ch < pdimc; ch += blockDim.x) {

                                                int indx1 = n * pdimyxc + (y1 + j) * pdimxc
                                                                + (x1 + i) * pdimc + ch;
                                                int indx2 = n * pdimyxc + (y2 + j) * pdimxc
                                                                + (x2 + i) * pdimc + ch;
                                                acc0 += static_cast<float>(rInput1[indx1] * rInput2[indx2]);
                                        }
                                }
                        }

                        // Combine the per-thread partial sums: a single-warp
                        // block needs only the warp shuffle reduction.
                        if (blockDim.x == warpSize) {
                            __syncwarp();
                            acc0 = warpReduceSum(acc0);
                        } else {
                            __syncthreads();
                            acc0 = blockReduceSum(acc0);
                        }

                        if (threadIdx.x == 0) {

                                // Output channel index of displacement (tj, ti).
                                int tc = (tj + displacement_rad) * displacement_size
                                                + (ti + displacement_rad);
                                const int tindx = n * tdimcyx + tc * tdimyx + blockIdx.y * tdimx
                                                + blockIdx.z;
                                output[tindx] = static_cast<scalar_t>(acc0 / nelems);
                        }
            }
        }
}


// Correlation backward kernel for the first input, for one batch item.
//
// Launch layout: blockIdx.x / blockIdx.y give the input row / column (scaled
// by stride1), blockIdx.z is the input channel, and `item` is the batch
// index. The threads of a block split the output channels (one per
// displacement) between them in strides of THREADS_PER_BLOCK.
//
// Each thread accumulates gradOutput * rInput2 into its own shared-memory
// slot; thread 0 then adds up all THREADS_PER_BLOCK slots and writes the sum
// divided by kernel_size * kernel_size * nInputChannels to gradInput1.
// Because every slot is read, the kernel must be launched with exactly
// THREADS_PER_BLOCK threads per block.
//
// gradInput1 and gradOutput are indexed as densely packed
// (batch, channel, y, x); rInput2 as padded (batch, y, x, channel).
template <typename scalar_t>
__global__ void correlation_backward_input1(int item, scalar_t* gradInput1, int nInputChannels, int inputHeight, int inputWidth, 
                                            const scalar_t* __restrict__ gradOutput, int nOutputChannels, int outputHeight, int outputWidth, 
                                            const scalar_t* __restrict__ rInput2, 
                                            int pad_size,
                                            int kernel_size,
                                            int max_displacement,
                                            int stride1,
                                            int stride2)
  {
    // n (batch size), c (num of channels), y (height), x (width)

    // Position of this block in the padded input.
    int n = item; 
    int y = blockIdx.x * stride1 + pad_size;
    int x = blockIdx.y * stride1 + pad_size;
    int c = blockIdx.z;
    int tch_off = threadIdx.x;

    int kernel_rad = (kernel_size - 1) / 2;
    int displacement_rad = max_displacement / stride2;
    int displacement_size = 2 * displacement_rad + 1;

    // Range of output positions whose patch covers (y, x).
    // NOTE(review): integer division truncates toward zero, so negative
    // numerators are not floored - confirm this is intended.
    int xmin = (x - kernel_rad - max_displacement) / stride1;
    int ymin = (y - kernel_rad - max_displacement) / stride1;

    int xmax = (x + kernel_rad - max_displacement) / stride1;
    int ymax = (y + kernel_rad - max_displacement) / stride1;

    // The exit conditions depend only on block-wide values, so the whole
    // block leaves together before the __syncthreads() below.
    if (xmax < 0 || ymax < 0 || xmin >= outputWidth || ymin >= outputHeight) {
        // assumes gradInput1 is pre-allocated and zero filled
      return;
    }

    if (xmin > xmax || ymin > ymax) {
        // assumes gradInput1 is pre-allocated and zero filled
        return;
    }

    // Clamp the range to the valid output area.
    xmin = max(0,xmin);
    xmax = min(outputWidth-1,xmax);

    ymin = max(0,ymin);
    ymax = min(outputHeight-1,ymax);

    int pInputWidth = inputWidth + 2 * pad_size;
    int pInputHeight = inputHeight + 2 * pad_size;

    // Pitches of the padded (batch, y, x, channel) buffer.
    int pdimyxc = pInputHeight * pInputWidth * nInputChannels;
    int pdimxc = pInputWidth * nInputChannels;
    int pdimc = nInputChannels;

    // Pitches of gradOutput.
    int tdimcyx = nOutputChannels * outputHeight * outputWidth;
    int tdimyx = outputHeight * outputWidth;
    int tdimx = outputWidth;

    // Pitches of gradInput1.
    int odimcyx = nInputChannels * inputHeight* inputWidth;
    int odimyx = inputHeight * inputWidth;
    int odimx = inputWidth;

    // Same normalisation factor as the forward kernel.
    scalar_t nelems = kernel_size * kernel_size * nInputChannels;

    // One accumulator slot per thread.
    __shared__ scalar_t prod_sum[THREADS_PER_BLOCK];
    prod_sum[tch_off] = 0;

    for (int tc = tch_off; tc < nOutputChannels; tc += THREADS_PER_BLOCK) {

      // Displacement encoded by output channel tc.
      int i2 = (tc % displacement_size - displacement_rad) * stride2;
      int j2 = (tc / displacement_size - displacement_rad) * stride2;

      int indx2 = n * pdimyxc + (y + j2)* pdimxc + (x + i2) * pdimc + c;
      
      scalar_t val2 = rInput2[indx2];

      for (int j = ymin; j <= ymax; ++j) {
        for (int i = xmin; i <= xmax; ++i) {
          int tindx = n * tdimcyx + tc * tdimyx + j * tdimx + i;
          prod_sum[tch_off] += gradOutput[tindx] * val2;
        }
      }
    }
    // All slots must be final before thread 0 reads them.
    __syncthreads();

    if(tch_off == 0) {
      scalar_t reduce_sum = 0;
      for(int idx = 0; idx < THREADS_PER_BLOCK; idx++) {
          reduce_sum += prod_sum[idx];
      }
      const int indx1 = n * odimcyx + c * odimyx + (y - pad_size) * odimx + (x - pad_size);
      gradInput1[indx1] = reduce_sum / nelems;
    }

}

// Correlation backward kernel for the second input, for one batch item.
//
// Launch layout: blockIdx.x / blockIdx.y give the input row / column (scaled
// by stride1), blockIdx.z is the input channel, and `item` is the batch
// index. The threads of a block split the output channels (one per
// displacement) between them in strides of THREADS_PER_BLOCK.
//
// Unlike the kernel for the first input, the range of contributing output
// positions depends on the displacement and is recomputed for every output
// channel. Each thread accumulates gradOutput * rInput1 into its own
// shared-memory slot; thread 0 then adds up all THREADS_PER_BLOCK slots and
// writes the sum divided by kernel_size * kernel_size * nInputChannels to
// gradInput2. Because every slot is read, the kernel must be launched with
// exactly THREADS_PER_BLOCK threads per block.
//
// gradInput2 and gradOutput are indexed as densely packed
// (batch, channel, y, x); rInput1 as padded (batch, y, x, channel).
template <typename scalar_t>
__global__ void correlation_backward_input2(int item, scalar_t*  gradInput2, int nInputChannels, int inputHeight, int inputWidth,
                                            const scalar_t* __restrict__ gradOutput, int nOutputChannels, int outputHeight, int outputWidth,
                                            const scalar_t* __restrict__ rInput1,
                                            int pad_size,
                                            int kernel_size,
                                            int max_displacement,
                                            int stride1,
                                            int stride2)
{
    // n (batch size), c (num of channels), y (height), x (width)

    // Position of this block in the padded input.
    int n = item;
    int y = blockIdx.x * stride1 + pad_size;
    int x = blockIdx.y * stride1 + pad_size;
    int c = blockIdx.z;

    int tch_off = threadIdx.x;

    int kernel_rad = (kernel_size - 1) / 2;
    int displacement_rad = max_displacement / stride2;
    int displacement_size = 2 * displacement_rad + 1;

    int pInputWidth = inputWidth + 2 * pad_size;
    int pInputHeight = inputHeight + 2 * pad_size;

    // Pitches of the padded (batch, y, x, channel) buffer.
    int pdimyxc = pInputHeight * pInputWidth * nInputChannels;
    int pdimxc = pInputWidth * nInputChannels;
    int pdimc = nInputChannels;

    // Pitches of gradOutput.
    int tdimcyx = nOutputChannels * outputHeight * outputWidth;
    int tdimyx = outputHeight * outputWidth;
    int tdimx = outputWidth;

    // Pitches of gradInput2.
    int odimcyx = nInputChannels * inputHeight* inputWidth;
    int odimyx = inputHeight * inputWidth;
    int odimx = inputWidth;

    // Same normalisation factor as the forward kernel.
    scalar_t nelems = kernel_size * kernel_size * nInputChannels;

    // One accumulator slot per thread.
    __shared__ scalar_t prod_sum[THREADS_PER_BLOCK];
    prod_sum[tch_off] = 0;

    for (int tc = tch_off; tc < nOutputChannels; tc += THREADS_PER_BLOCK) {
      // Displacement encoded by output channel tc.
      int i2 = (tc % displacement_size - displacement_rad) * stride2;
      int j2 = (tc / displacement_size - displacement_rad) * stride2;

      // Range of output positions that read (y, x) of the second input under
      // this displacement.
      // NOTE(review): integer division truncates toward zero, so negative
      // numerators are not floored - confirm this is intended.
      int xmin = (x - kernel_rad - max_displacement - i2) / stride1;
      int ymin = (y - kernel_rad - max_displacement - j2) / stride1;

      int xmax = (x + kernel_rad - max_displacement - i2) / stride1;
      int ymax = (y + kernel_rad - max_displacement - j2) / stride1;

      // Skipping with `continue` keeps every thread on the path to the
      // __syncthreads() below.
      if (xmax < 0 || ymax < 0 || xmin >= outputWidth || ymin >= outputHeight) {
          // assumes gradInput2 is pre-allocated and zero filled
        continue;
      }

      if (xmin > xmax || ymin > ymax) {
          // assumes gradInput2 is pre-allocated and zero filled
          continue;
      }

      // Clamp the range to the valid output area.
      xmin = max(0,xmin);
      xmax = min(outputWidth-1,xmax);

      ymin = max(0,ymin);
      ymax = min(outputHeight-1,ymax);
      
      // Matching element of the first input, displaced the opposite way.
      int indx1 = n * pdimyxc + (y - j2)* pdimxc + (x - i2) * pdimc + c;
      scalar_t val1 = rInput1[indx1];

      for (int j = ymin; j <= ymax; ++j) {
        for (int i = xmin; i <= xmax; ++i) {
          int tindx = n * tdimcyx + tc * tdimyx + j * tdimx + i;
          prod_sum[tch_off] += gradOutput[tindx] * val1;
        }
      }
    }

    // All slots must be final before thread 0 reads them.
    __syncthreads();

    if(tch_off == 0) {
      scalar_t reduce_sum = 0;
      for(int idx = 0; idx < THREADS_PER_BLOCK; idx++) {
          reduce_sum += prod_sum[idx];
      }
      const int indx2 = n * odimcyx + c * odimyx + (y - pad_size) * odimx + (x - pad_size);
      gradInput2[indx2] = reduce_sum / nelems;
    }

}

// Host-side launcher for the correlation forward pass.
//
// Enqueues three kernels on `stream`:
//   1. channels_first for input1 -> rInput1 (padded, channel innermost),
//   2. channels_first for input2 -> rInput2,
//   3. correlation_forward, one block per (batch, output row, output column).
//
// Only ob, oc, oh, ow, ic, ih and iw are read from the integer arguments;
// the stride arguments (osb..osw, isb..isw, gsb..gsw), gc and
// corr_type_multiply are accepted but not used in this function.
//
// rInput1, rInput2 and output must already have their final size; the
// kernels write into them through raw data pointers.
//
// Returns 1 on success. If cudaGetLastError() reports an error after the
// launches, the message is printed to stdout and 0 is returned.
int correlation_forward_cuda_kernel(at::Tensor& output,
                                    int ob,
                                    int oc,
                                    int oh,
                                    int ow,
                                    int osb,
                                    int osc,
                                    int osh,
                                    int osw,

                                    at::Tensor& input1,
                                    int ic,
                                    int ih,
                                    int iw,
                                    int isb,
                                    int isc,
                                    int ish,
                                    int isw,

                                    at::Tensor& input2,
                                    int gc,
                                    int gsb,
                                    int gsc,
                                    int gsh,
                                    int gsw,

                                    at::Tensor& rInput1,
                                    at::Tensor& rInput2,
                                    int pad_size,
                                    int kernel_size,
                                    int max_displacement,
                                    int stride1,
                                    int stride2,
                                    int corr_type_multiply,
                                    cudaStream_t stream) 
{

   int batchSize = ob;

   int nInputChannels = ic;
   int inputWidth = iw;
   int inputHeight = ih;

   int nOutputChannels = oc;
   int outputWidth = ow;
   int outputHeight = oh;

   // Re-layout pass: one block per input pixel of every batch item.
   dim3 blocks_grid(batchSize, inputHeight, inputWidth);
   dim3 threads_block(THREADS_PER_BLOCK);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "channels_first_fwd_1", ([&] {

  channels_first<scalar_t><<<blocks_grid,threads_block, 0, stream>>>(
      input1.data<scalar_t>(), rInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth, pad_size);

  }));

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "channels_first_fwd_2", ([&] {

  channels_first<scalar_t><<<blocks_grid,threads_block, 0, stream>>> (
      input2.data<scalar_t>(), rInput2.data<scalar_t>(), nInputChannels, inputHeight, inputWidth, pad_size);

  }));

   // Correlation pass: one block per output pixel of every batch item.
   dim3 threadsPerBlock(THREADS_PER_BLOCK);
   dim3 totalBlocksCorr(batchSize, outputHeight, outputWidth);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "correlation_forward", ([&] {

   correlation_forward<scalar_t><<<totalBlocksCorr, threadsPerBlock, 0, stream>>> 
                        (output.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,
                         rInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,
                         rInput2.data<scalar_t>(),
                         pad_size,
                         kernel_size,
                         max_displacement,
                         stride1,
                         stride2);

  }));

  // Picks up launch errors from the kernels enqueued above.
  cudaError_t err = cudaGetLastError();


  // check for errors
  if (err != cudaSuccess) {
    printf("error in correlation_forward_cuda_kernel: %s\n", cudaGetErrorString(err));
    return 0;
  }

  return 1;
}


// Host-side launcher for the correlation backward pass.
//
// Enqueues on `stream`:
//   1. channels_first for input1 -> rInput1 and input2 -> rInput2
//      (padded copies with the channel as innermost index),
//   2. correlation_backward_input1 once per batch item,
//   3. correlation_backward_input2 once per batch item.
// The two backward kernels are launched with one block per
// (input row, input column, input channel) and THREADS_PER_BLOCK threads.
//
// Only gob, goc, goh, gow, ic, ih and iw are read from the integer
// arguments; the stride arguments, ggc and corr_type_multiply are accepted
// but not used in this function.
//
// gradInput1 and gradInput2 must be pre-sized and zero-filled by the caller:
// the kernels skip positions that receive no contribution.
//
// Every dispatch is labelled with the kernel it launches, so that an
// unsupported dtype is reported against the right operation.
//
// Returns 1 on success. If cudaGetLastError() reports an error after the
// launches, the message is printed to stdout and 0 is returned.
int correlation_backward_cuda_kernel(
                                    at::Tensor& gradOutput,
                                    int gob,
                                    int goc,
                                    int goh,
                                    int gow,
                                    int gosb,
                                    int gosc,
                                    int gosh,
                                    int gosw,

                                    at::Tensor& input1,
                                    int ic,
                                    int ih,
                                    int iw,
                                    int isb,
                                    int isc,
                                    int ish,
                                    int isw,

                                    at::Tensor& input2,
                                    int gsb,
                                    int gsc,
                                    int gsh,
                                    int gsw,

                                    at::Tensor& gradInput1,
                                    int gisb,
                                    int gisc,
                                    int gish,
                                    int gisw,

                                    at::Tensor& gradInput2,
                                    int ggc,
                                    int ggsb,
                                    int ggsc,
                                    int ggsh,
                                    int ggsw,

                                    at::Tensor& rInput1,
                                    at::Tensor& rInput2,
                                    int pad_size,
                                    int kernel_size,
                                    int max_displacement,
                                    int stride1,
                                    int stride2,
                                    int corr_type_multiply,
                                    cudaStream_t stream)
{

    int batchSize = gob;

    int nInputChannels = ic;
    int inputWidth = iw;
    int inputHeight = ih;

    int nOutputChannels = goc;
    int outputWidth = gow;
    int outputHeight = goh;

    // Re-layout pass: one block per input pixel of every batch item.
    dim3 blocks_grid(batchSize, inputHeight, inputWidth);
    dim3 threads_block(THREADS_PER_BLOCK);

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "channels_first_bwd_1", ([&] {

        channels_first<scalar_t><<<blocks_grid, threads_block, 0, stream>>>(
            input1.data<scalar_t>(),
            rInput1.data<scalar_t>(),
            nInputChannels,
            inputHeight,
            inputWidth,
            pad_size
        );
    }));

    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "channels_first_bwd_2", ([&] {

        channels_first<scalar_t><<<blocks_grid, threads_block, 0, stream>>>(
            input2.data<scalar_t>(),
            rInput2.data<scalar_t>(),
            nInputChannels,
            inputHeight,
            inputWidth,
            pad_size
        );
    }));

    // Gradient passes: one block per (row, column, channel) of the input.
    dim3 threadsPerBlock(THREADS_PER_BLOCK);
    dim3 totalBlocksCorr(inputHeight, inputWidth, nInputChannels);

    // Gradient with respect to input1, one launch per batch item.
    for (int n = 0; n < batchSize; ++n) {

      AT_DISPATCH_FLOATING_TYPES_AND_HALF(input2.type(), "correlation_backward_input1", ([&] {

          correlation_backward_input1<scalar_t><<<totalBlocksCorr, threadsPerBlock, 0, stream>>> (
              n, gradInput1.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,
              gradOutput.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,
              rInput2.data<scalar_t>(),
              pad_size,
              kernel_size,
              max_displacement,
              stride1,
              stride2);
      }));
    }

    // Gradient with respect to input2, one launch per batch item.
    for (int n = 0; n < batchSize; ++n) {

      AT_DISPATCH_FLOATING_TYPES_AND_HALF(rInput1.type(), "correlation_backward_input2", ([&] {

        correlation_backward_input2<scalar_t><<<totalBlocksCorr, threadsPerBlock, 0, stream>>>(
            n, gradInput2.data<scalar_t>(), nInputChannels, inputHeight, inputWidth,
            gradOutput.data<scalar_t>(), nOutputChannels, outputHeight, outputWidth,
            rInput1.data<scalar_t>(),
            pad_size,
            kernel_size,
            max_displacement,
            stride1,
            stride2);

        }));
    }

  // check for errors
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("error in correlation_backward_cuda_kernel: %s\n", cudaGetErrorString(err));
    return 0;
  }

  return 1;
}


================================================
FILE: dvs/flownet2/networks/correlation_package/correlation_cuda_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <cuda_runtime.h>

// Launches the correlation forward kernels on `stream`.
//
// Argument groups:
//   output  + ob, oc, oh, ow (sizes)  + osb, osc, osh, osw (strides)
//   input1  + ic, ih, iw (sizes)      + isb, isc, ish, isw (strides)
//   input2  + gc (channel count)      + gsb, gsc, gsh, gsw (strides)
//   rInput1, rInput2 - pre-sized scratch buffers for the padded inputs
//   pad_size .. corr_type_multiply - correlation hyper-parameters
//
// Returns 1 on success and 0 if a CUDA error was detected.
int correlation_forward_cuda_kernel(at::Tensor& output,
    int ob,
    int oc,
    int oh,
    int ow,
    int osb,
    int osc,
    int osh,
    int osw,

    at::Tensor& input1,
    int ic,
    int ih,
    int iw,
    int isb,
    int isc,
    int ish,
    int isw,

    at::Tensor& input2,
    int gc,
    int gsb,
    int gsc,
    int gsh,
    int gsw,

    at::Tensor& rInput1,
    at::Tensor& rInput2,
    int pad_size,
    int kernel_size,
    int max_displacement,
    int stride1,
    int stride2,
    int corr_type_multiply,
    cudaStream_t stream);


// Declaration of the host-side launcher for the correlation backward pass.
// The visible tail of its definition returns 1 on success and 0 (after
// printing the CUDA error string) when cudaGetLastError() reports a failure.
// NOTE(review): as in the forward declaration, the integer groups appear to be
// extents and strides of the tensor they follow -- confirm against the caller.
int correlation_backward_cuda_kernel(   
    at::Tensor& gradOutput,
    int gob,
    int goc,
    int goh,
    int gow,
    int gosb,
    int gosc,
    int gosh,
    int gosw,

    at::Tensor& input1,
    int ic,
    int ih,
    int iw,
    int isb,
    int isc,
    int ish,
    int isw,

    at::Tensor& input2,
    int gsb,
    int gsc,
    int gsh,
    int gsw,

    at::Tensor& gradInput1, 
    int gisb,
    int gisc,
    int gish,
    int gisw,

    at::Tensor& gradInput2,
    int ggc,
    int ggsb,
    int ggsc,
    int ggsh,
    int ggsw,

    at::Tensor& rInput1,
    at::Tensor& rInput2,
    int pad_size,
    int kernel_size,
    int max_displacement,
    int stride1,
    int stride2,
    int corr_type_multiply,
    cudaStream_t stream);


================================================
FILE: dvs/flownet2/networks/correlation_package/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

# Flags passed to the host C++ compiler.
cxx_args = ['-std=c++11']

# Flags passed to nvcc: one -gencode pair per targeted architecture
# (sm_50, sm_52, sm_60, sm_61, sm_70).  The last entry uses code=compute_70
# instead of an sm_ target.
# NOTE(review): presumably that embeds PTX for GPUs newer than sm_70 -- confirm.
nvcc_args = [
    '-gencode', 'arch=compute_50,code=sm_50',
    '-gencode', 'arch=compute_52,code=sm_52',
    '-gencode', 'arch=compute_60,code=sm_60',
    '-gencode', 'arch=compute_61,code=sm_61',
    '-gencode', 'arch=compute_70,code=sm_70',
    '-gencode', 'arch=compute_70,code=compute_70'
]

# Build the `correlation_cuda` extension from the C++ binding file and the
# CUDA kernel file that sit next to this script.
setup(
    name='correlation_cuda',
    ext_modules=[
        CUDAExtension('correlation_cuda', [
            'correlation_cuda.cc',
            'correlation_cuda_kernel.cu'
        ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args})
    ],
    cmdclass={
        'build_ext': BuildExtension
    })


================================================
FILE: dvs/flownet2/networks/resample2d_package/__init__.py
================================================


================================================
FILE: dvs/flownet2/networks/resample2d_package/resample2d.py
================================================
from torch.nn.modules.module import Module
from torch.autograd import Function, Variable
import resample2d_cuda

class Resample2dFunction(Function):
    """Autograd function that delegates to the ``resample2d_cuda`` extension.

    ``forward`` allocates the output and calls ``resample2d_cuda.forward``;
    ``backward`` allocates both gradient buffers and calls
    ``resample2d_cuda.backward``.
    """

    @staticmethod
    def forward(ctx, input1, input2, kernel_size=1, bilinear= True):
        """Resample ``input1`` using ``input2``.

        Both tensors must be contiguous (asserted). The output takes its
        channel count from ``input1`` and its batch/height/width from
        ``input2``.
        """
        assert input1.is_contiguous()
        assert input2.is_contiguous()

        # Keep everything backward() reads.
        ctx.kernel_size = kernel_size
        ctx.bilinear = bilinear
        ctx.save_for_backward(input1, input2)

        _, channels, _, _ = input1.size()
        batch, _, height, width = input2.size()
        resampled = input1.new_zeros((batch, channels, height, width))

        resample2d_cuda.forward(input1, input2, resampled, kernel_size, bilinear)
        return resampled

    @staticmethod
    def backward(ctx, grad_output):
        """Return gradients for ``input1`` and ``input2``.

        ``kernel_size`` and ``bilinear`` get ``None`` gradients.
        """
        # .contiguous() guarantees the layout the extension is handed.
        grad_output = grad_output.contiguous()

        saved_input1, saved_input2 = ctx.saved_tensors

        # Both buffers are allocated from input1's tensor type.
        grad_input1 = Variable(saved_input1.new_zeros(saved_input1.size()))
        grad_input2 = Variable(saved_input1.new_zeros(saved_input2.size()))

        resample2d_cuda.backward(
            saved_input1,
            saved_input2,
            grad_output.data,
            grad_input1.data,
            grad_input2.data,
            ctx.kernel_size,
            ctx.bilinear,
        )

        return grad_input1, grad_input2, None, None

class Resample2d(Module):
    """Module wrapper around ``Resample2dFunction``.

    Args:
        kernel_size: forwarded unchanged to ``Resample2dFunction``.
        bilinear: forwarded unchanged to ``Resample2dFunction``.
    """

    def __init__(self, kernel_size=1, bilinear = True):
        super(Resample2d, self).__init__()
        self.kernel_size = kernel_size
        self.bilinear = bilinear

    def forward(self, input1, input2):
        """Apply the resampling function to ``input1`` using ``input2``.

        ``Resample2dFunction.forward`` asserts that *both* inputs are
        contiguous, so both are normalised here; ``.contiguous()`` is a no-op
        for tensors that already are.
        """
        input1_c = input1.contiguous()
        input2_c = input2.contiguous()
        return Resample2dFunction.apply(input1_c, input2_c, self.kernel_size, self.bilinear)


================================================
FILE: dvs/flownet2/networks/resample2d_package/resample2d_cuda.cc
================================================
#include <ATen/ATen.h>
#include <torch/torch.h>

#include "resample2d_kernel.cuh"

// Forward entry point exposed to Python (bound as "forward" below).
// Forwards all arguments to resample2d_kernel_forward and always returns 1;
// the launcher returns void, so no failure is reported through this value.
int resample2d_cuda_forward(
    at::Tensor& input1,
    at::Tensor& input2, 
    at::Tensor& output,
    int kernel_size, bool bilinear) {
      resample2d_kernel_forward(input1, input2, output, kernel_size, bilinear);
    return 1;
}

// Backward entry point exposed to Python (bound as "backward" below).
// Forwards all arguments to resample2d_kernel_backward, which fills
// gradInput1 and gradInput2, and always returns 1.
int resample2d_cuda_backward(
    at::Tensor& input1, 
    at::Tensor& input2,
    at::Tensor& gradOutput,
    at::Tensor& gradInput1, 
    at::Tensor& gradInput2, 
    int kernel_size, bool bilinear) {
        resample2d_kernel_backward(input1, input2, gradOutput, gradInput1, gradInput2, kernel_size, bilinear);
    return 1;
}


// Python bindings: exposes the two wrappers above as `forward` and `backward`
// on the extension module named by TORCH_EXTENSION_NAME.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &resample2d_cuda_forward, "Resample2D forward (CUDA)");
  m.def("backward", &resample2d_cuda_backward, "Resample2D backward (CUDA)");
}


================================================
FILE: dvs/flownet2/networks/resample2d_package/resample2d_kernel.cu
================================================
#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>

// Threads per block used by every kernel launch in this file.
#define CUDA_NUM_THREADS 512 
// NOTE(review): not referenced by the code in this file -- confirm before removing.
#define THREADS_PER_BLOCK 64 

// Accessors for the four components of a long4 size/stride value.
#define DIM0(TENSOR) ((TENSOR).x)
#define DIM1(TENSOR) ((TENSOR).y)
#define DIM2(TENSOR) ((TENSOR).z)
#define DIM3(TENSOR) ((TENSOR).w)

// Strided 4-D element access. Expects a variable named <TENSOR>_stride
// (a long4) to be in scope next to the pointer TENSOR.
#define DIM3_INDEX(TENSOR, xx, yy, zz, ww) ((TENSOR)[((xx) * (TENSOR##_stride.x)) + ((yy) * (TENSOR##_stride.y)) + ((zz) * (TENSOR##_stride.z)) + ((ww) * (TENSOR##_stride.w))])

// Forward resampling kernel. Each thread whose global index is below n
// produces one element of `output`.
//   - The linear index is split into (b, c, y, x) using output_size.
//   - Channels 0 and 1 of input2 at (b, y, x) are read as the x and y
//     displacements and added to the pixel coordinates.
//   - With `bilinear`, the four neighbouring input1 samples are blended using
//     the fractional parts of the displaced coordinates; otherwise the sample
//     at the rounded coordinate is copied.
// The result is written with the linear index (output_stride is not used).
template <typename scalar_t>
__global__ void kernel_resample2d_update_output(const int n, 
                                               const scalar_t* __restrict__ input1, const long4 input1_size, const long4 input1_stride,
                                               const scalar_t* __restrict__ input2, const long4 input2_size, const long4 input2_stride, 
                                               scalar_t* __restrict__ output, const long4 output_size, const long4 output_stride, int kernel_size, bool bilinear) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the output do nothing.
    if (index >= n) {
        return;
    }

    scalar_t val = 0.0f;

    int dim_b = DIM0(output_size);
    int dim_c = DIM1(output_size);
    int dim_h = DIM2(output_size);
    int dim_w = DIM3(output_size);
    int dim_chw = dim_c * dim_h * dim_w;
    int dim_hw  = dim_h * dim_w;

    // Decompose the linear index into batch / channel / row / column.
    int b = ( index / dim_chw ) % dim_b;
    int c = ( index / dim_hw )  % dim_c;
    int y = ( index / dim_w )   % dim_h;
    int x = ( index          )  % dim_w;

    scalar_t dx = DIM3_INDEX(input2, b, 0, y, x);
    scalar_t dy = DIM3_INDEX(input2, b, 1, y, x);

    // Displaced sampling position and its fractional parts.
    scalar_t xf = static_cast<scalar_t>(x) + dx;
    scalar_t yf = static_cast<scalar_t>(y) + dy;
    scalar_t alpha = xf - floor(xf); // alpha
    scalar_t beta = yf - floor(yf); // beta

    if (bilinear) {
        // Neighbour coordinates, clamped to [0, dim - 1] of the *output* extent.
        int xL = max(min( int (floor(xf)),    dim_w-1), 0);
        int xR = max(min( int (floor(xf)+1), dim_w -1), 0);
        int yT = max(min( int (floor(yf)),    dim_h-1), 0);
        int yB = max(min( int (floor(yf)+1),  dim_h-1), 0);

        // NOTE(review): the fy/fx offsets are added after clamping and are not
        // clamped again, so kernel_size > 1 can index past the last row/column.
        // Confirm callers only use kernel_size == 1.
        for (int fy = 0; fy < kernel_size; fy += 1) {
            for (int fx = 0; fx < kernel_size; fx += 1) {
                val += static_cast<float>((1. - alpha)*(1. - beta) * DIM3_INDEX(input1, b, c, yT + fy, xL + fx));
                val += static_cast<float>((alpha)*(1. - beta) * DIM3_INDEX(input1, b, c, yT + fy, xR + fx));
                val += static_cast<float>((1. - alpha)*(beta) * DIM3_INDEX(input1, b, c, yB + fy, xL + fx));
                val += static_cast<float>((alpha)*(beta) * DIM3_INDEX(input1, b, c, yB + fy, xR + fx));
            }
        }

        output[index] = val;
    }
    else {
        // Nearest sample: round the displaced position, then clamp.
        int xN = max(min( int (floor(xf + 0.5)), dim_w - 1), 0);
        int yN = max(min( int (floor(yf + 0.5)), dim_h - 1), 0);

        output[index] = static_cast<float> ( DIM3_INDEX(input1, b, c, yN, xN) );
    }

}


// Backward kernel for input1. Each thread whose global index is below n takes
// one gradOutput element (b, c, y, x) and adds it, weighted by the four
// bilinear weights, into the corresponding neighbours of gradInput.
// atomicAdd is used for the accumulation, since several gradOutput elements
// can land on the same gradInput location.
// The `bilinear` parameter is not read by this kernel.
template <typename scalar_t>
__global__ void kernel_resample2d_backward_input1(
    const int n, const scalar_t* __restrict__ input1, const long4 input1_size, const long4 input1_stride,
    const scalar_t* __restrict__ input2, const long4 input2_size, const long4 input2_stride,
    const scalar_t* __restrict__ gradOutput, const long4 gradOutput_size, const long4 gradOutput_stride,
    scalar_t* __restrict__ gradInput, const long4 gradInput_size, const long4 gradInput_stride, int kernel_size, bool bilinear) {

    int index = blockIdx.x * blockDim.x + threadIdx.x;

    if (index >= n) {
        return;
    }

    int dim_b = DIM0(gradOutput_size);
    int dim_c = DIM1(gradOutput_size);
    int dim_h = DIM2(gradOutput_size);
    int dim_w = DIM3(gradOutput_size);
    int dim_chw = dim_c * dim_h * dim_w;
    int dim_hw  = dim_h * dim_w;

    // Decompose the linear index over gradOutput into (b, c, y, x).
    int b = ( index / dim_chw ) % dim_b;
    int c = ( index / dim_hw )  % dim_c;
    int y = ( index / dim_w )   % dim_h;
    int x = ( index          )  % dim_w;

    scalar_t dx = DIM3_INDEX(input2, b, 0, y, x);
    scalar_t dy = DIM3_INDEX(input2, b, 1, y, x);

    scalar_t xf = static_cast<scalar_t>(x) + dx;
    scalar_t yf = static_cast<scalar_t>(y) + dy;
    // NOTE(review): the fractional part is taken with int() (truncation) here,
    // while the forward kernel uses floor(); the two differ for negative
    // coordinates. Confirm whether this is intentional.
    scalar_t alpha = xf - int(xf); // alpha
    scalar_t beta = yf - int(yf); // beta

    // Clamp against the input1 extent (the forward kernel clamps against the
    // output extent).
    int idim_h = DIM2(input1_size);
    int idim_w = DIM3(input1_size);

    int xL = max(min( int (floor(xf)),    idim_w-1), 0);
    int xR = max(min( int (floor(xf)+1), idim_w -1), 0);
    int yT = max(min( int (floor(yf)),    idim_h-1), 0);
    int yB = max(min( int (floor(yf)+1),  idim_h-1), 0);

    // NOTE(review): as in the forward kernel, fy/fx are added after clamping.
    for (int fy = 0; fy < kernel_size; fy += 1) {
        for (int fx = 0; fx < kernel_size; fx += 1) {
            atomicAdd(&DIM3_INDEX(gradInput, b, c, (yT + fy), (xL + fx)), (1-alpha)*(1-beta) * DIM3_INDEX(gradOutput, b, c, y, x));
            atomicAdd(&DIM3_INDEX(gradInput, b, c, (yT + fy), (xR + fx)),   (alpha)*(1-beta) * DIM3_INDEX(gradOutput, b, c, y, x));
            atomicAdd(&DIM3_INDEX(gradInput, b, c, (yB + fy), (xL + fx)),   (1-alpha)*(beta) * DIM3_INDEX(gradOutput, b, c, y, x));
            atomicAdd(&DIM3_INDEX(gradInput, b, c, (yB + fy), (xR + fx)),     (alpha)*(beta) * DIM3_INDEX(gradOutput, b, c, y, x));
        }
    }

}

// Backward kernel for input2 (the displacement tensor). Each thread whose
// global index is below n computes one element of gradInput, indexed over
// gradInput_size as (b, c, y, x).
//   - Odd c: accumulates gradOutput times the difference between bottom-row
//     and top-row input1 samples, blended across the two columns.
//   - Even c: accumulates gradOutput times the difference between right-column
//     and left-column samples, blended across the two rows.
// The sum runs over all gradOutput channels and over the kernel window.
// The `bilinear` parameter is not read by this kernel.
template <typename scalar_t>
__global__ void kernel_resample2d_backward_input2(
    const int n, const scalar_t* __restrict__ input1, const long4 input1_size, const long4 input1_stride,
    const scalar_t* __restrict__ input2, const long4 input2_size, const long4 input2_stride,
    const scalar_t* __restrict__ gradOutput, const long4 gradOutput_size, const long4 gradOutput_stride,
    scalar_t* __restrict__ gradInput, const long4 gradInput_size, const long4 gradInput_stride, int kernel_size, bool bilinear) {

    int index = blockIdx.x * blockDim.x + threadIdx.x;

    if (index >= n) {
        return;
    }

    scalar_t output = 0.0;
    int kernel_rad = (kernel_size - 1)/2;

    int dim_b = DIM0(gradInput_size);
    int dim_c = DIM1(gradInput_size);
    int dim_h = DIM2(gradInput_size);
    int dim_w = DIM3(gradInput_size);
    int dim_chw = dim_c * dim_h * dim_w;
    int dim_hw  = dim_h * dim_w;

    // Decompose the linear index over gradInput into (b, c, y, x).
    int b = ( index / dim_chw ) % dim_b;
    int c = ( index / dim_hw )  % dim_c;
    int y = ( index / dim_w )   % dim_h;
    int x = ( index          )  % dim_w;

    int odim_c = DIM1(gradOutput_size);

    scalar_t dx = DIM3_INDEX(input2, b, 0, y, x);
    scalar_t dy = DIM3_INDEX(input2, b, 1, y, x);

    scalar_t xf = static_cast<scalar_t>(x) + dx;
    scalar_t yf = static_cast<scalar_t>(y) + dy;

    // Neighbour coordinates, clamped against the gradInput (input2) extent.
    int xL = max(min( int (floor(xf)),    dim_w-1), 0);
    int xR = max(min( int (floor(xf)+1), dim_w -1), 0);
    int yT = max(min( int (floor(yf)),    dim_h-1), 0);
    int yB = max(min( int (floor(yf)+1),  dim_h-1), 0);
    
    // NOTE(review): in both branches the i/j window offsets are added to the
    // already-clamped coordinates without re-clamping.
    if (c % 2) {
        // Odd channel: vertical difference (bottom minus top), blended in x.
        float gamma = 1 - (xf - floor(xf)); // alpha
        for (int i = 0; i <= 2*kernel_rad; ++i) {
            for (int j = 0; j <= 2*kernel_rad; ++j) {
                for (int ch = 0; ch < odim_c; ++ch) {
                    output += (gamma) * DIM3_INDEX(gradOutput, b, ch, y, x) * DIM3_INDEX(input1, b, ch, (yB + j), (xL + i));
                    output -= (gamma) * DIM3_INDEX(gradOutput, b, ch, y, x) * DIM3_INDEX(input1, b, ch, (yT + j), (xL + i));
                    output += (1-gamma) * DIM3_INDEX(gradOutput, b, ch, y, x) * DIM3_INDEX(input1, b, ch, (yB + j), (xR + i));
                    output -= (1-gamma) * DIM3_INDEX(gradOutput, b, ch, y, x) * DIM3_INDEX(input1, b, ch, (yT + j), (xR + i));
                }
            }
        }
    }
    else {
        // Even channel: horizontal difference (right minus left), blended in y.
        float gamma = 1 - (yf - floor(yf)); // alpha
        for (int i = 0; i <= 2*kernel_rad; ++i) {
            for (int j = 0; j <= 2*kernel_rad; ++j) {
                for (int ch = 0; ch < odim_c; ++ch) {
                    output += (gamma) * DIM3_INDEX(gradOutput, b, ch, y, x) * DIM3_INDEX(input1, b, ch, (yT + j), (xR + i));
                    output -= (gamma) * DIM3_INDEX(gradOutput, b, ch, y, x) * DIM3_INDEX(input1, b, ch, (yT + j), (xL + i));
                    output += (1-gamma) * DIM3_INDEX(gradOutput, b, ch, y, x) * DIM3_INDEX(input1, b, ch, (yB + j), (xR + i));
                    output -= (1-gamma) * DIM3_INDEX(gradOutput, b, ch, y, x) * DIM3_INDEX(input1, b, ch, (yB + j), (xL + i));
                }
            }
        }

    }

    // Written with the linear index (gradInput_stride is not used).
    gradInput[index] = output;

}

// Host launcher for the forward resampling kernel.
//
// Launches kernel_resample2d_update_output<float> on the current CUDA stream
// with one thread per element of `output`. The launch is hard-wired to float;
// see the TODO below.
//
// Throws (via AT_CUDA_CHECK) if the kernel launch reports a CUDA error.
void resample2d_kernel_forward(
    at::Tensor& input1, 
    at::Tensor& input2,
    at::Tensor& output, 
    int kernel_size,
    bool bilinear) {

    int n = output.numel();

    // Nothing to compute for an empty output. Returning early also avoids a
    // zero-block grid, which is an invalid launch configuration.
    if (n == 0) {
        return;
    }

    const long4 input1_size = make_long4(input1.size(0), input1.size(1), input1.size(2), input1.size(3));
    const long4 input1_stride = make_long4(input1.stride(0), input1.stride(1), input1.stride(2), input1.stride(3));

    const long4 input2_size = make_long4(input2.size(0), input2.size(1), input2.size(2), input2.size(3));
    const long4 input2_stride = make_long4(input2.stride(0), input2.stride(1), input2.stride(2), input2.stride(3));

    const long4 output_size = make_long4(output.size(0), output.size(1), output.size(2), output.size(3));
    const long4 output_stride = make_long4(output.stride(0), output.stride(1), output.stride(2), output.stride(3));

    // Ceil-divide so the last, partially filled block is included.
    const int num_blocks = (n + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;

    // TODO: when atomicAdd gets resolved, change to AT_DISPATCH_FLOATING_TYPES_AND_HALF
    kernel_resample2d_update_output<float><<< num_blocks, CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream() >>>(
        n,
        input1.data<float>(),
        input1_size,
        input1_stride, 
        input2.data<float>(),
        input2_size,
        input2_stride,
        output.data<float>(),
        output_size,
        output_stride,
        kernel_size,
        bilinear);

    // Surface launch failures instead of silently dropping them.
    AT_CUDA_CHECK(cudaGetLastError());

}

// Host launcher for the two backward resampling kernels.
//
// 1. kernel_resample2d_backward_input1<float>: one thread per gradOutput
//    element, accumulating into gradInput1.
// 2. kernel_resample2d_backward_input2<float>: one thread per gradInput2
//    element.
// Both run on the current CUDA stream and are hard-wired to float.
//
// Throws (via AT_CUDA_CHECK) if either launch reports a CUDA error.
void resample2d_kernel_backward(
    at::Tensor& input1,
    at::Tensor& input2,
    at::Tensor& gradOutput,
    at::Tensor& gradInput1,
    at::Tensor& gradInput2,
    int kernel_size,
    bool bilinear) {

    int n = gradOutput.numel();

    const long4 input1_size = make_long4(input1.size(0), input1.size(1), input1.size(2), input1.size(3));
    const long4 input1_stride = make_long4(input1.stride(0), input1.stride(1), input1.stride(2), input1.stride(3));

    const long4 input2_size = make_long4(input2.size(0), input2.size(1), input2.size(2), input2.size(3));
    const long4 input2_stride = make_long4(input2.stride(0), input2.stride(1), input2.stride(2), input2.stride(3));

    const long4 gradOutput_size = make_long4(gradOutput.size(0), gradOutput.size(1), gradOutput.size(2), gradOutput.size(3));
    const long4 gradOutput_stride = make_long4(gradOutput.stride(0), gradOutput.stride(1), gradOutput.stride(2), gradOutput.stride(3));

    const long4 gradInput1_size = make_long4(gradInput1.size(0), gradInput1.size(1), gradInput1.size(2), gradInput1.size(3));
    const long4 gradInput1_stride = make_long4(gradInput1.stride(0), gradInput1.stride(1), gradInput1.stride(2), gradInput1.stride(3));

    // Skip the launch for an empty gradient: a zero-block grid is an invalid
    // launch configuration.
    if (n > 0) {
        const int num_blocks = (n + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;

        kernel_resample2d_backward_input1<float><<< num_blocks, CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream() >>>(
            n, 
            input1.data<float>(), 
            input1_size,
            input1_stride,
            input2.data<float>(),
            input2_size, 
            input2_stride,
            gradOutput.data<float>(),
            gradOutput_size,
            gradOutput_stride,
            gradInput1.data<float>(),
            gradInput1_size,
            gradInput1_stride, 
            kernel_size,
            bilinear
        );

        // Surface launch failures instead of silently dropping them.
        AT_CUDA_CHECK(cudaGetLastError());
    }

    const long4 gradInput2_size = make_long4(gradInput2.size(0), gradInput2.size(1), gradInput2.size(2), gradInput2.size(3));
    const long4 gradInput2_stride = make_long4(gradInput2.stride(0), gradInput2.stride(1), gradInput2.stride(2), gradInput2.stride(3));

    // The second kernel is indexed over gradInput2, not gradOutput.
    n = gradInput2.numel();

    if (n > 0) {
        const int num_blocks = (n + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;

        kernel_resample2d_backward_input2<float><<< num_blocks, CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream() >>>(
            n, 
            input1.data<float>(), 
            input1_size, 
            input1_stride,
            input2.data<float>(), 
            input2_size,
            input2_stride,
            gradOutput.data<float>(),
            gradOutput_size,
            gradOutput_stride,
            gradInput2.data<float>(),
            gradInput2_size,
            gradInput2_stride,
            kernel_size,
            bilinear
        );

        AT_CUDA_CHECK(cudaGetLastError());
    }

}


================================================
FILE: dvs/flownet2/networks/resample2d_package/resample2d_kernel.cuh
================================================
#pragma once

#include <ATen/ATen.h>

// Host launcher for the forward resampling kernel (defined in
// resample2d_kernel.cu); writes its result into `output`.
void resample2d_kernel_forward(
    at::Tensor& input1,
    at::Tensor& input2,
    at::Tensor& output,
    int kernel_size,
    bool bilinear);

// Host launcher for the backward resampling kernels (defined in
// resample2d_kernel.cu); writes into `gradInput1` and `gradInput2`.
void resample2d_kernel_backward(
    at::Tensor& input1,
    at::Tensor& input2,
    at::Tensor& gradOutput,
    at::Tensor& gradInput1, 
    at::Tensor& gradInput2, 
    int kernel_size,
    bool bilinear);

================================================
FILE: dvs/flownet2/networks/resample2d_package/setup.py
================================================
#!/usr/bin/env python3
import os
import torch

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

# Flags passed to the host C++ compiler.
cxx_args = ['-std=c++11']

# Flags passed to nvcc: one -gencode pair per targeted architecture
# (sm_50, sm_52, sm_60, sm_61, sm_70).  The last entry uses code=compute_70
# instead of an sm_ target.
# NOTE(review): presumably that embeds PTX for GPUs newer than sm_70 -- confirm.
nvcc_args = [
    '-gencode', 'arch=compute_50,code=sm_50',
    '-gencode', 'arch=compute_52,code=sm_52',
    '-gencode', 'arch=compute_60,code=sm_60',
    '-gencode', 'arch=compute_61,code=sm_61',
    '-gencode', 'arch=compute_70,code=sm_70',
    '-gencode', 'arch=compute_70,code=compute_70'
]

# Build the `resample2d_cuda` extension from the C++ binding file and the
# CUDA kernel file that sit next to this script.
setup(
    name='resample2d_cuda',
    ext_modules=[
        CUDAExtension('resample2d_cuda', [
            'resample2d_cuda.cc',
            'resample2d_kernel.cu'
        ], extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args})
    ],
    cmdclass={
        'build_ext': BuildExtension
    })


================================================
FILE: dvs/flownet2/networks/submodules.py
================================================
# freda (todo) : 

import torch.nn as nn
import torch
import numpy as np 

def conv(batchNorm, in_planes, out_planes, kernel_size=3, stride=1):
    """Build a Conv2d -> [BatchNorm2d] -> LeakyReLU(0.1) stack.

    The convolution pads by ``(kernel_size - 1) // 2``. With ``batchNorm`` set
    the convolution has no bias and a BatchNorm2d layer follows it; otherwise
    the convolution carries a bias and no normalisation layer is added.
    """
    layers = [
        nn.Conv2d(
            in_planes,
            out_planes,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            bias=not batchNorm,
        )
    ]
    if batchNorm:
        layers.append(nn.BatchNorm2d(out_planes))
    layers.append(nn.LeakyReLU(0.1, inplace=True))
    return nn.Sequential(*layers)

def i_conv(batchNorm, in_planes, out_planes, kernel_size=3, stride=1, bias = True):
    """Build a Conv2d -> [BatchNorm2d] stack with no activation.

    The convolution pads by ``(kernel_size - 1) // 2`` and uses ``bias`` as
    given in both variants; ``batchNorm`` only decides whether a BatchNorm2d
    layer is appended.
    """
    stages = [
        nn.Conv2d(
            in_planes,
            out_planes,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            bias=bias,
        )
    ]
    if batchNorm:
        stages.append(nn.BatchNorm2d(out_planes))
    return nn.Sequential(*stages)

def predict_flow(in_planes):
    """Return a 3x3, stride-1, padding-1 convolution with two output channels and a bias."""
    flow_head = nn.Conv2d(
        in_planes,
        2,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=True,
    )
    return flow_head

def deconv(in_planes, out_planes):
    """Build a ConvTranspose2d (kernel 4, stride 2, padding 1, with bias) followed by LeakyReLU(0.1)."""
    upsample = nn.ConvTranspose2d(
        in_planes,
        out_planes,
        kernel_size=4,
        stride=2,
        padding=1,
        bias=True,
    )
    activation = nn.LeakyReLU(0.1, inplace=True)
    return nn.Sequential(upsample, activation)

class tofp16(nn.Module):
    """Module that converts its input tensor to half precision."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        as_half = input.to(torch.float16)
        return as_half


class tofp32(nn.Module):
    """Module that converts its input tensor to single precision."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        as_float = input.to(torch.float32)
        return as_float


def init_deconv_bilinear(weight):
    """Fill every (out, in) slice of a 4-D weight with the same bilinear kernel.

    The kernel is the outer product of two 1-D profiles
    ``1 - |k / f - c|``, where ``f`` and ``c`` are derived from the kernel
    *width* (unchanged from the previous behaviour).

    The previous implementation wrote ``bilinear[x, y]`` into a
    ``(height, width)`` array, which raised ``IndexError`` for non-square
    kernels; the kernel is now built in ``[y, x]`` order. Results for square
    kernels are unchanged.

    Args:
        weight: tensor whose last two dimensions are (height, width); it is
            modified in place through ``weight.data``.
    """
    f_shape = weight.size()
    height, width = f_shape[-2], f_shape[-1]
    f = np.ceil(width / 2.0)
    c = (2 * f - 1 - f % 2) / (2.0 * f)

    row_profile = 1 - np.abs(np.arange(height) / f - c)
    col_profile = 1 - np.abs(np.arange(width) / f - c)
    bilinear = np.outer(row_profile, col_profile)  # indexed [y, x]

    # Broadcast the single (height, width) kernel over the leading dimensions.
    weight.data[...] = torch.from_numpy(bilinear)


def save_grad(grads, name):
    """Return a hook that stores the gradient it is called with in ``grads[name]``."""
    def _store(value):
        grads.update({name: value})
    return _store

'''
def save_grad(grads, name):
    def hook(grad):
        grads[name] = grad
    return hook
import torch
from channelnorm_package.modules.channelnorm import ChannelNorm 
model = ChannelNorm().cuda()
grads = {}
a = 100*torch.autograd.Variable(torch.randn((1,3,5,5)).cuda(), requires_grad=True)
a.register_hook(save_grad(grads, 'a'))
b = model(a)
y = torch.mean(b)
y.backward()

'''


================================================
FILE: dvs/flownet2/run.sh
================================================
#!/bin/bash
# Invoke main.py in inference mode with the FlowNet2 model on the dataset
# rooted at ./../video, resuming from ./FlowNet2_checkpoint.pth.tar.
# --save_flow and --inference_visualize are passed as well; what they do is
# defined by main.py.
python main.py --inference --model FlowNet2 --save_flow --inference_dataset Google \
	--inference_dataset_root ./../video \
	--resume ./FlowNet2_checkpoint.pth.tar \
	--inference_visualize


================================================
FILE: dvs/flownet2/run_release.sh
================================================
#!/bin/bash
# Invoke main.py in inference mode with the FlowNet2 model twice, using the
# same checkpoint and flags each time: first on ./../dataset_release/test,
# then on ./../dataset_release/training.

# Pass 1: test split.
python main.py --inference --model FlowNet2 --save_flow --inference_dataset Google \
	--inference_dataset_root ./../dataset_release/test \
	--resume ./FlowNet2_checkpoint.pth.tar \
	--inference_visualize

# Pass 2: training split.
python main.py --inference --model FlowNet2 --save_flow --inference_dataset Google \
	--inference_dataset_root ./../dataset_release/training \
	--resume ./FlowNet2_checkpoint.pth.tar \
	--inference_visualize

================================================
FILE: dvs/flownet2/utils/__init__.py
================================================


================================================
FILE: dvs/flownet2/utils/flow_utils.py
================================================
import numpy as np
import matplotlib.pyplot as plt
import os.path

# Header tag of a .flo file: writeFlow emits it first, and readFlow checks the
# same value (202021.25) before reading anything else.
TAG_CHAR = np.array([202021.25], np.float32)

def readFlow(fn):
    """Read a ``.flo`` optical-flow file in Middlebury format.

    The file layout is: one float32 tag (202021.25), two int32 values
    (width, height), then ``2 * width * height`` float32 values. All fields
    are read as little-endian, so the result does not depend on the host
    byte order.

    Args:
        fn: path of the file to read.

    Returns:
        A float32 array of shape ``(height, width, 2)``, or ``None`` (after
        printing a message) when the tag is missing or wrong, or the header
        is truncated.
    """
    # Code adapted from:
    # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy
    with open(fn, 'rb') as f:
        magic = np.fromfile(f, '<f4', count=1)
        # An empty file yields a zero-length array; check the size explicitly
        # instead of relying on the truth value of an array comparison.
        if magic.size != 1 or magic[0] != 202021.25:
            print('Magic number incorrect. Invalid .flo file')
            return None

        dims = np.fromfile(f, '<i4', count=2)
        if dims.size != 2:
            print('Truncated header. Invalid .flo file')
            return None
        w, h = int(dims[0]), int(dims[1])

        data = np.fromfile(f, '<f4', count=2 * w * h)

    # np.resize (rather than reshape) is kept from the original so that short
    # payloads still produce an array of the advertised shape.
    return np.resize(data.astype(np.float32, copy=False), (h, w, 2))

def writeFlow(filename,uv,v=None):
    """Write optical flow to a ``.flo`` file.

    If ``v`` is None, ``uv`` is assumed to contain both u and v channels,
    stacked in depth (shape ``(height, width, 2)``); otherwise ``uv`` is the
    u channel and ``v`` the v channel, both of shape ``(height, width)``.

    The file consists of ``TAG_CHAR``, the width and height as int32, and the
    flow as float32 with u and v interleaved per pixel.

    Original code by Deqing Sun, adapted from Daniel Scharstein.

    Raises:
        AssertionError: if the input shapes do not match the layout above.
    """
    nBands = 2

    if v is None:
        assert(uv.ndim == 3)
        assert(uv.shape[2] == 2)
        u = uv[:,:,0]
        v = uv[:,:,1]
    else:
        u = uv

    assert(u.shape == v.shape)
    height,width = u.shape

    # Interleave u and v column-wise: u0 v0 u1 v1 ...
    tmp = np.zeros((height, width*nBands))
    tmp[:, 0::2] = u
    tmp[:, 1::2] = v

    # The context manager closes the file even if a write fails.
    with open(filename,'wb') as f:
        # write the header
        f.write(TAG_CHAR)
        np.array(width).astype(np.int32).tofile(f)
        np.array(height).astype(np.int32).tofile(f)
        tmp.astype(np.float32).tofile(f)


# ref: https://github.com/sampepose/flownet2-tf/
# blob/18f87081db44939414fc4a48834f9e0da3e69f4c/src/flowlib.py#L240
def visulize_flow_file(flow_filename, save_dir=None):
    """Render a ``.flo`` file as a colour image and optionally save it.

    Args:
        flow_filename: path of the flow file to read.
        save_dir: when truthy, the image is written to
            ``<save_dir>/<basename without extension>-vis.png``.

    Nothing is written when ``readFlow`` rejects the file (it returns None
    after printing a message).
    """
    flow_data = readFlow(flow_filename)
    if flow_data is None:
        return
    img = flow2img(flow_data)
    # plt.imshow(img)
    # plt.show()
    if save_dir:
        # basename/splitext handle any path separator and extension length;
        # for "*.flo" paths the result matches the old rfind("/") / [:-4] slicing.
        stem = os.path.splitext(os.path.basename(flow_filename))[0]
        plt.imsave(os.path.join(save_dir, "%s-vis.png" % stem), img)


def flow2img(flow_data):
    """
    convert optical flow into color image

    Flow entries whose magnitude exceeds 1e7 in either channel are treated as
    unknown: they are zeroed before normalisation and rendered black. The flow
    is scaled by its largest magnitude before being colour-coded.

    The input array is not modified (the previous version zeroed unknown
    entries in the caller's array).

    :param flow_data: array of shape (height, width, 2), u in channel 0 and v in channel 1
    :return: color image, uint8 array of shape (height, width, 3)
    """
    # Work on copies so the caller's array is left untouched.
    u = flow_data[:, :, 0].copy()
    v = flow_data[:, :, 1].copy()

    UNKNOW_FLOW_THRESHOLD = 1e7
    idx_unknown = (abs(u) > UNKNOW_FLOW_THRESHOLD) | (abs(v) > UNKNOW_FLOW_THRESHOLD)
    u[idx_unknown] = 0
    v[idx_unknown] = 0

    rad = np.sqrt(u ** 2 + v ** 2)
    maxrad = max(-1, np.max(rad))
    # eps belongs in the denominator: it keeps an all-zero flow from dividing
    # by zero (the previous "u / maxrad + eps" produced NaNs in that case).
    scale = maxrad + np.finfo(float).eps
    u = u / scale
    v = v / scale

    img = compute_color(u, v)

    idx = np.repeat(idx_unknown[:, :, np.newaxis], 3, axis=2)
    img[idx] = 0

    return np.uint8(img)


def compute_color(u, v):
    """
    compute optical flow color map

    NaN entries of ``u`` / ``v`` are set to zero in place and come out black.
    The flow direction selects a colour by interpolating between neighbouring
    colour-wheel entries; the flow magnitude adjusts it.

    :param u: horizontal optical flow
    :param v: vertical optical flow
    :return: float array of shape (height, width, 3) holding values in 0..255
    """
    rows, cols = u.shape
    img = np.zeros((rows, cols, 3))

    nan_mask = np.isnan(u) | np.isnan(v)
    u[nan_mask] = 0
    v[nan_mask] = 0

    wheel = make_color_wheel()
    ncols = np.size(wheel, 0)

    magnitude = np.sqrt(u ** 2 + v ** 2)
    angle = np.arctan2(-v, -u) / np.pi

    # Fractional, 1-based position on the colour wheel.
    fk = (angle + 1) / 2 * (ncols - 1) + 1
    lower = np.floor(fk).astype(int)
    upper = lower + 1
    upper[upper == ncols + 1] = 1  # wrap past the last entry
    frac = fk - lower

    inside = magnitude <= 1
    outside = np.logical_not(inside)

    for channel in range(0, np.size(wheel, 1)):
        column = wheel[:, channel]
        first = column[lower - 1] / 255
        second = column[upper - 1] / 255
        blended = (1 - frac) * first + frac * second

        # Small magnitudes fade towards white; larger ones are dimmed.
        blended[inside] = 1 - magnitude[inside] * (1 - blended[inside])
        blended[outside] *= 0.75

        img[:, :, channel] = np.uint8(np.floor(255 * blended * (1 - nan_mask)))

    return img


def make_color_wheel():
    """
    Generate color wheel according Middlebury color code

    The wheel is built from six consecutive segments (RY, YG, GC, CB, BM, MR).
    In each segment one channel is held at 255 while another ramps up from 0
    or down from 255; the third channel stays 0.

    :return: Color wheel, float array of shape (55, 3)
    """
    RY, YG, GC, CB, BM, MR = 15, 6, 4, 11, 13, 6

    # (segment length, channel held at 255, ramping channel, ramp descends?)
    segments = (
        (RY, 0, 1, False),
        (YG, 1, 0, True),
        (GC, 1, 2, False),
        (CB, 2, 1, True),
        (BM, 2, 0, False),
        (MR, 0, 2, True),
    )

    ncols = RY + YG + GC + CB + BM + MR
    colorwheel = np.zeros([ncols, 3])

    start = 0
    for length, held, ramping, descending in segments:
        stop = start + length
        ramp = np.floor(255 * np.arange(0, length) / length)
        colorwheel[start:stop, held] = 255
        colorwheel[start:stop, ramping] = 255 - ramp if descending else ramp
        start = stop

    return colorwheel


================================================
FILE: dvs/flownet2/utils/frame_utils.py
================================================
import numpy as np
from os.path import *
from imageio import imread
from . import flow_utils 

def read_gen(file_name):
    """Load a file according to its extension (matched case-insensitively).

    * ``.png`` / ``.jpeg`` / ``.ppm`` / ``.jpg``: read with ``imread``; images
      with more than three channels are cut down to the first three, while
      2-D (single-channel) images are returned as they are.
    * ``.bin`` / ``.raw``: loaded with ``np.load``.
    * ``.flo``: read with ``flow_utils.readFlow`` and cast to float32.

    Returns:
        The loaded array, or an empty list for any other extension.
    """
    ext = splitext(file_name)[-1].lower()
    if ext in ('.png', '.jpeg', '.ppm', '.jpg'):
        im = imread(file_name)
        # Grayscale images have no third axis; indexing shape[2] on them
        # used to raise IndexError.
        if im.ndim >= 3 and im.shape[2] > 3:
            return im[:,:,:3]
        return im
    if ext in ('.bin', '.raw'):
        return np.load(file_name)
    if ext == '.flo':
        return flow_utils.readFlow(file_name).astype(np.float32)
    return []


================================================
FILE: dvs/flownet2/utils/param_utils.py
================================================
import torch
import torch.nn as nn
import numpy as np

def parse_flownetc(modules, weights, biases):
    """Copy named weight/bias arrays into the conv layers of ``modules``.

    The Conv2d / ConvTranspose2d modules are matched, in iteration order,
    against the fixed key list below; other modules are ignored. For
    ``conv1`` the weight is reversed along axis 1 before copying
    (NOTE(review): presumably a channel-order swap -- confirm).

    Layers created without a bias are supported: the bias copy is skipped for
    them, matching the other parsers in this module.

    Args:
        modules: iterable of modules to fill.
        weights: mapping from key to numpy weight array.
        biases: mapping from key to numpy bias array.

    Raises:
        IndexError: if there are more conv layers than keys.
        KeyError: if a key is missing from ``weights`` or ``biases``.
    """
    keys = [
    'conv1',
    'conv2',
    'conv3',
    'conv_redir',
    'conv3_1',
    'conv4',
    'conv4_1',
    'conv5',
    'conv5_1',
    'conv6',
    'conv6_1',
    
    'deconv5',
    'deconv4',
    'deconv3',
    'deconv2',
    
    'Convolution1',
    'Convolution2',
    'Convolution3',
    'Convolution4',
    'Convolution5',

    'upsample_flow6to5',
    'upsample_flow5to4',
    'upsample_flow4to3',
    'upsample_flow3to2',
    
    ]
    i = 0
    for m in modules:
        if not isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
            continue

        key = keys[i]
        weight = weights[key].copy()
        bias = biases[key].copy()
        if key == 'conv1':
            # np.flip returns a view with negative strides, which
            # torch.from_numpy rejects; hence the extra copy.
            weight = np.flip(weight, axis=1).copy()

        m.weight.data[:,:,:,:] = torch.from_numpy(weight)
        if m.bias is not None:
            m.bias.data[:] = torch.from_numpy(bias)

        i = i + 1
    return

def parse_flownets(modules, weights, biases, param_prefix='net2_'):
    """Copy named weight/bias arrays into the conv layers of ``modules``.

    Keys are the layer names below with ``param_prefix`` prepended (twice for
    the ``upsample_*`` names). Conv2d / ConvTranspose2d modules are matched
    against the keys in iteration order; other modules are ignored.

    For the first convolution, input channels 0-2, 3-5 and 6-8 are each
    reversed along axis 1 and channels 9 onward are copied unchanged. Biases
    are copied only into layers that have one.
    """
    layer_names = [
        'conv1', 'conv2', 'conv3', 'conv3_1', 'conv4', 'conv4_1',
        'conv5', 'conv5_1', 'conv6', 'conv6_1',
        'deconv5', 'deconv4', 'deconv3', 'deconv2',
        'predict_conv6', 'predict_conv5', 'predict_conv4',
        'predict_conv3', 'predict_conv2',
        'upsample_flow6to5', 'upsample_flow5to4',
        'upsample_flow4to3', 'upsample_flow3to2',
    ]
    keys = [
        param_prefix + param_prefix + name if 'upsample' in name else param_prefix + name
        for name in layer_names
    ]
    first_conv_key = param_prefix + 'conv1'

    position = 0
    for m in modules:
        if not isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
            continue

        key = keys[position]
        position += 1
        src_weight = weights[key].copy()
        src_bias = biases[key].copy()

        if key == first_conv_key:
            # Reverse each leading group of three input channels.
            for lo in (0, 3, 6):
                hi = lo + 3
                reversed_group = np.flip(src_weight[:, lo:hi, :, :], axis=1).copy()
                m.weight.data[:, lo:hi, :, :] = torch.from_numpy(reversed_group)
            m.weight.data[:, 9:, :, :] = torch.from_numpy(src_weight[:, 9:, :, :].copy())
        else:
            m.weight.data[:, :, :, :] = torch.from_numpy(src_weight)

        if m.bias is not None:
            m.bias.data[:] = torch.from_numpy(src_bias)
    return

def parse_flownetsonly(modules, weights, biases, param_prefix=''):
    """Copy converted parameters into the conv layers of a standalone FlowNetS.

    The n-th Conv2d/ConvTranspose2d found while iterating `modules` is filled
    from the n-th name in the table below, looked up in the `weights` and
    `biases` mappings (name -> numpy array).
    """
    layer_names = (
        'conv1', 'conv2', 'conv3', 'conv3_1', 'conv4', 'conv4_1',
        'conv5', 'conv5_1', 'conv6', 'conv6_1',
        'deconv5', 'deconv4', 'deconv3', 'deconv2',
        'Convolution1', 'Convolution2', 'Convolution3',
        'Convolution4', 'Convolution5',
        'upsample_flow6to5', 'upsample_flow5to4',
        'upsample_flow4to3', 'upsample_flow3to2',
    )
    # Upsample layers are stored with the prefix applied twice.
    param_keys = [
        (param_prefix + param_prefix if 'upsample' in name else param_prefix) + name
        for name in layer_names
    ]
    first_conv_key = param_prefix + 'conv1'

    conv_layers = (
        layer for layer in modules
        if isinstance(layer, (nn.Conv2d, nn.ConvTranspose2d))
    )
    for position, layer in enumerate(conv_layers):
        key = param_keys[position]
        kernel = weights[key].copy()
        offset = biases[key].copy()
        if key == first_conv_key:
            # Reverse channel order within each of the two 3-channel groups
            # (presumably BGR -> RGB for the two input frames; confirm).
            for start in (0, 3):
                stop = start + 3
                flipped = np.flip(kernel[:, start:stop, :, :], axis=1).copy()
                layer.weight.data[:, start:stop, :, :] = torch.from_numpy(flipped)
        else:
            layer.weight.data[:, :, :, :] = torch.from_numpy(kernel)
        if layer.bias is not None:
            layer.bias.data[:] = torch.from_numpy(offset)
    return

def parse_flownetsd(modules, weights, biases, param_prefix='netsd_'):
    """Copy converted parameters into the conv layers of a FlowNetSD network.

    Conv2d/ConvTranspose2d modules are matched, in iteration order, with the
    layer names below (each prefixed by `param_prefix`) and filled from the
    `weights` / `biases` mappings of numpy arrays.
    """
    layer_names = (
        'conv0', 'conv1', 'conv1_1', 'conv2', 'conv2_1', 'conv3', 'conv3_1',
        'conv4', 'conv4_1', 'conv5', 'conv5_1', 'conv6', 'conv6_1',
        'deconv5', 'deconv4', 'deconv3', 'deconv2',
        'interconv5', 'interconv4', 'interconv3', 'interconv2',
        'Convolution1', 'Convolution2', 'Convolution3',
        'Convolution4', 'Convolution5',
        'upsample_flow6to5', 'upsample_flow5to4',
        'upsample_flow4to3', 'upsample_flow3to2',
    )
    param_keys = [param_prefix + name for name in layer_names]
    first_conv_key = param_prefix + 'conv0'

    conv_layers = (
        layer for layer in modules
        if isinstance(layer, (nn.Conv2d, nn.ConvTranspose2d))
    )
    for position, layer in enumerate(conv_layers):
        key = param_keys[position]
        kernel = weights[key].copy()
        offset = biases[key].copy()
        if key == first_conv_key:
            # Reverse channel order within each of the two 3-channel groups
            # (presumably BGR -> RGB for the two input frames; confirm).
            for start in (0, 3):
                stop = start + 3
                flipped = np.flip(kernel[:, start:stop, :, :], axis=1).copy()
                layer.weight.data[:, start:stop, :, :] = torch.from_numpy(flipped)
        else:
            layer.weight.data[:, :, :, :] = torch.from_numpy(kernel)
        if layer.bias is not None:
            layer.bias.data[:] = torch.from_numpy(offset)

    return

def parse_flownetfusion(modules, weights, biases, param_prefix='fuse_'):
    """Copy converted parameters into the conv layers of a FlowNetFusion network.

    Conv2d/ConvTranspose2d modules are matched, in iteration order, with the
    layer names below (each prefixed by `param_prefix`) and filled from the
    `weights` / `biases` mappings of numpy arrays.
    """
    layer_names = (
        'conv0', 'conv1', 'conv1_1', 'conv2', 'conv2_1',
        'deconv1', 'deconv0',
        'interconv1', 'interconv0',
        '_Convolution5', '_Convolution6', '_Convolution7',
        'upsample_flow2to1', 'upsample_flow1to0',
    )
    param_keys = [param_prefix + name for name in layer_names]
    first_conv_key = param_prefix + 'conv0'

    conv_layers = (
        layer for layer in modules
        if isinstance(layer, (nn.Conv2d, nn.ConvTranspose2d))
    )
    for position, layer in enumerate(conv_layers):
        key = param_keys[position]
        kernel = weights[key].copy()
        offset = biases[key].copy()
        if key == first_conv_key:
            # Only the first three input channels are reversed (presumably the
            # BGR image; confirm); everything after them is copied unchanged.
            flipped = np.flip(kernel[:, 0:3, :, :], axis=1).copy()
            layer.weight.data[:, 0:3, :, :] = torch.from_numpy(flipped)
            layer.weight.data[:, 3:, :, :] = torch.from_numpy(kernel[:, 3:, :, :].copy())
        else:
            layer.weight.data[:, :, :, :] = torch.from_numpy(kernel)
        if layer.bias is not None:
            layer.bias.data[:] = torch.from_numpy(offset)

    return


================================================
FILE: dvs/flownet2/utils/tools.py
================================================
# freda (todo) : 

import os, time, sys, math
import subprocess, shutil
from os.path import *
import numpy as np
from inspect import isclass
from pytz import timezone
from datetime import datetime
import inspect
import torch

def datestr():
    """Return the current US/Pacific time as a 'YYYYMMDD_HHMM'-style string."""
    stamp = datetime.now(timezone('US/Pacific'))
    return '{0.year}{0.month:02}{0.day:02}_{0.hour:02}{0.minute:02}'.format(stamp)

def module_to_dict(module, exclude=[]):
    """Map name -> class for every class attribute exposed by `module`.

    An entry is dropped when either its name or the class object itself
    appears in `exclude`.
    """
    found = {}
    for attr_name in dir(module):
        candidate = getattr(module, attr_name)
        if not isclass(candidate):
            continue
        if attr_name in exclude or candidate in exclude:
            continue
        found[attr_name] = candidate
    return found

class TimerBlock:
    """Context manager that prints a title and reports elapsed time.

    Usage: ``with TimerBlock("title") as block: block.log("step done")``.
    After the block exits, `start`, `end` and `interval` (seconds) are set.

    Timing uses `time.perf_counter()`; `time.clock()` was removed in
    Python 3.8 and would raise AttributeError on entry.
    """

    def __init__(self, title):
        print(("{}".format(title)))

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.end = time.perf_counter()
        self.interval = self.end - self.start

        # Returning None lets any exception from the block propagate.
        if exc_type is not None:
            self.log("Operation failed\n")
        else:
            self.log("Operation finished\n")

    def log(self, string):
        """Print `string` prefixed with the time elapsed since entry."""
        duration = time.perf_counter() - self.start
        units = 's'
        if duration > 60:
            duration = duration / 60.
            units = 'm'
        print(("  [{:.3f}{}] {}".format(duration, units, string)))

    def log2file(self, fid, string):
        """Append `string` plus a newline to the file at path `fid`."""
        # `with` guarantees the handle is closed even if the write fails.
        with open(fid, 'a') as handle:
            handle.write("%s\n" % (string))

def add_arguments_for_module(parser, module, argument_for_class, default, skip_params=[], parameter_defaults={}):
    """Expose a class choice and its constructor defaults as CLI options.

    Adds ``--<argument_for_class>`` (choices: every class found in `module`),
    parses the known arguments to see which class was selected, then adds one
    ``--<argument_for_class>_<param>`` option per constructor parameter that
    has a default (either in `parameter_defaults` or in the signature).

    Args:
        parser: argparse parser to extend; `parse_known_args()` is called on it.
        module: module whose classes are the valid choices.
        argument_for_class: option name / prefix, e.g. 'model'.
        default: default class name.
        skip_params: constructor parameter names to ignore (not modified).
        parameter_defaults: overrides for constructor defaults (not modified).
    """
    argument_group = parser.add_argument_group(argument_for_class.capitalize())

    module_dict = module_to_dict(module)
    argument_group.add_argument('--' + argument_for_class, type=str, default=default, choices=list(module_dict.keys()))

    args, unknown_args = parser.parse_known_args()
    class_obj = module_dict[vars(args)[argument_for_class]]

    # inspect.getargspec() was removed in Python 3.11; getfullargspec()
    # exposes the same `.args` / `.defaults` fields.
    argspec = inspect.getfullargspec(class_obj.__init__)

    # Both sequences are reversed so index i pairs the i-th parameter from the
    # end with the i-th default from the end.
    defaults = argspec.defaults[::-1] if argspec.defaults else None
    reversed_params = argspec.args[::-1]

    ignored = skip_params + ['self', 'args']
    for i, arg in enumerate(reversed_params):
        if arg in ignored:
            continue
        cmd_arg = '{}_{}'.format(argument_for_class, arg)
        if arg in parameter_defaults:
            argument_group.add_argument('--{}'.format(cmd_arg), type=type(parameter_defaults[arg]), default=parameter_defaults[arg])
        elif defaults is not None and i < len(defaults):
            argument_group.add_argument('--{}'.format(cmd_arg), type=type(defaults[i]), default=defaults[i])
        else:
            # Report the selected class; `module.__class__.__name__` is always
            # the literal string "module".
            print(("[Warning]: non-default argument '{}' detected on class '{}'. This argument cannot be modified via the command line"
                    .format(arg, class_obj.__name__)))
        # We don't have a good way of dealing with inferring the type of the argument
        # TODO: try creating a custom action and using ast's infer type?

def kwargs_from_args(args, argument_for_class):
    """Collect the ``<argument_for_class>_*`` options of `args` as kwargs.

    Returns a dict whose keys are the option names with the
    ``<argument_for_class>_`` prefix removed. The ``<argument_for_class>_class``
    entry is skipped.

    Only names that *start* with the prefix are selected. A plain substring
    test would also pick up unrelated options such as ``my_model_y`` for
    prefix ``model_`` and slice them into nonsense keys.
    """
    prefix = argument_for_class + '_'
    class_key = prefix + 'class'
    return {
        key[len(prefix):]: value
        for key, value in vars(args).items()
        if key.startswith(prefix) and key != class_key
    }

def format_dictionary_of_losses(labels, values):
    """Render paired labels/values as 'name: value, name: value, ...'.

    Values >= 0.001 use three decimals, smaller ones use scientific notation.
    If a value cannot be compared or formatted, the pairs are printed and a
    '[Log Error] ...' string is returned instead.
    """
    def render(name, value):
        spec = '.3f' if value >= 0.001 else '.1e'
        return ('{}: {:' + spec + '}').format(name, value)

    try:
        pieces = [render(name, value) for name, value in zip(labels, values)]
        return ', '.join(pieces)
    except (TypeError, ValueError) as e:
        print((list(zip(labels, values))))
        return '[Log Error] ' + str(e)


class IteratorTimer():
    """Iterator wrapper that records how long each `next()` call took.

    The wall-clock duration of the most recent fetch is stored in
    `last_duration` (seconds).
    """

    def __init__(self, iterable):
        self.iterable = iterable
        self.iterator = iterable.__iter__()

    def __len__(self):
        return len(self.iterable)

    def __iter__(self):
        return self

    def __next__(self):
        began = time.time()
        item = next(self.iterator)
        self.last_duration = time.time() - began
        return item

    # Python 2 style alias.
    next = __next__

def gpumemusage():
    """Summarise GPU memory use reported by `nvidia-smi`.

    Runs a shell pipeline over `nvidia-smi` output and returns one
    '<percent>%--<used>GB/<total>GB ' entry per used/total pair found.
    Raises `subprocess.CalledProcessError` if the pipeline exits non-zero.
    """
    raw = subprocess.check_output("nvidia-smi | grep MiB | cut -f 3 -d '|'", shell=True)
    # check_output returns bytes on Python 3; the str-based clean-up below
    # needs text.
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8', 'replace')
    # 'MiB' -> 'MB' so that splitting on 'MB' isolates the numbers.
    cleaned = raw.replace(' ', '').replace('\n', '').replace('i', '')
    all_stat = [float(a) for a in cleaned.replace('/', '').split('MB')[:-1]]

    entries = []
    # Integer division: range() rejects the float that `/` yields on Python 3.
    for i in range(len(all_stat) // 2):
        curr, tot = all_stat[2 * i], all_stat[2 * i + 1]
        util = "%1.2f" % (100 * curr / tot) + '%'
        cmem = str(int(math.ceil(curr / 1024.))) + 'GB'
        gmem = str(int(math.ceil(tot / 1024.))) + 'GB'
        # Equivalent to the POSIX os.path.join(cmem, gmem) used before,
        # written out explicitly.
        entries.append(util + '--' + cmem + '/' + gmem + ' ')
    return ''.join(entries)


def update_hyperparameter_schedule(args, epoch, global_iteration, optimizer):
    """Divide every param group's lr on scheduled iterations.

    When `args.schedule_lr_frequency` is positive and `global_iteration + 1`
    is a multiple of it, each learning rate is divided by
    `args.schedule_lr_fraction` and floored at 1e-6. `epoch` is unused.
    """
    frequency = args.schedule_lr_frequency
    if not frequency > 0:
        return

    groups = optimizer.param_groups
    if (global_iteration + 1) % frequency != 0:
        return

    for group in groups:
        scaled = group['lr'] / float(args.schedule_lr_fraction)
        group['lr'] = scaled
        group['lr'] = float(np.maximum(scaled, 0.000001))

def save_checkpoint(state, is_best, path, prefix, filename='checkpoint.pth.tar'):
    """Serialize `state` to '<path>/<prefix>_<filename>'.

    When `is_best` is truthy the file is also copied to
    '<path>/<prefix>_model_best.pth.tar'.
    """
    base = os.path.join(path, prefix)
    target = base + '_' + filename
    torch.save(state, target)
    if not is_best:
        return
    shutil.copyfile(target, base + '_model_best.pth.tar')


================================================
FILE: dvs/gyro/__init__.py
================================================
from .gyro_function import (
    GetGyroAtTimeStamp,
    QuaternionProduct,
    QuaternionReciprocal,
    ConvertQuaternionToAxisAngle,
    FindOISAtTimeStamp,
    GetMetadata,
    GetProjections,
    GetVirtualProjection,
    GetForwardGrid,
    CenterZoom,
    GetWarpingFlow,
    torch_norm_quat,
    torch_QuaternionProduct, 
    torch_QuaternionReciprocal,
    torch_GetVirtualProjection,
    get_static,
    torch_GetForwardGrid,
    torch_GetWarpingFlow,
    train_GetGyroAtTimeStamp,
    train_ConvertQuaternionToAxisAngle,
    ConvertAxisAngleToQuaternion,
    torch_ConvertAxisAngleToQuaternion,
    torch_ConvertQuaternionToAxisAngle,
    ConvertAxisAngleToQuaternion_no_angle,
    ConvertQuaternionToAxisAngle_no_angle,
    torch_GetHomographyTransformFromProjections,
    torch_ApplyTransform,
    norm_quat,
    SlerpWithDefault
    )
from .gyro_io import (
    LoadGyroData, 
    LoadOISData, 
    LoadFrameData, 
    LoadStabResult,
    get_grid, 
    get_rotations, 
    visual_rotation
    )

================================================
FILE: dvs/gyro/gyro_function.py
================================================
import numpy as np
from numpy import linalg as LA
import matplotlib.pyplot as plt
import torch
from torch.autograd import Variable

def get_static(height = 1080, width = 1920, ratio = 0.1):
    """Build the dictionary of fixed sensor / grid / frame settings.

    `height` and `width` become the frame size entries. `ratio` is accepted
    but not used: the cropping ratio is fixed at 0.0.
    """
    return {
        "active_array_width": 4032,
        "active_array_height": 3024,
        "crop_window_width": 4032,
        "crop_window_height": 2272,
        "num_grid_rows": 12,
        "num_grid_cols": 12,
        "dim_homography": 9,
        "width": width,    # frame width
        "height": height,  # frame height
        # normalized cropping ratio at each side
        "cropping_ratio": 0.0,
    }

# Quaternion: [x, y, z, w]

def norm_quat(quat):
    """Scale `quat` to unit length; return [0, 0, 0, 1] if it is near zero."""
    magnitude = LA.norm(quat)
    if not magnitude > 1e-6:
        # Too short to normalize reliably: use the identity rotation.
        return np.array([0,0,0,1])
    return quat / magnitude

def torch_norm_quat(quat, USE_CUDA = True):
    """Normalize each row of `quat` independently.

    Rows whose norm is not above 1e-6 become [0, 0, 0, w / w] instead.
    The result is a freshly allocated (rows, 4) tensor, moved to CUDA when
    `USE_CUDA` is True.
    """
    n_rows = quat.size()[0]
    normalized = Variable(torch.zeros((n_rows, 4), requires_grad=True))
    if USE_CUDA == True:
        normalized = normalized.cuda()

    for row in range(n_rows):
        current = quat[row]
        length = torch.norm(current)
        if length > 1e-6:
            normalized[row] = current / length
            continue
        # Degenerate row: zero the vector part, set the scalar part to w / w.
        normalized[row, :3] = current[:3] * 0
        normalized[row, 3] = current[3] / current[3]

    return normalized

def ConvertAxisAngleToQuaternion(axis, angle):
    """Return the normalized [x, y, z, w] quaternion for a rotation of
    `angle` about `axis`.

    The axis is normalized first only when both its length and the angle
    exceed 1e-6.
    """
    axis_length = LA.norm(axis)
    if axis_length > 1e-6 and angle > 1e-6:
        axis = axis / axis_length
    half = angle * 0.5
    s = np.sin(half)
    components = [s * axis[k] for k in range(3)]
    components.append(np.cos(half))
    return norm_quat(np.array(components))

def ConvertAxisAngleToQuaternion_no_angle(axis):
    """Convert a rotation vector to a normalized [x, y, z, w] quaternion.

    The rotation angle is the length of `axis`; the direction is `axis`
    normalized (left untouched when its length is not above 1e-6).
    """
    angle = LA.norm(axis)
    if angle > 1e-6:
        axis = axis / angle
    half = angle * 0.5
    s = np.sin(half)
    components = [s * axis[k] for k in range(3)]
    components.append(np.cos(half))
    return norm_quat(np.array(components))

def torch_ConvertAxisAngleToQuaternion(axis, USE_CUDA = True):
    """Batched rotation-vector -> [x, y, z, w] quaternion conversion.

    Only the first three columns of `axis` are used; their per-row norm is
    the rotation angle. Returns a (batch, 4) tensor normalized by
    `torch_norm_quat`.

    Args:
        axis: tensor with at least three columns, one rotation vector per row.
        USE_CUDA: allocate the result on the GPU; also forwarded to
            `torch_norm_quat` so CPU-only use works.
    """
    batch_size = axis.size()[0]
    rot_vec = axis[:, :3]

    angle = torch.norm(rot_vec, dim = 1)
    half_angle = angle * 0.5

    # Multiplying by 1 makes a copy so rows can be overwritten in place.
    norm_axis = rot_vec * 1
    for i in range(batch_size):
        # Rows with a (near-)zero angle keep their raw values; they are
        # multiplied by sin(~0) below.
        if angle[i] > 1e-6:
            norm_axis[i] = rot_vec[i] / angle[i]

    quats = Variable(torch.zeros((batch_size, 4), requires_grad=True))
    if USE_CUDA:
        quats = quats.cuda()
    # unsqueeze gives (batch, 1) so the sine scales each row; a bare (batch,)
    # vector only broadcasts against (batch, 3) when batch is 1 (or, wrongly
    # along columns, 3).
    quats[:, :3] = torch.sin(half_angle).unsqueeze(1) * norm_axis
    quats[:, 3] = torch.cos(half_angle)
    return torch_norm_quat(quats, USE_CUDA)

def ConvertQuaternionToAxisAngle(quat):
    """Split an [x, y, z, w] quaternion into `[axis, angle]`.

    The quaternion is normalized first. If its vector part is shorter than
    1e-6 the result is a zero axis with angle 0.
    """
    unit = quat / LA.norm(quat)
    vec_length = LA.norm(unit[0:3])
    axis = np.array([0.0, 0.0, 0.0])
    if vec_length < 1e-6:
        return [axis, 0]

    inv_length = 1/vec_length
    for k in range(3):
        axis[k] = unit[k] * inv_length
    return [axis, 2 * np.arccos(unit[3])]

def ConvertQuaternionToAxisAngle_no_angle(quat):
    """Convert an [x, y, z, w] quaternion to a rotation vector.

    The returned 3-vector points along the rotation axis and its length is
    the rotation angle; it is all zeros when the vector part of the
    normalized quaternion is not longer than 1e-6.
    """
    unit = quat / LA.norm(quat)
    vec_length = LA.norm(unit[0:3])
    rot_vec = np.array([0.0, 0.0, 0.0])
    if not vec_length > 1e-6:
        return rot_vec

    scale = 1 / vec_length * 2 * np.arccos(unit[3])
    for k in range(3):
        rot_vec[k] = unit[k] * scale
    return rot_vec

def torch_ConvertQuaternionToAxisAngle(quat, USE_CUDA = True):
    """Batched [x, y, z, w] quaternion -> rotation-vector conversion.

    Returns a (batch, 4) tensor: columns 0..2 hold axis * angle, column 3
    stays zero. Rows whose vector part is not longer than 1e-6 are all zero.
    Input rows are not normalized here.
    """
    n_rows = quat.size()[0]
    axis_angle = Variable(torch.zeros((n_rows, 4), requires_grad=True))
    if USE_CUDA:
        axis_angle = axis_angle.cuda()

    for row in range(n_rows):
        vec_length = torch.norm(quat[row, 0:3])
        if not vec_length > 1e-6:
            continue
        scale = 1/vec_length  * 2 * torch.acos(quat[row,3])
        for col in range(3):
            axis_angle[row, col] = quat[row, col] * scale
    return axis_angle

def train_ConvertQuaternionToAxisAngle(quat):
    """Return a length-4 array: the rotation vector of `quat` followed by 0."""
    padded = np.zeros(4)
    padded[0:3] = ConvertQuaternionToAxisAngle_no_angle(quat)
    return padded

def AngularVelocityToQuat(angular_v, dt):
    """Quaternion for rotating at `angular_v` for a duration `dt`.

    The rotation angle is |angular_v| * dt about the normalized velocity.
    When the speed is below 1e-6 the axis falls back to [1, 0, 0] and
    'bad length' is printed.
    """
    speed = LA.norm(angular_v)
    if speed < 1e-6:
        direction = np.array([1, 0, 0])
        print('bad length')
    else:
        direction = angular_v/speed
    return ConvertAxisAngleToQuaternion(direction, speed*dt)

def QuaternionProduct(q1, q2):
    """Return the normalized Hamilton product q1 * q2 ([x, y, z, w] layout)."""
    x1, y1, z1, w1 = q1[0], q1[1], q1[2], q1[3]
    x2, y2, z2, w2 = q2[0], q2[1], q2[2], q2[3]

    product = np.zeros(4)
    product[:] = (
        w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2,
        w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2,
        w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2,
        w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2,
    )
    return norm_quat(product)

def torch_QuaternionProduct(q1, q2, USE_CUDA = True):
    """Batched, normalized Hamilton product q1 * q2 ([x, y, z, w] per row).

    Args:
        q1, q2: tensors indexed as [row, component] with at least 4 columns.
        USE_CUDA: allocate the result on the GPU. Also forwarded to
            `torch_norm_quat`, which otherwise defaults to CUDA regardless of
            this flag.

    Returns:
        (batch, 4) tensor of normalized products.
    """
    x1, y1, z1, w1 = q1[:,0], q1[:,1], q1[:,2], q1[:,3]
    x2, y2, z2, w2 = q2[:,0], q2[:,1], q2[:,2], q2[:,3]

    batch_size = q1.size()[0]
    quat = Variable(torch.zeros((batch_size, 4), requires_grad=True))
    if USE_CUDA == True:
        quat = quat.cuda()

    quat[:,3] = w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2
    quat[:,0] = w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2
    quat[:,1] = w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2
    quat[:,2] = w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2

    return torch_norm_quat(quat, USE_CUDA)

def ProcessGyroRotation(gyro_data):
    """Integrate gyro samples into one orientation quaternion per sample.

    Column 0 of `gyro_data` is read as a timestamp (scaled by 1e-9, so
    presumably nanoseconds) and columns 1..3 as angular velocity. The first
    orientation is the identity [0, 0, 0, 1].
    """
    count = np.shape(gyro_data)[0]
    quats = np.zeros((count, 4))
    quats[0,:] = np.array([0, 0, 0, 1])
    for k in range(1, count):
        elapsed = (gyro_data[k, 0] - gyro_data[k-1, 0])*1e-9
        step = AngularVelocityToQuat(gyro_data[k, 1:4], elapsed)
        # R_t = delta R_t * R_t-1
        composed = QuaternionProduct(step, quats[k-1,:])
        quats[k,:] = composed / LA.norm(composed)
    return quats

def QuaternionReciprocal(q):
    """Return the normalized conjugate of `q` (vector part negated)."""
    conjugate = np.array([-q[0], -q[1], -q[2], q[3]])
    return norm_quat(conjugate)

def torch_QuaternionReciprocal(q,  USE_CUDA = True):
    """Batched normalized conjugate: negate columns 0..2, keep the rest.

    `USE_CUDA` is forwarded to `torch_norm_quat`; previously it was ignored
    and normalization always allocated its output on the GPU.
    """
    conjugate = torch.cat((-q[:,0:1], -q[:,1:2], -q[:,2:3], q[:,3:]), dim = 1)
    return torch_norm_quat(conjugate, USE_CUDA)

def ProcessGyroData(gyro_data):
    """Return `[axis_dif, quats]` for a gyro log.

    `quats` are the integrated orientations from `ProcessGyroRotation`.
    Row i of `axis_dif` is the rotation vector (axis * angle) taking
    orientation i-1 to orientation i; row 0 is zero.
    """
    quats = ProcessGyroRotation(gyro_data)
    count = np.shape(gyro_data)[0]
    axis_dif = np.zeros((count, 3))
    for k in range(1, count):
        previous_inverse = QuaternionReciprocal(quats[k-1,:])
        relative = QuaternionProduct(quats[k,:], previous_inverse)
        direction, amount = ConvertQuaternionToAxisAngle(relative)
        axis_dif[k,:] = direction*amount
    return [axis_dif, quats]


def SlerpWithDefault(q1, q2, t, q_default):
    """Spherically interpolate between quaternions `q1` and `q2`.

    `t` is clamped to [0, 1]. Both inputs are normalized first.
    - t within 1e-6 of an end point returns that (normalized) end point.
    - |dot| >= 1 returns `q_default`.
    - |dot| > 0.9995 falls back to normalized linear interpolation.
    - otherwise the standard slerp weights are used, flipping the sign of
      the `q2` weight when the dot product is negative.
    """
    epsilon = 1e-6
    linear_threshold = 0.9995

    t = max(min(t, 1.0), 0.0)
    q1 = q1/LA.norm(q1)
    q2 = q2/LA.norm(q2)

    if t < epsilon:
        return q1
    if t > 1-epsilon:
        return q2

    cos_theta = np.sum(q1*q2)
    magnitude = abs(cos_theta)

    if magnitude >= 1:
        return q_default
    if magnitude > linear_threshold:
        blended = q1*(1-t) + q2*t
        return blended/LA.norm(blended)

    direction = 1
    if cos_theta < 0:
        direction, cos_theta = -1, -cos_theta

    theta = np.arccos(cos_theta)
    inv_sin_theta = 1.0 / np.sin(theta)
    weight1 = np.sin((1.0 - t) * theta) * inv_sin_theta
    weight2 = direction * np.sin(t * theta) * inv_sin_theta
    return q1 * weight1 + q2 * weight2


def GetGyroAtTimeStamp(gyro_data, timestamp):
    """Look up (or interpolate) the quaternion at `timestamp`.

    Column 0 of `gyro_data` holds timestamps and the remaining columns the
    quaternion. An exact timestamp match returns that row; otherwise the two
    surrounding rows are slerped. With fewer than two rows, or a timestamp
    outside the covered range, the identity [0, 0, 0, 1] is used. The result
    is divided by (norm + 1e-6).
    """
    result = np.array([0,0,0,1])
    covered = (
        len(gyro_data) >= 2
        and not timestamp < gyro_data[0,0]
        and not timestamp > gyro_data[-1, 0]
    )
    if covered:
        # First sample at or after the requested time.
        upper = np.flatnonzero(gyro_data[:,0] >= timestamp)[0]
        if gyro_data[upper, 0] == timestamp:
            result = gyro_data[upper,1:]
        else:
            lower = upper - 1
            span = gyro_data[upper,0]-gyro_data[lower,0]
            ratio = (timestamp - gyro_data[lower,0])/span
            result = SlerpWithDefault(gyro_data[lower,1:], gyro_data[upper, 1:], ratio, gyro_data[lower,1:])
    return result / (LA.norm(result) + 1e-6)

def train_GetGyroAtTimeStamp(gyro_data, timestamp, check = False):
    """Look up (or interpolate) the quaternion at `timestamp`, or None.

    Column 0 of `gyro_data` holds timestamps and the remaining columns the
    quaternion. Returns the value divided by (norm + 1e-6). When there are
    fewer than two rows or the timestamp is out of range, returns None and,
    if `check` is set, prints "bad value".
    """
    covered = (
        len(gyro_data) >= 2
        and not timestamp < gyro_data[0,0]
        and not timestamp > gyro_data[-1, 0]
    )
    if not covered:
        if check:
            print("bad value")
        return None

    # First sample at or after the requested time.
    upper = np.flatnonzero(gyro_data[:,0] >= timestamp)[0]
    if gyro_data[upper, 0] == timestamp:
        value = gyro_data[upper,1:]
    else:
        lower = upper - 1
        span = gyro_data[upper,0]-gyro_data[lower,0]
        ratio = (timestamp - gyro_data[lower,0])/span
        value = SlerpWithDefault(gyro_data[lower,1:], gyro_data[upper, 1:], ratio, gyro_data[lower,1:])
    return value / (LA.norm(value) + 1e-6)

def FindOISAtTimeStamp(ois_log, time):
    """Return the OIS sample (columns 0..1 of `ois_log`) at `time`.

    Column 2 of `ois_log` holds the timestamps. Times at or before the first
    sample, or after the last one, clamp to that end sample. An exact
    timestamp match returns that row; anything else is linearly interpolated
    between the two neighbouring rows.
    """
    ois_time = ois_log[:,2]
    if time <= ois_time[0]:
        return ois_log[0, 0:2]
    if time > ois_time[-1]:
        return ois_log[-1, 0:2]

    # First sample at or after the requested time; it is >= 1 here because
    # time > ois_time[0].
    first_ind = np.flatnonzero(ois_time >= time)[0]
    cur_time = ois_time[first_ind]
    # Compare against the requested time. The previous code compared the
    # timestamp with the array index, which skipped interpolation whenever a
    # timestamp happened to equal its own index.
    if cur_time == time:
        return ois_log[first_ind, 0:2]

    last_timestamp = ois_time[first_ind - 1]
    ratio = (time - last_timestamp) / (cur_time - last_timestamp)
    return ois_log[first_ind - 1,0:2] * (1-ratio) + ois_log[first_ind,0:2]*ratio

def GetMetadata(frame_data, frame_index, result_poses = {} ):
    """Collect per-frame timing and field-of-view values into a dict.

    Timestamps come from columns 0 (frame), 4 (OIS) and 3 (rolling-shutter
    time) of row `frame_index` in `frame_data`. The fov entries are taken
    from `result_poses` ('real fov' / 'virtual fov') when present, otherwise
    they default to 1.27.
    """
    # 1.27 can be used as fov and virtual fov for videos in the data set.
    default_fov = 1.27
    row = frame_index
    return {
        "frame_id": frame_index,
        "timestamp_ns": frame_data[row, 0],
        "timestamp_ois_ns": frame_data[row, 4],
        "rs_time_ns": frame_data[row, 3],
        "fov": result_poses['real fov'][row,:] if "real fov" in result_poses else default_fov,
        "virtual_fov": result_poses['virtual fov'][row,:] if "virtual fov" in result_poses else default_fov,
    }

def GetProjections(static_options, metadata, quats_data, ois_data,  no_shutter = False):
    """Return one real projection per grid row of a frame.

    Each row gets its own timestamp: the frame (and OIS) timestamp plus a
    share of the rolling-shutter time that grows linearly from 0 on the
    first row to the full `rs_time_ns` on the last. With `no_shutter` every
    row uses the mid-exposure time (half the rolling-shutter time) instead.
    """
    num_rows = static_options["num_grid_rows"]
    rs_time = metadata["rs_time_ns"]

    real_projections = []
    for row in range(num_rows):
        if no_shutter:
            shift = rs_time * 0.5
        else:
            shift = rs_time * row / (num_rows-1)
        real_projections.append(GetRealProjection(
            static_options, quats_data, ois_data, metadata["fov"],
            metadata["timestamp_ns"] + shift,
            metadata["timestamp_ois_ns"] + shift))
    return real_projections

def GetRealProjection(static_options, quats_data, ois_data, fov, timestamp_ns, timestamp_ois_ns):
    """Build the projection homography of the real camera at one instant.

    The rotation is looked up in `quats_data` at `timestamp_ns`, the OIS
    offset in `ois_data` at `timestamp_ois_ns`.
    """
    rotation_quat = GetGyroAtTimeStamp(quats_data, timestamp_ns)
    raw_offset = FindOISAtTimeStamp(ois_data, timestamp_ois_ns)

    # The OIS offset is expressed w.r.t. the active array size, so it is
    # converted to normalized space by dividing by the crop window size.
    crop_size = np.array([static_options["crop_window_width"], static_options["crop_window_height"]])
    normalized_offset = np.array(raw_offset) / crop_size

    return GetProjectionHomography(
        rotation_quat, fov, normalized_offset,
        static_options["width"], static_options["height"])

def GetProjectionHomography(rot, fov, offset, width, height):
    """Return intrinsics @ rotation as a 3x3 projection matrix.

    rot: rotation as a quaternion.
    fov: sensor_width / focal_length.
    offset: additional OIS offset in the normalized domain.
    width/height: frame size.
    """
    intrinsics = GetIntrinsics(width / fov, offset, width, height)
    rotation = ConvertQuaternionToRotationMatrix(rot)
    return np.matmul(intrinsics, rotation)

def torch_GetProjectionHomography(rot, fov, width, height, USE_CUDA = True):
    """Batched projection homographies (intrinsics @ rotation), zero OIS offset.

    Args:
        rot: batch of rotation quaternions, one per row.
        fov: sensor_width / focal_length.
        width, height: frame size.
        USE_CUDA: place the intrinsics on the GPU. Also forwarded to the
            rotation-matrix helper, which otherwise always allocates on the
            GPU and fails on CPU-only runs.

    Returns:
        (batch, 3, 3) tensor.
    """
    focal_length = width / fov
    rotation = torch_ConvertQuaternionToRotationMatrix(rot, USE_CUDA)
    batch_size = rotation.size()[0]

    # No OIS offset is applied on this path.
    offset = np.array([0,0])
    intrinsics = GetIntrinsics(focal_length, offset, width, height)
    # Same intrinsics for every batch element.
    intrinsics = torch.Tensor(np.repeat(np.expand_dims(intrinsics, axis = 0), batch_size, axis = 0))
    if USE_CUDA == True:
        intrinsics = intrinsics.cuda()
    return torch.matmul(intrinsics, rotation)

def ConvertQuaternionToRotationMatrix(quat):
    """Return the 3x3 rotation matrix of an [x, y, z, w] quaternion.

    The quaternion is used as given (no normalization).
    """
    x, y, z, w = quat[0], quat[1], quat[2], quat[3]

    rotation = np.zeros((3, 3))
    rotation[0, :] = (
        1 - 2 * y * y - 2 * z * z,
        2 * x * y - 2 * z * w,
        2 * x * z + 2 * y * w,
    )
    rotation[1, :] = (
        2 * x * y + 2 * z * w,
        1 - 2 * x * x - 2 * z * z,
        2 * y * z - 2 * x * w,
    )
    rotation[2, :] = (
        2 * x * z - 2 * y * w,
        2 * y * z + 2 * x * w,
        1 - 2 * x * x - 2 * y * y,
    )
    return rotation

def torch_ConvertQuaternionToRotationMatrix(quat, USE_CUDA = True):
    """Batched [x, y, z, w] quaternion -> (batch, 3, 3) rotation matrices.

    Rows of `quat` are used as given (no normalization). The output buffer is
    moved to CUDA when `USE_CUDA` is True.
    """
    x, y, z, w = quat[:,0], quat[:,1], quat[:,2], quat[:,3]

    n_rows = quat.size()[0]
    rotation = Variable(torch.zeros((n_rows, 9), requires_grad=True))
    if USE_CUDA == True:
        rotation = rotation.cuda()

    # Row-major entries of the 3x3 matrix.
    entries = (
        1 - 2 * y * y - 2 * z * z,
        2 * x * y - 2 * z * w,
        2 * x * z + 2 * y * w,
        2 * x * y + 2 * z * w,
        1 - 2 * x * x - 2 * z * z,
        2 * y * z - 2 * x * w,
        2 * x * z - 2 * y * w,
        2 * y * z + 2 * x * w,
        1 - 2 * x * x - 2 * y * y,
    )
    for column, values in enumerate(entries):
        rotation[:, column] = values
    return rotation.view(n_rows, 3, 3)

def ConvertRotationMatrixToQuaternion(m):
    """Convert a 3x3 rotation matrix to an [x, y, z, w] quaternion.

    Branches on the trace and the largest diagonal entry so that the divisor
    is derived from the dominant component.
    """
    m00, m11, m22 = m[0,0], m[1,1], m[2,2]
    trace = m00 + m11 + m22

    if trace > 0 :
        # w dominates
        scale = 2 * (trace+1.0)**0.5
        qx = (m[2,1] - m[1,2]) / scale
        qy = (m[0,2] - m[2,0]) / scale
        qz = (m[1,0] - m[0,1]) / scale
        qw = 0.25 * scale
    elif m00 > m11 and m00 > m22:
        # x dominates
        scale = 2* (1.0 + m00 - m11 - m22) ** 0.5
        qx = 0.25 * scale
        qy = (m[0,1] + m[1,0]) / scale
        qz = (m[0,2] + m[2,0]) / scale
        qw = (m[2,1] - m[1,2]) / scale
    elif m11 > m22:
        # y dominates
        scale = 2* (1.0 - m00 + m11 - m22) ** 0.5
        qx = (m[0,1] + m[1,0]) / scale
        qy = 0.25 * scale
        qz = (m[1,2] + m[2,1]) / scale
        qw = (m[0,2] - m[2,0]) / scale
    else:
        # z dominates
        scale = 2* (1.0 - m00 - m11 + m22) ** 0.5
        qx = (m[0,2] + m[2,0]) / scale
        qy = (m[1,2] + m[2,1]) / scale
        qz = 0.25 * scale
        qw = (m[1,0] - m[0,1]) / scale

    return np.array([qx,qy,qz,qw])

def GetIntrinsics(focal_length, offset, width, height):
    """Return the 3x3 camera intrinsics matrix.

    The principal point sits at the frame centre, shifted by `offset`
    (a normalized x/y pair scaled by the frame size).
    """
    focal = float(focal_length)
    center_x = 0.5*(width-1)+offset[0]*width
    center_y = 0.5*(height-1)+offset[1]*height
    return np.array([
        [focal, 0.0, center_x],
        [0.0, focal, center_y],
        [0.0, 0.0, 1.0],
    ])


def GetVirtualProjection(static_options, result_pose, metadata, frame_index):
    """Projection homography of the virtual camera for one frame.

    Debug only: used to get results and references for comparisons. The
    virtual pose (and lens offset, when available) come from `result_pose`.
    """
    pose_quat = result_pose['virtual pose'][frame_index,:]
    lens_offset = (
        result_pose['vitual lens offset'][frame_index,:]
        if 'vitual lens offset' in result_pose
        else np.array([0,0])
    )
    return GetProjectionHomography(
        pose_quat, metadata["virtual_fov"], lens_offset,
        static_options["width"], static_options["height"])

def torch_GetVirtualProjection(static_options, quat, virtual_fov = 1.27, USE_CUDA = True):
    """Batched projection homography of the virtual camera.

    Args:
        static_options: dict providing the frame "width" and "height".
        quat: batch of virtual-pose quaternions.
        virtual_fov: sensor_width / focal_length of the virtual camera.
        USE_CUDA: forwarded to `torch_GetProjectionHomography`. New optional
            parameter; the default keeps the previous behaviour, and passing
            False lets callers request CPU allocation.
    """
    return torch_GetProjectionHomography(
        quat, virtual_fov, static_options["width"], static_options["height"],
        USE_CUDA = USE_CUDA)


def GetForwardGrid(static_options, real_projections, virtual_projection):
    """Warp a regular grid from the real camera to the virtual camera.

    real_projections: a set of 3x3 projections, one per grid row.
    virtual_projection: a single 3x3 projection.

    Returns an array of shape (4, num_grid_cols, num_grid_rows) holding, per
    grid vertex, [warped_x, warped_y, u, v] in normalized coordinates.
    """
    n_rows = static_options["num_grid_rows"]
    n_cols = static_options["num_grid_cols"]
    width = static_options["width"]
    height = static_options["height"]

    grid = np.zeros((4, n_cols, n_rows))
    frame_size = np.array([width, height, 1])
    row_step = 1/ (n_rows - 1)
    col_step = 1/ (n_cols - 1)

    for row in range(n_rows):
        # Each grid row has its own real projection.
        transform = GetHomographyTransformFromProjections(real_projections[row], virtual_projection)
        v = row * row_step
        for col in range(n_cols):
            u = col * col_step
            source = np.array([u * width, v * height, 1]).T
            warped = ApplyTransform(transform, source) / frame_size  # normalize
            grid[:, col, row] = np.array([warped[0], warped[1], u, v])
    return grid

def torch_GetForwardGrid(static_options, real_projections, virtual_projection, USE_CUDA = True):
    """Batched warp of a regular grid from real to virtual camera.

    real_projections: per-row real projections, indexed as [batch, row].
    virtual_projection: the virtual projection(s) to warp into.

    Returns a (batch, 4, num_grid_cols, num_grid_rows) tensor holding, per
    grid vertex, [warped_x, warped_y, u, v] in normalized coordinates.
    """
    n_rows = static_options["num_grid_rows"]
    n_cols = static_options["num_grid_cols"]
    batch = real_projections.size()[0]

    grid = torch.zeros((batch, 4, n_cols, n_rows))
    if USE_CUDA:
        grid = grid.cuda()
    width = static_options["width"]
    height = static_options["height"]

    row_step = 1/ (n_rows - 1)
    col_step = 1/ (n_cols - 1)

    # The normalization vector is the same for every vertex.
    frame_size = torch.Tensor([width, height, 1])
    if USE_CUDA == True:
        frame_size = frame_size.cuda()

    for row in range(n_rows):
        transform = torch_GetHomographyTransformFromProjections(real_projections[:, row], virtual_projection)
        v = row * row_step
        for col in range(n_cols):
            u = col * col_step
            source = torch.Tensor([u * width, v * height, 1])
            if USE_CUDA == True:
                source = source.cuda()
            warped = torch_ApplyTransform(transform, source) / frame_size  # normalize
            grid[:, 0, col, row] = warped[:,0]
            grid[:, 1, col, row] = warped[:,1]
            grid[:, 2, col, row] = u
            grid[:, 3, col, row] = v
    return grid

def GetWarpingFlow(real_projections_src, real_projections_dst, num_rows, num_cols, frame_width, frame_height):
    """Warp a regular grid from one set of per-row projections to another.

    num_rows: rows of the flow.
    num_cols: cols of the flow.

    Returns an array of shape (4, num_cols, num_rows) holding, per grid
    vertex, [warped_x, warped_y, u, v] in normalized coordinates.
    """
    grid = np.zeros((4, num_cols, num_rows))
    frame_size = np.array([frame_width, frame_height, 1])
    row_step = 1/ (num_rows - 1)
    col_step = 1/ (num_cols - 1)

    for row in range(num_rows):
        transform = GetHomographyTransformFromProjections(real_projections_src[row], real_projections_dst[row])
        v = row * row_step
        for col in range(num_cols):
            u = col * col_step
            source = np.array([u * frame_width, v * frame_height, 1]).T
            warped = ApplyTransform(transform, source) / frame_size  # normalize
            grid[:, col, row] = np.array([warped[0], warped[1], u, v])
    return grid

def torch_GetWarpingFlow(static_options, real_projections_src, real_projections_dst, USE_CUDA = True):
    """Batched torch version of the per-row warping grid.

    Args:
        static_options: dict providing "num_grid_rows", "num_grid_cols",
            "width" and "height".
        real_projections_src: batched source projections, indexed as
            ``[:, i]`` to select grid row ``i``.
        real_projections_dst: batched destination projections, indexed the
            same way.
        USE_CUDA: when truthy the output grid is moved to the GPU; the
            per-point helper tensors are moved only when it equals ``True``.

    Returns:
        Tensor of shape ``(batch, 4, num_grid_cols, num_grid_rows)`` with
        channels ``(warped_x, warped_y, u, v)``.
    """
    batch_size = real_projections_src.size()[0]
    num_cols = static_options["num_grid_cols"]
    num_rows = static_options["num_grid_rows"]

    grid = torch.zeros((batch_size, 4, num_cols, num_rows))
    if USE_CUDA:
        grid = grid.cuda()
    width = static_options["width"]
    height = static_options["height"]

    row_step = 1/ (num_rows - 1)
    col_step = 1/ (num_cols - 1)

    # The normalization vector is the same for every grid point.
    norm = torch.Tensor([width, height, 1])
    if USE_CUDA == True:
        norm = norm.cuda()

    for row in range(num_rows):
        homography = torch_GetHomographyTransformFromProjections(
            real_projections_src[:, row], real_projections_dst[:, row])
        v = row * row_step
        for col in range(num_cols):
            u = col * col_step
            point = torch.Tensor([u * width, v * height, 1])
            if USE_CUDA == True:
                point = point.cuda()
            warped = torch_ApplyTransform(homography, point) / norm  # normalize
            grid[:, 0, col, row] = warped[:, 0]
            grid[:, 1, col, row] = warped[:, 1]
            grid[:, 2, col, row] = u
            grid[:, 3, col, row] = v
    return grid

def GetHomographyTransformFromProjections(proj_src, proj_dst):
    """Return the homography ``proj_dst @ inverse(proj_src)``."""
    src_inverse = LA.inv(proj_src)
    return np.matmul(proj_dst, src_inverse)

def torch_GetHomographyTransformFromProjections(proj_src, proj_dst):
    """Return the homography ``proj_dst @ inverse(proj_src)`` using torch ops."""
    src_inverse = torch.inverse(proj_src)
    return torch.matmul(proj_dst, src_inverse)

def ApplyTransform(transform, point):
    """Warp a homogeneous 2D point ``[x, y, 1]`` with a homography.

    Returns the warped point divided by its third component, i.e.
    ``[warped_x, warped_y, 1]``.
    """
    projected = np.matmul(transform, point)
    return projected / projected[2]

def torch_ApplyTransform(transform, point):
    """Warp a homogeneous 2D point ``[x, y, 1]`` with batched homographies.

    The product is divided by its third column (``[:, 2:]``), so every row of
    the result has the form ``[warped_x, warped_y, 1]``.
    """
    projected = torch.matmul(transform, point)
    homogeneous_scale = projected[:, 2:]
    return projected / homogeneous_scale

def CenterZoom(grid, ratio):
    """Scale the first two channels of ``grid`` about 0.5 by ``ratio``.

    The grid is modified in place and also returned.
    """
    xy = grid[:, 0:2, :, :]
    grid[:, 0:2, :, :] = (xy - 0.5) * ratio + 0.5
    return grid


================================================
FILE: dvs/gyro/gyro_io.py
================================================
import numpy as np
from numpy import linalg as LA
import matplotlib.pyplot as plt
import scipy.io as sio
from .gyro_function import (
    ProcessGyroData, QuaternionProduct, QuaternionReciprocal, 
    ConvertQuaternionToAxisAngle, FindOISAtTimeStamp, GetMetadata,
    GetProjections, GetVirtualProjection, GetForwardGrid,
    CenterZoom, GetGyroAtTimeStamp, get_static, ConvertAxisAngleToQuaternion,
    ConvertAxisAngleToQuaternion_no_angle, ConvertQuaternionToAxisAngle_no_angle
    )

def load_gyro_mesh(input_name):
    """Load a stabilization result file and reshape its warping grid.

    The "warping grid" entry is reshaped to ``(-1, w, h, 4)`` where ``w`` and
    ``h`` come from the first row of the "vertex_grid_size" entry.
    """
    mesh = LoadStabResult(input_name)
    grid_w, grid_h = mesh["vertex_grid_size"][0]
    target_shape = (-1, int(grid_w), int(grid_h), 4)
    mesh["warping grid"] = np.reshape(mesh["warping grid"], target_shape)
    return mesh

def get_grid(static_options, frame_data, quats_data, ois_data, virtual_data, no_shutter = False):
    """Compute the zoomed forward warping grid for every virtual pose.

    One grid is produced per entry of ``virtual_data`` from the real
    projections of that frame and its virtual projection. The stacked grids
    are zoomed about the centre by ``1 / (1 - 2 * cropping_ratio)`` and then
    transposed with axes ``(0, 3, 2, 1)``.
    """
    result_poses = {'virtual pose': virtual_data}
    per_frame = []
    for frame_idx in range(len(virtual_data)):
        metadata = GetMetadata(frame_data, frame_idx)
        real_projections = GetProjections(
            static_options, metadata, quats_data, ois_data, no_shutter = no_shutter)
        virtual_projection = GetVirtualProjection(
            static_options, result_poses, metadata, frame_idx)
        per_frame.append(
            GetForwardGrid(static_options, real_projections, virtual_projection))
    stacked = np.array(per_frame)
    zoom_ratio = 1 / (1 - 2 * static_options["cropping_ratio"])
    zoomed = CenterZoom(stacked, zoom_ratio)
    return np.transpose(zoomed, (0, 3, 2, 1))

def get_rotations(frame_data, quats_data, ois_data, num_frames):
    """Compute frame-to-frame rotations and per-frame lens offsets.

    The orientation of frame ``i`` is looked up with ``frame_data[i, 0]`` and
    its OIS offset with ``frame_data[i, 4]``.

    Returns:
        ``(rotations, lens_offsets)`` with shapes ``(num_frames, 3)`` and
        ``(num_frames, 2)``. ``rotations[0]`` stays zero because the first
        frame has no predecessor.
    """
    quats = np.zeros((num_frames, 4))
    for idx in range(num_frames):
        quats[idx, :] = GetGyroAtTimeStamp(quats_data, frame_data[idx, 0])

    lens_offsets = np.zeros((num_frames, 2))
    rotations = np.zeros((num_frames, 3))
    for idx in range(num_frames):
        if idx > 0:
            # Relative rotation from the previous frame to this one.
            delta = QuaternionProduct(quats[idx, :], QuaternionReciprocal(quats[idx - 1, :]))
            rotations[idx, :] = ConvertQuaternionToAxisAngle_no_angle(delta)
        lens_offsets[idx, :] = FindOISAtTimeStamp(ois_data, frame_data[idx, 4])

    return rotations, lens_offsets

def visual_rotation(rotations_real, lens_offsets_real, rotations_virtual, lens_offsets_virtual, rotations_virtual2, lens_offsets_virtual2, path):
    """Plot real vs. virtual rotations and lens offsets and save them as a JPEG.

    Five stacked panels are drawn: gyro x/y/z (columns 0-2 of the rotation
    arrays, y-axis fixed to [-0.02, 0.02]) and ois x/y (columns 0-1 of the
    lens-offset arrays). Real data is green, the first virtual series blue
    and the second virtual series red.

    Args:
        rotations_real, lens_offsets_real: always plotted.
        rotations_virtual, lens_offsets_virtual: optional, skipped when None.
        rotations_virtual2, lens_offsets_virtual2: optional, skipped when None.
        path: output path; its last four characters are replaced by ".jpg".
    """
    # Kept from the original implementation: clears whatever figure is current.
    plt.clf()
    fig = plt.figure(figsize=(8,16))

    # (y label, real, virtual, virtual2, column, clamp y-axis)
    panels = [
        ('gyro x', rotations_real, rotations_virtual, rotations_virtual2, 0, True),
        ('gyro y', rotations_real, rotations_virtual, rotations_virtual2, 1, True),
        ('gyro z', rotations_real, rotations_virtual, rotations_virtual2, 2, True),
        ('ois x', lens_offsets_real, lens_offsets_virtual, lens_offsets_virtual2, 0, False),
        ('ois y', lens_offsets_real, lens_offsets_virtual, lens_offsets_virtual2, 1, False),
    ]

    try:
        for position, (label, real, virtual, virtual2, col, clamp) in enumerate(panels, 1):
            plt.subplot(5, 1, position)
            plt.plot(real[:, col], "g")
            if virtual is not None:
                plt.plot(virtual[:, col], "b")
            # Each series is guarded by its own array; the OIS panels used to
            # test rotations_virtual2 while plotting lens_offsets_virtual2.
            if virtual2 is not None:
                plt.plot(virtual2[:, col], "r")
            if clamp:
                plt.ylim(-0.02, 0.02)
            plt.xlabel('frame id')
            plt.ylabel(label)

        plt.savefig(path[:-4]+".jpg")
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
    return

def LoadOISData(ois_name):
    """Load an OIS log with ``np.loadtxt`` and keep only its last three columns."""
    return np.loadtxt(ois_name)[:, -3:]

def LoadFrameData(frame_log_name):
    """Load a frame log and shift columns 0 and 4 back by half of column 1."""
    frames = np.loadtxt(frame_log_name)
    half_of_col1 = np.expand_dims(frames[:, 1] / 2, axis = 1)
    shifted = frames[:, [0, 4]] - half_of_col1
    frames[:, [0, 4]] = shifted
    return frames


def LoadGyroData(gyro_log_name):
    """Load a gyro log and return rows of ``[timestamp, quaternion...]``.

    Column 0 of the log is multiplied by 1000 and columns 1 and 2 are swapped
    before the samples are handed to ``ProcessGyroData``; the scaled column 0
    is then prepended to the quaternions it returns.
    """
    samples = np.loadtxt(gyro_log_name)
    samples[:, 0] = samples[:, 0] * 1000
    samples = samples[:, [0, 2, 1, 3]]

    _, quats = ProcessGyroData(samples)
    timestamps = samples[:, 0, None]
    return np.concatenate((timestamps, quats), axis = 1)

def LoadStabResult(input_name):
    """Parse a stabilization result file into a dict of stacked arrays.

    The file is read record by record with ``ReadLine``. Records sharing a
    name are concatenated along axis 0, so each dict value holds one row per
    occurrence of that name.

    Args:
        input_name: path of the result file.

    Returns:
        dict mapping record name to a NumPy array.
    """
    data = {}
    # The context manager closes the file even if parsing raises.
    with open(input_name) as fid:
        while True:
            name, val = ReadLine(fid)
            if name is None:
                break
            if name in data:
                data[name] = np.concatenate((data[name], val), axis=0)
            else:
                data[name] = val
    # An empty file yields no records; report zero instead of failing.
    mesh_length = len(next(iter(data.values()))) if data else 0
    print("Mesh length: ", mesh_length)
    return data


def ReadLine(fid):
    """Read one ``name: values`` record from an open text file.

    The values may follow the colon on the same line or, when nothing
    numeric follows the colon, occupy the next line.

    Args:
        fid: open text file object.

    Returns:
        ``(name, values)`` where ``values`` is a NumPy array with a leading
        axis of length 1, or ``(None, None)`` at end of file.
    """
    tline = fid.readline()
    if len(tline) == 0:
        return None, None
    if tline.endswith("\n"):
        tline = tline[:-1]
    # NOTE(review): str.find returns -1 when there is no ':', which makes the
    # name lose its last character -- input lines are assumed to contain ':'.
    ind = tline.find(':')
    name = tline[:ind]
    val = str2num(tline[ind+1:])
    if len(val) == 0:
        # Values are on the following line. endswith() is safe on the empty
        # string returned at EOF, where indexing tline[-1] would raise.
        tline = fid.readline()
        if tline.endswith("\n"):
            tline = tline[:-1]
        val = str2num(tline)
    return name, np.expand_dims(np.array(val), axis=0)

def str2num(string):
    """Parse a space-separated string into a list of floats, ignoring empty fields."""
    return [float(token) for token in string.split(" ") if token]
    
    
================================================
FILE: dvs/inference.py
================================================
import os
import sys
import torch
import torchvision
import torch.nn as nn
from torch.autograd import Variable

import time
import yaml
import argparse
import numpy as np
from printer import Printer
from dataset import get_data_loader, get_inference_data_loader
from model import Model
import datetime
import copy
from util import make_dir, get_optimizer, norm_flow
from gyro import (
    get_grid, 
    get_rotations, 
    visual_rotation,
    torch_QuaternionProduct,
    torch_norm_quat
    )
from warp import warp_video

# Make only GPU 0 visible to CUDA in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

def run(model, loader, cf, USE_CUDA=True):
    """Run the stabilization network over a loader and return the virtual poses.

    For every time step the real inputs are concatenated with the virtual
    inputs built from previously predicted poses, the optical-flow U-Net and
    the pose network are evaluated without gradients, and the predicted
    relative quaternion is composed with the current anchor pose and pushed
    onto the virtual queue.

    Args:
        model: object exposing ``net``, ``unet`` and ``loss``.
        loader: data loader whose dataset provides ``get_virtual_data`` and
            ``update_virtual_queue``.
        cf: configuration dict; ``cf['data']`` supplies "number_virtual" and
            "number_real".
        USE_CUDA: move per-step tensors to the GPU when truthy.

    Returns:
        The virtual queue of the last batch with axis 0 squeezed out (so the
        batch size is expected to be 1).
    """
    no_flo = False
    number_virtual, number_real = cf['data']["number_virtual"], cf['data']["number_real"]
    model.net.eval()
    model.unet.eval()
    activation = nn.Softshrink(0.0006) # 0.0036
    for i, data in enumerate(loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        real_inputs, times, flo, flo_back, real_projections, real_postion, ois, real_queue_idx = data
        print("Fininsh Load data")

        real_inputs = real_inputs.type(torch.float) #[b,60,84=21*4]
        real_projections = real_projections.type(torch.float)
        flo = flo.type(torch.float)
        flo_back = flo_back.type(torch.float)
        ois = ois.type(torch.float)

        batch_size, step, dim = real_inputs.size()
        times = times.numpy()
        real_queue_idx = real_queue_idx.numpy()
        virtual_queue = [None] * batch_size

        run_loss = 0
        model.net.init_hidden(batch_size)
        for j in range(step):
            if (j+1) % 100 == 0:
                print("Step: "+str(j+1)+"/"+str(step))
            virtual_inputs, vt_1 = loader.dataset.get_virtual_data(
                virtual_queue, real_queue_idx, times[:, j], times[:, j+1], times[:, 0], batch_size, number_virtual, real_postion[:,j])
            real_inputs_step = real_inputs[:,j,:]
            inputs = torch.cat((real_inputs_step,virtual_inputs), dim = 1)

            # Slice the per-step tensors independently of the device so the
            # names below exist on the CPU path as well.
            if no_flo is False:
                flo_step = flo[:,j]
                flo_back_step = flo_back[:,j]
            else:
                flo_step = None
                flo_back_step = None
            real_projections_t = real_projections[:,j+1]
            real_projections_t_1 = real_projections[:,j]
            real_postion_anchor = real_postion[:,j]
            ois_step = ois[:,j]

            if USE_CUDA:
                real_inputs_step = real_inputs_step.cuda()
                virtual_inputs = virtual_inputs.cuda()
                inputs = inputs.cuda()
                if no_flo is False:
                    flo_step = flo_step.cuda()
                    flo_back_step = flo_back_step.cuda()
                vt_1 = vt_1.cuda()
                real_projections_t = real_projections_t.cuda()
                real_projections_t_1 = real_projections_t_1.cuda()
                real_postion_anchor = real_postion_anchor.cuda()
                ois_step = ois_step.cuda()

            if no_flo is False:
                b, h, w, _ = flo_step.size()
                flo_step = norm_flow(flo_step, h, w)
                flo_back_step = norm_flow(flo_back_step, h, w)

            with torch.no_grad():
                if no_flo is False:
                    flo_out = model.unet(flo_step, flo_back_step)
                else:
                    flo_out = None
                if j < 1:
                    # The first step evaluates the network twice
                    # (presumably to warm up its hidden state -- TODO confirm).
                    for _ in range(2):
                        out = model.net(inputs, flo_out, ois_step)
                else:
                    out = model.net(inputs, flo_out, ois_step)

            virtual_position = virtual_inputs[:, -4:]

            # Shrink small values of the first three components, then re-normalize.
            out[:, :3] = activation(out[:, :3])
            out = torch_norm_quat(out)

            pos = torch_QuaternionProduct(virtual_position, real_postion_anchor)
            loss_step = model.loss(out, vt_1, virtual_inputs, real_inputs_step, \
                flo_step, flo_back_step, real_projections_t, real_projections_t_1, real_postion_anchor, \
                follow = True, optical = True, undefine = True)
            run_loss += loss_step

            out = torch_QuaternionProduct(out, pos)

            # .cpu() is a no-op for CPU tensors, so the queue always receives
            # a NumPy array regardless of USE_CUDA.
            out = out.cpu().detach().numpy()

            virtual_queue = loader.dataset.update_virtual_queue(batch_size, virtual_queue, out, times[:,j+1])

    run_loss /= step
    print( "\nLoss: follow, angle, smooth, c2_smooth, undefine, optical")
    print(run_loss.cpu().numpy()[:-1], "\n")
    return np.squeeze(virtual_queue, axis=0)


def inference(cf, data_path, USE_CUDA):
    """Stabilize the video stored in ``data_path`` and build its warping grid.

    Loads the checkpoint named by ``cf["model"]["load_model"]`` (or the
    ``<exp>_last.checkpoint`` file in the checkpoint directory), runs the
    network over the sequence, writes the virtual poses to
    ``./test/<exp>/<dir name>.txt`` and converts them to a warping grid.

    Args:
        cf: configuration dict.
        data_path: directory holding one recording (video and sensor logs).
        USE_CUDA: run on the GPU when truthy.

    Returns:
        ``(data, virtual_queue, video_name, grid)``.

    Raises:
        FileNotFoundError: if ``data_path`` contains no eligible source ``.mp4``.
    """
    checkpoints_dir = cf['data']['checkpoints_dir']
    checkpoints_dir = make_dir(checkpoints_dir, cf)

    # Pick the source video, skipping derived files; the last match wins.
    video_name = None
    for f in os.listdir(data_path):
        lowered = f.lower()
        if f[-3:] == "mp4" and "no_ois" not in f  and "no_shutter" not in f  and "gimbal" not in lowered and "grid" not in lowered and "flo" not in lowered:
            video_name = f[:-4]
    if video_name is None:
        # Fail before the expensive model run rather than with an unbound
        # variable at the end of the function.
        raise FileNotFoundError("No source .mp4 video found in " + data_path)

    # Define the model
    model = Model(cf)
    load_model = cf["model"]["load_model"]

    # Checkpoints saved on a GPU need remapping when running on the CPU.
    map_location = None if USE_CUDA else "cpu"

    print("------Load Pretrined Model--------")
    if load_model is not None:
        checkpoint = torch.load(load_model, map_location=map_location)
        print(load_model)
    else:
        load_last = os.path.join(checkpoints_dir, cf['data']['exp']+'_last.checkpoint')
        checkpoint = torch.load(load_last, map_location=map_location)
        print(load_last)
    model.net.load_state_dict(checkpoint['state_dict'])
    model.unet.load_state_dict(checkpoint['unet'])

    if USE_CUDA:
        model.net.cuda()
        model.unet.cuda()

    print("-----------Load Dataset----------")
    test_loader = get_inference_data_loader(cf, data_path, no_flo = False)
    data = test_loader.dataset.data[0]

    start_time = time.time()
    virtual_queue= run(model, test_loader, cf, USE_CUDA=USE_CUDA)

    # Prepend a copy of the first pose stamped with the first frame's time.
    virtual_data = np.zeros((1,5))
    virtual_data[:,1:] = virtual_queue[0, 1:]
    virtual_data[:,0] = data.frame[0,0]
    virtual_queue = np.concatenate((virtual_data, virtual_queue), axis = 0)

    print(virtual_queue.shape)
    time_used = (time.time() - start_time) / 60

    print("Time_used: %.4f minutes" % (time_used))

    virtual_path = os.path.join("./test", cf['data']['exp'], data_path.split("/")[-1]+'.txt')
    # np.savetxt does not create missing directories.
    os.makedirs(os.path.dirname(virtual_path), exist_ok=True)
    np.savetxt(virtual_path, virtual_queue, delimiter=' ')

    print("------Start Warping Video--------")
    grid = get_grid(test_loader.dataset.static_options, \
        data.frame[:data.length], data.gyro, data.ois, virtual_queue[:data.length,1:], no_shutter = False)
    return data, virtual_queue, video_name, grid

def visual_result(cf, data, video_name, virtual_queue, virtual_queue2 = None, compare_exp = None):
    """Plot real rotations against one or two virtual pose sequences.

    The figure path is ``./test/<exp>/<video_name>.jpg``, or
    ``./test/<exp>/<video_name>_<compare_exp>.jpg`` when a second virtual
    sequence is supplied. Virtual sequences are plotted with all-zero OIS data.
    """
    print("------Start Visual Result--------")
    frames = data.frame[:data.length]
    rotations_virtual, lens_offsets_virtual = get_rotations(
        frames, virtual_queue, np.zeros(data.ois.shape), data.length)
    rotations_real, lens_offsets_real = get_rotations(
        frames, data.gyro, data.ois, data.length)

    if virtual_queue2 is None:
        rotations_virtual2 = lens_offsets_virtual2 = None
        path = os.path.join("./test", cf['data']['exp'], video_name+'.jpg')
    else:
        rotations_virtual2, lens_offsets_virtual2 = get_rotations(
            frames, virtual_queue2, np.zeros(data.ois.shape), data.length)
        path = os.path.join("./test", cf['data']['exp'], video_name+'_'+compare_exp+'.jpg')

    visual_rotation(
        rotations_real, lens_offsets_real,
        rotations_virtual, lens_offsets_virtual,
        rotations_virtual2, lens_offsets_virtual2,
        path)


def main(args = None):
    """Run inference, plotting and warping for every recording in ``args.dir_path``.

    Args:
        args: namespace with ``config`` (YAML config path) and ``dir_path``
            (directory containing one sub-directory per recording).
    """
    config_file = args.config
    dir_path = args.dir_path
    # safe_load parses plain YAML without constructing arbitrary objects and,
    # unlike yaml.load, needs no explicit Loader argument on recent PyYAML.
    with open(config_file, 'r') as config_stream:
        cf = yaml.safe_load(config_stream)

    USE_CUDA = cf['data']["use_cuda"]

    # The log file stays open for the lifetime of the printer.
    log_file = open(os.path.join(cf["data"]["log"], cf['data']['exp']+'_test.log'), 'w+')
    printer = Printer(sys.stdout, log_file).open()

    data_name = sorted(os.listdir(dir_path))
    for i, name in enumerate(data_name):
        print("Running Inference: " + str(i+1) + "/" + str(len(data_name)))
        save_path = os.path.join("./test", cf['data']['exp'], name+'_stab.mp4')

        data_path = os.path.join(dir_path, name)
        data, virtual_queue, video_name, grid= inference(cf, data_path, USE_CUDA)

        virtual_queue2 = None
        visual_result(cf, data, name, virtual_queue, virtual_queue2 = virtual_queue2, compare_exp = None)

        video_path = os.path.join(data_path, video_name+".mp4")
        warp_video(grid, video_path, save_path, frame_number = False)
    return

if __name__ == '__main__':
    # Command-line entry point: parse the config and input directory, then run.
    parser = argparse.ArgumentParser("Training model")
    parser.add_argument(
        "--config",
        default="./conf/stabilzation.yaml",
        help="Config file.",
    )
    parser.add_argument("--dir_path", default="./video")
    main(args = parser.parse_args())

================================================
FILE: dvs/load_frame_sensor_data.py
================================================
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import sys
import torch
import torchvision
import torch.nn as nn
from torch.autograd import Variable

import time
import yaml
import argparse
import numpy as np
from printer import Printer
from dataset import get_data_loader, get_inference_data_loader
from model import Model
import datetime
import copy
from util import make_dir, get_optimizer, norm_flow
from gyro import (
    get_grid, 
    get_rotations, 
    visual_rotation, 
    GetGyroAtTimeStamp, 
    torch_ConvertQuaternionToAxisAngle, 
    torch_ConvertAxisAngleToQuaternion,
    torch_QuaternionProduct,
    get_static
    )
from warp import warp_video

def run(loader, cf, USE_CUDA=True):
    """Replay the real camera poses through the virtual-queue machinery.

    No network is evaluated: at every step the quaternion stored in columns
    40:44 of the real inputs is composed with the current anchor pose and
    pushed onto the virtual queue.

    Args:
        loader: data loader whose dataset provides ``get_virtual_data`` and
            ``update_virtual_queue``.
        cf: configuration dict; ``cf['data']`` supplies "number_virtual" and
            "number_real".
        USE_CUDA: move per-step tensors to the GPU when truthy.

    Returns:
        The virtual queue of the last batch with axis 0 squeezed out.
    """
    number_virtual, number_real = cf['data']["number_virtual"], cf['data']["number_real"]
    for i, data in enumerate(loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        real_inputs, times, flo, flo_back, real_projections, real_postion, ois, real_queue_idx = data
        print("Fininsh Load data")

        real_inputs = real_inputs.type(torch.float) #[b,60,84=21*4]
        real_projections = real_projections.type(torch.float)

        batch_size, step, dim = real_inputs.size()
        times = times.numpy()
        real_queue_idx = real_queue_idx.numpy()
        virtual_queue = [None] * batch_size

        for j in range(step):
            virtual_inputs, vt_1 = loader.dataset.get_virtual_data(
                virtual_queue, real_queue_idx, times[:, j], times[:, j+1], times[:, 0], batch_size, number_virtual, real_postion[:,j])
            real_inputs_step = real_inputs[:,j,:]
            # Defined outside the CUDA branch so the CPU path works too.
            real_postion_anchor = real_postion[:,j]
            if USE_CUDA:
                real_inputs_step = real_inputs_step.cuda()
                virtual_inputs = virtual_inputs.cuda()
                real_postion_anchor = real_postion_anchor.cuda()

            out = real_inputs_step[:,40:44]

            virtual_position = virtual_inputs[:, -4:]
            pos = torch_QuaternionProduct(virtual_position, real_postion_anchor)

            out = torch_QuaternionProduct(out, pos)

            # .cpu() is a no-op for CPU tensors, so the queue always receives
            # a NumPy array regardless of USE_CUDA.
            out = out.cpu().detach().numpy()

            virtual_queue = loader.dataset.update_virtual_queue(batch_size, virtual_queue, out, times[:,j+1])
    return np.squeeze(virtual_queue, axis=0)

def inference(cf, data_path, USE_CUDA):
    """Load one recording, replay its poses and plot the real rotations.

    The plot is saved next to the data as ``<video_name>_real.jpg``. The
    replayed virtual queue and ``virtual_path`` are computed but not written
    anywhere by this function.

    Args:
        cf: configuration dict.
        data_path: directory holding one recording.
        USE_CUDA: run on the GPU when truthy.

    Raises:
        FileNotFoundError: if ``data_path`` contains no eligible ``.mp4``.
    """
    print("-----------Load Dataset----------")
    test_loader = get_inference_data_loader(cf, data_path)
    data = test_loader.dataset.data[0]
    test_loader.dataset.no_flo = True
    test_loader.dataset.static_options = get_static(ratio = 0)

    start_time = time.time()
    virtual_queue = run(test_loader, cf, USE_CUDA=USE_CUDA)

    # Prepend a copy of the first pose stamped with the first frame's time.
    virtual_data = np.zeros((1,5))
    virtual_data[:,1:] = virtual_queue[0, 1:]
    virtual_data[:,0] = data.frame[0,0]
    virtual_queue = np.concatenate((virtual_data, virtual_queue), axis = 0)

    files = os.listdir(data_path)
    video_name = None
    for f in files:
        if f[-3:] == "mp4" and "no_ois" not in f and "gimbal" not in f.lower():
            video_name = f[:-4]
            print(video_name)
    if video_name is None:
        # Report the real problem instead of failing on an unbound variable.
        raise FileNotFoundError("No source .mp4 video found in " + data_path)
    virtual_path = os.path.join("./test", cf['data']['exp'], video_name+'.txt')

    print("------Start Visual Result--------")
    rotations_real, lens_offsets_real = get_rotations(data.frame[:data.length], data.gyro, data.ois, data.length)
    fig_path = os.path.join(data_path, video_name+"_real.jpg")
    visual_rotation(rotations_real, lens_offsets_real, None, None, None, None, fig_path)

    return

def main(args = None):
    """Process every recording found in ``args.dir_path``.

    Args:
        args: namespace with ``config`` (YAML config path) and ``dir_path``
            (directory containing one sub-directory per recording).
    """
    config_file = args.config
    dir_path = args.dir_path
    # safe_load parses plain YAML without constructing arbitrary objects and,
    # unlike yaml.load, needs no explicit Loader argument on recent PyYAML.
    with open(config_file, 'r') as config_stream:
        cf = yaml.safe_load(config_stream)

    USE_CUDA = cf['data']["use_cuda"]

    checkpoints_dir = cf['data']['checkpoints_dir']
    checkpoints_dir = make_dir(checkpoints_dir, cf)

    data_name = sorted(os.listdir(dir_path))
    for i, name in enumerate(data_name):
        print("Running: " + str(i+1) + "/" + str(len(data_name)))
        inference(cf, os.path.join(dir_path, name), USE_CUDA)
    return

if __name__ == '__main__':
    # Command-line entry point: parse the config and input directory, then run.
    parser = argparse.ArgumentParser("Training model")
    parser.add_argument(
        "--config",
        default="./conf/stabilzation.yaml",
        help="Config file.",
    )
    parser.add_argument("--dir_path", default="./video")
    main(args = parser.parse_args())

================================================
FILE: dvs/loss.py
================================================
import torch
import numpy as np
from torch.autograd import Variable
import operator
import torch.nn.functional as F
import matplotlib.pyplot as plt
from gyro import (
    torch_QuaternionProduct, 
    torch_QuaternionReciprocal, 
    get_static, 
    torch_GetVirtualProjection,
    torch_GetForwardGrid,
    torch_GetWarpingFlow,
    torch_ConvertAxisAngleToQuaternion,
    torch_ConvertQuaternionToAxisAngle,
    torch_norm_quat,
    torch_GetHomographyTransformFromProjections,
    torch_ApplyTransform
)
        
class C2_Smooth_loss(torch.nn.Module):
    """MSE between ``Qt`` and the relative rotation ``Qt_1 * reciprocal(Qt_2)``."""

    def __init__(self):
        super().__init__()
        self.MSE = torch.nn.MSELoss()

    def forward(self, Qt, Qt_1, Qt_2):
        """Return the MSE of ``Qt`` against the previous step's rotation delta."""
        inverse_qt_2 = torch_QuaternionReciprocal(Qt_2)
        previous_delta = torch_QuaternionProduct(Qt_1, inverse_qt_2)
        return self.MSE(Qt, previous_delta)

class C1_Smooth_loss(torch.nn.Module):
    """MSE between a quaternion batch and the identity quaternion (0, 0, 0, 1)."""

    def __init__(self):
        super(C1_Smooth_loss, self).__init__()
        self.MSE = torch.nn.MSELoss()

    def forward(self, v_r_axis, v_axis_t_1 = None, real_postion = None):
        """Return the MSE of ``v_r_axis`` against identity quaternions.

        Args:
            v_r_axis: tensor whose column 3 is the scalar part.
            v_axis_t_1: unused, kept for interface compatibility.
            real_postion: unused, kept for interface compatibility.
        """
        # Allocate the target on the input's device instead of forcing CUDA,
        # so the loss also works for CPU tensors.
        quat_zero = torch.zeros(v_r_axis.shape, device=v_r_axis.device)
        quat_zero[:,3] = 1
        return self.MSE(v_r_axis, quat_zero)

class Follow_loss(torch.nn.Module):
    """MSE between the virtual quaternion and the (optionally re-anchored) real one."""

    def __init__(self):
        super().__init__()
        self.MSE = torch.nn.MSELoss()

    def forward(self, virtual_quat, real_quat, real_postion = None):
        """Return ``MSE(virtual_quat, target)``.

        ``target`` is ``real_quat``, or ``real_quat * real_postion`` when
        ``real_postion`` is given.
        """
        target = real_quat
        if real_postion is not None:
            target = torch_QuaternionProduct(real_quat, real_postion)
        return self.MSE(virtual_quat, target)

class Stay_loss(torch.nn.Module):
    """Mean absolute difference between a quaternion batch and (0, 0, 0, 1)."""

    def __init__(self):
        super(Stay_loss, self).__init__()
        zero = torch.tensor([0.0,0.0,0.0,1.0])
        # Keep the reference on the GPU when one exists (as before), but do
        # not fail at construction time on CPU-only machines.
        self.zero = zero.cuda() if torch.cuda.is_available() else zero

    def forward(self, virtual_quat):
        """Return ``mean(|virtual_quat - (0, 0, 0, 1)|)``."""
        # Follow the input's device so CPU and GPU inputs both work.
        target = self.zero.to(virtual_quat.device)
        return torch.mean(torch.abs(virtual_quat - target))


class Angle_loss(torch.nn.Module):
    """Penalize the rotation angle between two quaternions above a threshold."""

    def __init__(self):
        super(Angle_loss, self).__init__()
        self.MSE = torch.nn.MSELoss()

    def forward(self, Q1, Q2, threshold = 0.5236, logistic_beta1 = 100):
        """Return ``(loss, theta)``.

        ``theta`` is ``2 * acos(w)`` of the normalized relative quaternion
        ``Q2 * reciprocal(Q1)`` for entries with ``w < 1`` and 0 otherwise.
        ``loss`` is the batch mean of ``theta`` weighted by a logistic gate
        centred at ``threshold`` with steepness ``logistic_beta1``.
        """
        batch_size = Q1.shape[0]
        Q3 = torch_norm_quat(torch_QuaternionProduct(Q2, torch_QuaternionReciprocal(Q1)))
        # Allocate on the quaternions' device rather than forcing CUDA.
        theta = torch.zeros(batch_size, device=Q3.device)
        # Only entries with w < 1 are passed to acos.
        index = (Q3[:,3] < 1).nonzero()
        theta[index] = torch.acos(Q3[index,3]) * 2
        loss = torch.mean(theta * (1 / (1 + torch.exp(-logistic_beta1 * (theta - threshold)))))
        return loss, theta

class Optical_loss(torch.nn.Module):
    """Consistency loss between warping grids of consecutive frames under optical flow.

    The forward grid of frame t is sampled at the forward-flow positions and
    compared with the grid of frame t-1, and vice versa with the backward
    flow. Casts use ``torch.cuda.FloatTensor``, so this module requires CUDA.
    """
    def __init__(self):
        super(Optical_loss, self).__init__()
        self.static_options = get_static()
        # Sampling positions added to the flow in forward(); built with the
        # default size of get_mesh().
        self.mesh = get_mesh()

    def forward(self, Vt, Vt_1, flo, flo_back, real_projection_t, real_projection_t_1):
        # Vt / Vt_1: virtual poses passed to torch_GetVirtualProjection.
        # flo / flo_back: flow tensors laid out as [B,H,W,C]; they must match
        # the mesh size to be added to it.
        # Returns a scalar: the per-sample loss capped at 1, averaged over the batch.
        virtual_projection_t = torch_GetVirtualProjection(self.static_options, Vt) 
        virtual_projection_t_1 = torch_GetVirtualProjection(self.static_options, Vt_1) 

        b, h, w = flo.size()[:3]

        # Keep only the warped x/y channels, swap the last two axes and
        # upsample the coarse grid to the flow resolution.
        grid_t = torch_GetForwardGrid(self.static_options, real_projection_t, virtual_projection_t)[:,:2,:,:].permute(0,1,3,2)
        grid_t = torch.nn.functional.upsample_bilinear(grid_t, size = (h, w)) # [B,C(xy),H,W]

        grid_t_1 = torch_GetForwardGrid(self.static_options, real_projection_t_1, virtual_projection_t_1)[:,:2,:,:].permute(0,1,3,2) 
        grid_t_1 = torch.nn.functional.upsample_bilinear(grid_t_1, size = (h, w)) # [B,C(xy),H,W]
        
        # Turn relative flow into sampling positions by adding the mesh.
        mesh = self.mesh.repeat(b, 1, 1, 1)
        flo = flo + mesh 
        flo_back = flo_back + mesh # [B,H,W,C]

        # A position is valid when both coordinates lie strictly inside (0, 1).
        valid = (flo[:,:,:,0] > 0) * (flo[:,:,:,1] > 0) * (flo[:,:,:,0] < 1) * (flo[:,:,:,1] < 1)
        valid_f = torch.unsqueeze(valid, dim = 3).type(torch.cuda.FloatTensor)
        valid = torch.unsqueeze(valid, dim = 1).type(torch.cuda.FloatTensor)

        valid_back = (flo_back[:,:,:,0] > 0) * (flo_back[:,:,:,1] > 0) * (flo_back[:,:,:,0] < 1) * (flo_back[:,:,:,1] < 1)
        valid_back_f = torch.unsqueeze(valid_back, dim = 3).type(torch.cuda.FloatTensor) 
        valid_back = torch.unsqueeze(valid_back, dim = 1).type(torch.cuda.FloatTensor) # [B,C,H,W]

        # Map positions from [0, 1] to the [-1, 1] range used by grid_sample;
        # invalid positions are zeroed.
        flo = (flo * 2 - 1) * valid_f
        flo_back = (flo_back * 2 - 1) * valid_back_f

        forward_t = torch.nn.functional.grid_sample(grid_t, flo, padding_mode="reflection") # default bilinear
        backward_t_1 = torch.nn.functional.grid_sample(grid_t_1, flo_back, padding_mode="reflection") # default bilinear

        forward_diff = ((forward_t - grid_t_1) * valid) ** 2 
        backward_diff = ((backward_t_1 - grid_t) * valid_back) ** 2

        # Squared error summed per sample and divided by the number of valid
        # positions.
        # NOTE(review): a sample with no valid position divides by zero here.
        forward_loss = torch.sum(forward_diff, dim = (1,2,3)) / torch.sum(valid, dim = (1,2,3))
        backward_loss = torch.sum(backward_diff, dim = (1,2,3)) / torch.sum(valid_back, dim = (1,2,3))

        loss = forward_loss + backward_loss
        # Element-wise minimum with a tensor of ones: caps each sample's loss at 1.
        loss = torch.min(loss, loss - loss + 1) #[0]
        loss = torch.sum(loss) / b

        return loss 


def get_mesh(height = 270, width = 480, USE_CUDA = True):
    """Return the normalized pixel-centre coordinates of a ``height x width`` image.

    Args:
        height: number of rows.
        width: number of columns.
        USE_CUDA: move the result to the GPU when truthy.

    Returns:
        Float tensor of shape ``(1, height, width, 2)`` where entry
        ``[0, r, c]`` is ``((c + 0.5) / width, (r + 0.5) / height)``.
    """
    # Each axis is offset by half of its own sample spacing; the offsets used
    # to be swapped between x and y.
    xs = np.linspace(0, 1, width, endpoint = False) + 0.5 / width
    ys = np.linspace(0, 1, height, endpoint = False) + 0.5 / height
    xmesh, ymesh = np.meshgrid(xs, ys)
    # Stack the sampling positions into a 1 x H x W x 2 tensor
    mesh = torch.Tensor(np.expand_dims(np.stack((xmesh, ymesh), axis=-1), axis=0))
    if USE_CUDA:
        mesh = mesh.cuda()
    return mesh

class Undefine_loss(torch.nn.Module):
    """Penalize warps that push the corners of an inner rectangle out of bounds.

    Four corner points inset by ``ratio`` of the frame size are warped by the
    homography built from the real and virtual projections; any normalized
    coordinate outside ``[inner_ratio, 1 - inner_ratio]`` is penalized
    quadratically.
    """

    def __init__(self, ratio = 0.08, inner_ratio = 0.04, USE_CUDA = True):
        super(Undefine_loss, self).__init__()
        self.static_options = get_static() 
        self.inner_ratio = inner_ratio
        width = self.static_options["width"]
        height = self.static_options["height"]
        x0, x1, y0, y1 = \
            int(width*ratio), int(width*(1-ratio)), int(height*ratio), int(height*(1-ratio))
        # Homogeneous corner points in pixels and the normalization vector.
        self.norm = torch.Tensor([width, height, 1])
        self.p00 = torch.Tensor([x0, y0, 1])
        self.p01 = torch.Tensor([x0, y1, 1])
        self.p10 = torch.Tensor([x1, y0, 1])
        self.p11 = torch.Tensor([x1, y1, 1])
        if USE_CUDA == True:
            self.p00 = self.p00.cuda()
            self.p01 = self.p01.cuda()
            self.p10 = self.p10.cuda()
            self.p11 = self.p11.cuda()
            self.norm = self.norm.cuda()

    def forward(self, Vt, Rt, ratio = 0.04):
        """Return the batch-averaged out-of-bounds loss, capped at 1 per sample.

        Args:
            Vt: virtual poses.
            Rt: real poses (projected with the same helper as ``Vt``).
            ratio: unused, kept for interface compatibility.
        """
        batch_size = Vt.size()[0]

        virtual_projection_t = torch_GetVirtualProjection(self.static_options, Vt) 

        real_projection_t = torch_GetVirtualProjection(self.static_options, Rt) 

        # virtual projection and real projection
        transform = torch_GetHomographyTransformFromProjections(real_projection_t, virtual_projection_t)
        
        p00 = (torch_ApplyTransform(transform, self.p00) / self.norm)[:,:2]
        p01 = (torch_ApplyTransform(transform, self.p01) / self.norm)[:,:2]
        p10 = (torch_ApplyTransform(transform, self.p10) / self.norm)[:,:2]
        p11 = (torch_ApplyTransform(transform, self.p11) / self.norm)[:,:2]

        # Worst corner per sample.
        loss = torch.stack((self.get_loss(p00), self.get_loss(p01), self.get_loss(p10), self.get_loss(p11)),dim = 1)
        loss,_ = torch.max(loss, dim = 1)

        # Element-wise minimum with a tensor of ones: caps each sample's loss at 1.
        loss = torch.min(loss, loss - loss + 1) #[0]
        loss = torch.sum(loss) / batch_size

        return loss
    
    def get_loss(self, p):
        """Return the squared distance of each point outside the inner margin.

        Args:
            p: tensor of normalized points, one per row.

        Returns:
            Per-row sum of squared violations of ``[inner_ratio, 1 - inner_ratio]``.
        """
        # type_as keeps the masks on p's device and dtype, so the USE_CUDA=False
        # configuration works instead of forcing torch.cuda.FloatTensor.
        below = (p < self.inner_ratio).type_as(p)
        above = (p > (1 - self.inner_ratio)).type_as(p)
        d = (p - self.inner_ratio) * below + (1 - self.inner_ratio - p) * above
        return torch.sum(d**2, dim = 1) 


================================================
FILE: dvs/metrics.py
================================================
import os
import sys
import numpy as np
import cv2
import math
import pdb
import matplotlib.pyplot as plt
from printer import Printer
from warp import video2frame_one_seq
import datetime
import torch
import copy
import csv
import copyreg
import shutil
import matplotlib.pyplot as plt
from util import crop_video

def _pickle_keypoints(point):
    """Reduce a ``cv2.KeyPoint`` to ``(constructor, args)`` for copyreg."""
    x, y = point.pt
    ctor_args = (x, y, point.size, point.angle,
                 point.response, point.octave, point.class_id)
    return cv2.KeyPoint, ctor_args

# Register the reducer so that cv2.KeyPoint objects can be pickled.
copyreg.pickle(cv2.KeyPoint().__class__, _pickle_keypoints)

# Make only GPU 0 visible to CUDA in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Frame size (height, width) that images are resized to in metrics() and
# that crop_metric() uses as its reference rectangle.
h_size = 480
w_size = 640

def crop_metric(M):
    """Return the fraction of width and height left after warping by ``M``.

    The four corners of the ``w_size x h_size`` frame are warped by the
    homography ``M``; for each axis the largest inward displacement of an
    edge (never less than 0) is subtracted, as a fraction, from 1.
    """
    corners = np.array([[0,0,1],[0,h_size,1], [w_size,0,1], [w_size,h_size,1]]).T
    warped = np.matmul(M, corners).T
    warped = warped[:, :2] / warped[:, 2:]
    # Rows follow the corner order above: (0,0), (0,h), (w,0), (w,h).
    p00, p0h, pw0, pwh = warped

    x_loss = max(p00[0], p0h[0], w_size - pw0[0], w_size - pwh[0], 0)
    y_loss = max(p00[1], pw0[1], h_size - p0h[1], h_size - pwh[1], 0)
    return 1 - x_loss/w_size, 1 - y_loss/h_size

# https://stackoverflow.com/questions/34389125/how-to-get-the-scale-factor-of-getperspectivetransform-in-opencv
def get_scale(M):
    """Return the absolute diagonal of R from a QR decomposition derived from ``M``.

    The decomposed 2x2 matrix is the upper-left block of ``M`` minus the
    outer product of its last column (rows 0-1) and its last row (columns 0-1).
    """
    linear = M[:2, :2]
    last_column = M[:2, 2]
    last_row = M[2, :2]
    QR = linear - np.outer(last_column, last_row)
    _, R = np.linalg.qr(QR)
    return abs(R[0,0]), abs(R[1,1])

# https://stackoverflow.com/questions/21019338/how-to-change-the-homography-with-the-scale-of-the-image
def get_rescale_matrix(M, sx, sy):
    """Return ``M`` right-multiplied by the inverse scaling ``diag(1/sx, 1/sy, 1)``."""
    inverse_scale = np.diag([1/sx, 1/sy, 1.0])
    return np.matmul(M, inverse_scale)

# Part of code reference from https://github.com/jinsc37/DIFRINT/blob/master/metrics.py
def metrics(in_src, out_src, package, crop_scale = False, re_compute = False):
    """Compute, print and store stabilization metrics for two frame folders.

    For every ``.png`` file name found in ``out_src`` the frame with the same
    name is read from both folders, resized to ``w_size`` x ``h_size`` and
    matched with SIFT features to estimate an input->output homography. From
    it the cropping ratio, distortion value, per-patch correlation and crop
    extents are collected. Consecutive output frames are matched as well and
    their homographies are accumulated to build the translation / rotation
    signals used for the stability score.

    Args:
        in_src: Folder with the frames of the unstabilized video.
        out_src: Folder with the frames of the stabilized video. Frame names
            are assumed to end in a 5-digit index followed by ``.png``.
        package: Path of the result file written with ``torch.save``. A
            reduced copy without the SIFT cache is always written to
            ``package[:-3] + "_light.pt"``.
        crop_scale: When true, the estimated scale is divided out of the
            homography before the crop extents are measured.
        re_compute: When true and ``package`` already exists, the cached SIFT
            key points / descriptors are loaded from it and the full package
            is not rewritten.

    Returns:
        The minimum field-of-view ratio as a float (rounded to 4 decimals).
    """
    load_dic = None
    if re_compute and os.path.exists(package):
        print("Start load")
        # NOTE(review): torch.load unpickles arbitrary objects; only point
        # `package` at files produced by this function.
        load_dic = torch.load(package)
        print("Finish load")
    dic = {
        'M': None,
        'CR_seq': [],
        'DV_seq': [],
        'SS_t': None,
        'SS_r': None,
        'w_crop':[],
        'h_crop':[],
        'distortion': [],
        'count': 0,
        'in_sift': {},
        'out_sift': {},
        'fft_t': {},
        'fft_r': {}
        }

    if load_dic is not None:
        dic["in_sift"] = load_dic["in_sift"]
        dic["out_sift"] = load_dic["out_sift"]

    frameList_in = sorted(os.listdir(in_src))
    frameList = sorted(os.listdir(out_src))
    frameList = frameList[:min(len(frameList_in),len(frameList))]

    # Create brute-force matcher object
    bf = cv2.BFMatcher()
    # The detector is stateless across frames, so build it once.
    sift = cv2.SIFT_create()

    # Apply the homography transformation if we have enough good matches
    MIN_MATCH_COUNT = 10 #10

    ratio = 0.7 #0.7
    thresh = 5.0 #5.0

    def good_matches(query_desc, train_desc):
        """Return the k=2 nearest-neighbour matches that pass the ratio test."""
        # detectAndCompute yields None descriptors for a frame without
        # key points; treat that as "no matches" instead of crashing.
        if query_desc is None or train_desc is None:
            return []
        pairs = bf.knnMatch(query_desc, train_desc, k=2)
        # A pair can hold fewer than two candidates when the train set is
        # tiny; such pairs cannot be ratio-tested and are skipped.
        return [pair[0] for pair in pairs
                if len(pair) == 2 and pair[0].distance < ratio * pair[1].distance]

    Pt = np.asarray([[1.0,0.0,0.0],[0.0,1.0,0.0],[0.0,0.0,1.0]])
    # Start from the identity so that a first frame without enough matches
    # has a defined homography instead of raising NameError.
    # NOTE(review): as before, when a later frame fails to match, the most
    # recently estimated homography is reused for its statistics.
    M = np.eye(3)
    P_seq = []
    count = 1
    for f in frameList:
        if not f.endswith('.png'):
            continue

        # Load the images in gray scale
        img1 = cv2.imread(os.path.join(in_src, f), 0)
        img1 = cv2.resize(img1, (w_size,h_size), interpolation = cv2.INTER_LINEAR)

        img1o = cv2.imread(os.path.join(out_src, f), 0)
        img1o = cv2.resize(img1o, (w_size,h_size), interpolation = cv2.INTER_LINEAR)

        if f in dic["in_sift"]:
            keyPoints1, descriptors1 = dic["in_sift"][f]
        else:
            # Detect the SIFT key points and compute the descriptors for the two images
            keyPoints1, descriptors1 = sift.detectAndCompute(img1, None)
            dic["in_sift"][f] = (keyPoints1, descriptors1)

        if f in dic["out_sift"]:
            keyPoints1o, descriptors1o = dic["out_sift"][f]
        else:
            keyPoints1o, descriptors1o = sift.detectAndCompute(img1o, None)
            dic["out_sift"][f] = (keyPoints1o, descriptors1o)

        # Match the descriptors and keep the good matches (ratio test)
        goodMatches = good_matches(descriptors1, descriptors1o)

        if len(goodMatches) > MIN_MATCH_COUNT:
            # Get the good key points positions
            sourcePoints = np.float32([ keyPoints1[m.queryIdx].pt for m in goodMatches ]).reshape(-1, 1, 2)
            destinationPoints = np.float32([ keyPoints1o[m.trainIdx].pt for m in goodMatches ]).reshape(-1, 1, 2)

            H, _ = cv2.findHomography(sourcePoints, destinationPoints, method=cv2.RANSAC, ransacReprojThreshold=thresh)
            # findHomography may fail to produce a matrix; keep the previous
            # estimate in that case and skip the per-frame image statistics.
            if H is not None:
                M = H
                im_dst = cv2.warpPerspective(img1, M, (w_size,h_size))

                # Correlation between warped input and output on a 6x6 grid
                # of patches covering the central 60% of the frame.
                cm = []
                for i in range(6):
                    for j in range(6):
                        hs = int(h_size * (0.2 + 0.1 * i))
                        he = int(h_size * (0.3 + 0.1 * i))
                        ws = int(w_size * (0.2 + 0.1 * j))
                        we = int(w_size * (0.3 + 0.1 * j))
                        cm.append(np.corrcoef(img1o[hs:he, ws:we].flat, im_dst[hs:he, ws:we].flat))
                dic["distortion"].append(cm)

                if crop_scale:
                    sx, sy = get_scale(M)
                    M_scale = get_rescale_matrix(M, sx, sy)
                    w_crop, h_crop = crop_metric(M_scale)
                else:
                    w_crop, h_crop = crop_metric(M)
                dic["w_crop"].append(w_crop)
                dic["h_crop"].append(h_crop)

        # Obtain Scale, Translation, Rotation, Distortion value
        sx = M[0, 0]
        sy = M[1, 1]
        scaleRecovered = math.sqrt(np.abs(sx*sy))

        # Distortion value: ratio of the two eigenvalues of the affine part,
        # after sorting them in descending order.
        w, _ = np.linalg.eig(M[0:2,0:2])
        w = np.sort(w)[::-1]
        DV = w[1]/w[0]

        dic["CR_seq"].append(1.0/scaleRecovered)
        dic["DV_seq"].append(DV)

        # For Stability score calculation
        if count < len(frameList):
            # Name of the next frame: same prefix, 5-digit index incremented.
            f_path = f[:-9] + '%05d.png' % (int(f[-9:-4])+1)
            if f_path in dic["out_sift"]:
                keyPoints2o, descriptors2o = dic["out_sift"][f_path]
            else:
                img2o = cv2.imread(os.path.join(out_src, f_path), 0)
                img2o = cv2.resize(img2o, (w_size,h_size), interpolation = cv2.INTER_LINEAR)
                keyPoints2o, descriptors2o = sift.detectAndCompute(img2o, None)
                dic["out_sift"][f_path] = (keyPoints2o, descriptors2o)

            goodMatches = good_matches(descriptors1o, descriptors2o)

            if len(goodMatches) > MIN_MATCH_COUNT:
                # Get the good key points positions
                sourcePoints = np.float32([ keyPoints1o[m.queryIdx].pt for m in goodMatches ]).reshape(-1, 1, 2)
                destinationPoints = np.float32([ keyPoints2o[m.trainIdx].pt for m in goodMatches ]).reshape(-1, 1, 2)

                # Obtain the homography matrix
                H, _ = cv2.findHomography(sourcePoints, destinationPoints, method=cv2.RANSAC, ransacReprojThreshold=thresh)
                if H is not None:
                    M = H

            # Accumulate the frame-to-frame motion into the running path.
            Pt = np.matmul(Pt, M)
            P_seq.append(Pt)
        if count % 10 ==0:
            sys.stdout.write('\rFrame: ' + str(count) + '/' + str(len(frameList)))
            sys.stdout.flush()
        dic["count"] = count
        count += 1

    # Make 1D temporal signals
    trans_signal = []
    rot_signal = []
    for Mp in P_seq:
        sx = Mp[0, 0]
        sy = Mp[1, 1]
        c = Mp[0, 2]
        f = Mp[1, 2]

        trans_signal.append(math.sqrt(c*c + f*f))
        rot_signal.append(math.atan2(sx, sy) * 180 / math.pi)

    P_seq_t = np.asarray(trans_signal, dtype=float)
    P_seq_r = np.asarray(rot_signal, dtype=float)

    # FFT: power spectrum without the DC term, first half only
    fft_t = np.fft.fft(P_seq_t)
    fft_r = np.fft.fft(P_seq_r)
    fft_t = abs(fft_t)**2
    fft_r = abs(fft_r)**2

    fft_t = np.delete(fft_t, 0)
    fft_r = np.delete(fft_r, 0)
    fft_t = fft_t[:int(len(fft_t)/2)]
    fft_r = fft_r[:int(len(fft_r)/2)]

    dic["fft_t"] = fft_t
    dic["fft_r"] = fft_r

    # Stability score: share of energy in the 5 lowest non-DC frequencies.
    SS_t = np.sum(fft_t[:5])/np.sum(fft_t)
    SS_r = np.sum(fft_r[:5])/np.sum(fft_r)

    dic["CR_seq"] = np.array(dic["CR_seq"])
    dic["DV_seq"] = np.array(dic["DV_seq"])
    dic["w_crop"] = np.array(dic["w_crop"])
    dic["h_crop"] = np.array(dic["h_crop"])
    dic["distortion"] = np.array(dic["distortion"])
    dic["SS_t"] = SS_t
    dic["SS_r"] = SS_r

    if not (re_compute and os.path.exists(package)):
        torch.save(dic, package)

    # Only distortion values inside [0.5, 1] are reported.
    DV_seq = np.absolute(dic["DV_seq"])
    DV_seq = DV_seq[np.where((DV_seq >= 0.5) & (DV_seq <= 1))]
    Distortion = str.format('{0:.4f}', np.nanmin(DV_seq))
    Distortion_avg = str.format('{0:.4f}', np.nanmean(DV_seq))

    Trans = str.format('{0:.4f}', dic["SS_t"])
    Rot = str.format('{0:.4f}', dic["SS_r"])

    w_crop = crop_rm_outlier(dic["w_crop"])
    h_crop = crop_rm_outlier(dic["h_crop"])

    FOV = str.format( '{0:.4f}', min(np.nanmin(w_crop), np.nanmin(h_crop)) )
    FOV_avg = str.format( '{0:.4f}', (np.nanmean(w_crop)+np.nanmean(h_crop)) / 2 )

    # The first 10 distortion entries are excluded from the correlation summary.
    Correlation_avg = str.format( '{0:.4f}', np.nanmean(dic["distortion"][10:]) )
    Correlation_min = str.format( '{0:.4f}', np.nanmin(dic["distortion"][10:]) )

    # Print results
    print('\n***Distortion value (Avg, Min):')
    print(Distortion_avg +' | '+  Distortion)
    print('***Stability Score (Avg, Trans, Rot):')
    print(str.format('{0:.4f}',  (dic["SS_t"]+dic["SS_r"])/2) +' | '+ Trans +' | '+ Rot )
    print("=================")
    print('***FOV ratio (Avg, Min):')
    print( FOV_avg +' | '+ FOV )
    print('***Correlation value (Avg, Min):')
    print( Correlation_avg +' | '+ Correlation_min , "\n")

    # Drop the SIFT cache before writing the light-weight copy.
    dic['in_sift'] = 0
    dic['out_sift'] = 0
    torch.save(dic, package[:-3]+"_light.pt")
    return float(FOV)

def crop_rm_outlier(crop):
    """Return the crop ratios in ascending order without the low outliers.

    Values below 0.5 are discarded first; of the remaining values the five
    smallest are then removed. The result is a list (empty when five or
    fewer values survive the threshold).
    """
    values = np.array(crop)
    kept = values[values >= 0.5]
    ascending = list(np.sort(kept))
    return ascending[5:]

if __name__ == '__main__':
    # Evaluate one hard-coded input / stabilized video pair and write a
    # cropped version of the stabilized video using the measured FOV ratio.
    metric_path = os.path.join("./test/stabilzation/metric")
    if not os.path.exists(metric_path):
        os.makedirs(metric_path)

    in_video = "./video/s_114_outdoor_running_trail_daytime/ControlCam_20200930_104820.mp4"
    out_video = "./test/stabilzation/s_114_outdoor_running_trail_daytime_stab.mp4"
    in_folder = os.path.join(metric_path, "in_frame")
    out_folder = os.path.join(metric_path, "out_frame")

    # Frames are extracted only when the target folder does not exist yet.
    for video_file, frame_folder in ((in_video, in_folder), (out_video, out_folder)):
        if os.path.exists(frame_folder):
            continue
        os.makedirs(frame_folder)
        print("Convert video to frames")
        video2frame_one_seq(video_file, frame_folder)

    package = os.path.join(metric_path, "stabilzation.pt")
    FOV = metrics(in_folder, out_folder, package)

    crop_path = out_video[:-4] + "_crop.mp4"
    crop_video(out_video, crop_path, FOV)


================================================
FILE: dvs/model.py
================================================
import math
import torch
from collections import OrderedDict

import torch.nn as nn
import numpy as np
import util
import yaml
import os
from loss import C2_Smooth_loss, C1_Smooth_loss, Optical_loss, Undefine_loss, Angle_loss, Follow_loss, Stay_loss
from gyro import torch_norm_quat, torch_QuaternionProduct
import torch.nn.functional as F

# Maps the "activate_function" names used in the model config to nn.Module classes.
Activates = {"sigmoid": nn.Sigmoid, "relu": nn.ReLU, "tanh": nn.Tanh}

class LayerLSTM(nn.Module):
    """Single ``nn.LSTMCell`` that keeps its hidden / cell state on the module.

    The state must be created with :meth:`init_hidden` before the first call
    to :meth:`forward`; every forward call advances the state by one step.
    """

    def __init__(self, input_size, hidden_size, bias):
        super(LayerLSTM, self).__init__()
        self.LSTM = nn.LSTMCell(input_size, hidden_size, bias)
        self.hidden_size = hidden_size

    def init_hidden(self, batch_size):
        """Reset hidden and cell state to zeros of shape (batch_size, hidden_size).

        The state is allocated on the same device and with the same dtype as
        the cell's weights, so the layer works wherever the module has been
        moved to (previously the state was always placed on CUDA).
        """
        reference = self.LSTM.weight_ih
        shape = (batch_size, self.hidden_size)
        self.hx = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        self.cx = torch.zeros(shape, device=reference.device, dtype=reference.dtype)

    def forward(self, x):
        """Advance the cell by one step with input ``x`` and return the new hidden state."""
        self.hx, self.cx = self.LSTM(x, (self.hx, self.cx))
        return self.hx
        

class LayerCNN(nn.Module):
    """Conv2d block: convolution, optional batch norm, activation, optional max pooling.

    Args:
        in_channel: Number of input channels of the convolution.
        out_channel: Number of output channels of the convolution.
        kernel_size: Kernel size passed to ``nn.Conv2d``.
        stride: Stride passed to ``nn.Conv2d``.
        padding: Padding passed to ``nn.Conv2d``.
        pooling_size: Kernel size for ``nn.MaxPool2d``; ``None`` disables pooling.
        activation_function: Activation class (not instance), e.g. ``nn.ReLU``.
        batch_norm: Whether to insert ``nn.BatchNorm2d`` after the convolution.
    """

    def __init__(self, in_channel, out_channel, kernel_size, stride, padding, pooling_size=None, 
                        activation_function=nn.ReLU, batch_norm=True):
        super(LayerCNN, self).__init__()
        self.conv = nn.Conv2d(in_channel, out_channel, kernel_size=kernel_size, stride=stride, padding=padding)
        self.batch_norm = nn.BatchNorm2d(out_channel) if batch_norm else None
        # Not every activation accepts `inplace` (nn.Sigmoid and nn.Tanh do
        # not), so fall back to the default constructor for those.
        try:
            self.activation = activation_function(inplace=True)
        except TypeError:
            self.activation = activation_function()
        if pooling_size is not None:
            self.pooling = nn.MaxPool2d(pooling_size)
        else:
            self.pooling = None

    def forward(self, x):
        """Apply conv -> [batch norm] -> activation -> [max pool] to ``x``."""
        x = self.conv(x)     #x->[batch,channel,height,width]
        if self.batch_norm is not None:
            x = self.batch_norm(x)
        x = self.activation(x)
        if self.pooling is not None:
            x = self.pooling(x)
        return x

class LayerFC(nn.Module):
    """Fully connected block: [dropout] -> linear -> [batch norm] -> [activation].

    ``activation_function`` is an activation class (or ``None`` for a purely
    linear output); ``drop_out`` is the dropout probability, where a falsy
    value disables dropout altogether.
    """

    def __init__(self, in_features, out_features, bias, drop_out=0, activation_function=nn.ReLU, batch_norm = False):
        super(LayerFC, self).__init__()
        self.fc = nn.Linear(in_features, out_features, bias=bias)

        # The activation is built without `inplace` on purpose.
        if activation_function is None:
            self.activation = None
        else:
            self.activation = activation_function()

        if drop_out:
            self.dropout = nn.Dropout(p=drop_out, inplace=False)
        else:
            self.dropout = None

        if batch_norm:
            self.batch_norm = nn.BatchNorm1d(out_features)
        else:
            self.batch_norm = None

    def forward(self, x):
        """Run ``x`` through the enabled stages in their fixed order."""
        pipeline = (self.dropout, self.fc, self.batch_norm, self.activation)
        for stage in pipeline:
            if stage is not None:
                x = stage(x)
        return x

class Net(nn.Module):
    """Recurrent network that maps one step of inputs to a normalized quaternion.

    The architecture is read from ``cf["model"]``: an optional stack of
    ``LayerCNN`` blocks, a stack of ``LayerLSTM`` cells and a stack of
    ``LayerFC`` blocks whose last layer has no activation. The forward pass
    concatenates the flattened features with the flow embedding ``flo``
    (unless ``no_flo`` is set), runs the recurrent and fully connected
    stacks and normalizes the result with ``torch_norm_quat``.
    """

    def __init__(self, cf):
        super(Net, self).__init__()
        model_cf = cf["model"]
        self.cnn_param = model_cf["cnn"]
        self.rnn_param = model_cf["rnn"]
        self.fc_param = model_cf["fc"]
        self.unit_size = 4
        self.no_flo = False

        # One entry per real sample (past, current, future) plus the virtual ones.
        n_entries = 2*cf["data"]["number_real"]+1+cf["data"]["number_virtual"]
        if self.no_flo is False:
            # 64 extra features come from the flow embedding.
            self._rnn_input_size = n_entries * 4 + 64
        else:
            self._rnn_input_size = n_entries * self.unit_size

        # ---- CNN layers -------------------------------------------------
        cnn_activation = Activates[self.cnn_param["activate_function"]]
        cnn_batch_norm = self.cnn_param["batch_norm"]
        cnn_layer_param = self.cnn_param["layers"]
        if cnn_layer_param is None:
            self.convs = None
            out_channel = cf["data"]["channel_size"]
        else:
            conv_blocks = []
            for layer, spec in enumerate(cnn_layer_param):
                # NOTE(review): layer specs are strings evaluated with eval();
                # only load configuration files from trusted sources.
                channels = eval(spec[0])
                in_channel, out_channel = channels[0], channels[1]
                kernel_size = eval(spec[1])
                stride = eval(spec[2])
                padding = eval(spec[3])
                pooling_size = eval(spec[4])

                block = LayerCNN(in_channel, out_channel, kernel_size, stride, padding, pooling_size,
                                 activation_function=cnn_activation, batch_norm=cnn_batch_norm)
                conv_blocks.append(('%d' % layer, block))

                # Track the feature width after the convolution and, if
                # present, the pooling of this layer.
                self._rnn_input_size = int(math.floor((self._rnn_input_size+2*padding[1]-kernel_size[1])/stride[1])+1)
                if pooling_size is not None:
                    self._rnn_input_size = int(math.floor((self._rnn_input_size-pooling_size[1])/pooling_size[1])+1)
            self.convs = nn.Sequential(OrderedDict(conv_blocks))

        use_gap = self.cnn_param["gap"]
        self.gap = nn.AvgPool2d(self._rnn_input_size) if use_gap else None
        if self.cnn_param["gap"]:
            self._rnn_input_size = out_channel
        else:
            self._rnn_input_size = out_channel*(self._rnn_input_size)

        # ---- RNN layers -------------------------------------------------
        rnn_layer_param = self.rnn_param["layers"]
        rnn_layers = len(rnn_layer_param)
        recurrent = []
        for layer, spec in enumerate(rnn_layer_param):
            # The first cell consumes the features, later cells the previous hidden size.
            in_size = rnn_layer_param[layer-1][0] if layer else self._rnn_input_size
            recurrent.append(('%d'%layer, LayerLSTM(in_size, spec[0], spec[1])))
        self.rnns = nn.Sequential(OrderedDict(recurrent))

        self._fc_input_size = rnn_layer_param[rnn_layers-1][0] #* 2 # ois

        # ---- FC layers --------------------------------------------------
        fc_activation = Activates[self.fc_param["activate_function"]]
        fc_batch_norm = self.fc_param["batch_norm"]
        fc_layer_param = self.fc_param["layers"]
        fc_drop_out = self.fc_param["drop_out"]
        fc_layers = len(fc_layer_param)

        dense = []
        for layer, spec in enumerate(fc_layer_param):
            in_size = fc_layer_param[layer-1][0] if layer else self._fc_input_size
            # The output layer stays linear; all earlier layers get the activation.
            activation = None if layer == fc_layers-1 else fc_activation
            dense.append(('%d'%layer,
                          LayerFC(in_size, spec[0], spec[1], fc_drop_out, activation, fc_batch_norm)))

        self.class_num = fc_layer_param[fc_layers-1][0]
        self.fcs = nn.Sequential(OrderedDict(dense))

    def init_hidden(self, batch_size):
        """Reset the state of every recurrent layer for a batch of ``batch_size``."""
        for rnn in self.rnns:
            rnn.init_hidden(batch_size)

    def forward(self, x, flo, ois):
        """Return the normalized output for one step.

        ``x`` is unpacked as a 2-D tensor (batch first); ``flo`` is
        concatenated along dim 1 unless ``no_flo`` is set. ``ois`` is
        accepted but not used here.
        """
        batch, _ = x.size()
        if self.convs is not None:
            x = self.convs(x)
        if self.gap is not None:
            x = self.gap(x)
        features = x.view(batch, -1)
        if self.no_flo is False:
            features = torch.cat((features, flo), dim = 1)
        hidden = self.rnns(features)
        raw = self.fcs(hidden) # [b, 4]
        return torch_norm_quat(raw)

class Model():
    """Bundles the recurrent ``Net``, the flow encoder ``UNet`` and the training losses.

    Loss weights are read from ``cf["loss"]``; a weight of 0 (or less)
    disables the corresponding term.
    """

    def __init__(self, cf):
        super().__init__()
        self.net = Net(cf)
        self.unet = UNet()
        self.init_weights(cf)

        # Loss modules
        self.loss_smooth = C1_Smooth_loss()
        self.loss_follow = Follow_loss()
        self.loss_c2_smooth = C2_Smooth_loss()
        self.loss_optical = Optical_loss()
        self.loss_undefine = Undefine_loss(ratio = 0.08)
        self.loss_angle = Angle_loss()
        self.loss_stay = Stay_loss()

        # Loss weights
        loss_cf = cf["loss"]
        self.loss_smooth_w = loss_cf["smooth"]
        self.loss_angle_w = loss_cf["angle"]
        self.loss_follow_w = loss_cf["follow"]
        self.loss_c2_smooth_w = loss_cf["c2_smooth"]
        self.loss_undefine_w = loss_cf["undefine"]
        self.loss_opt_w = loss_cf["opt"]
        self.loss_stay_w = loss_cf["stay"]

        # Per-offset weights applied to the undefine loss.
        self.gaussian_weight = np.array([0.072254, 0.071257, 0.068349, 0.063764, 0.057856, 0.051058, 0.043824, 0.036585, 0.029705, 0.023457, 0.01801])

    def loss(
        self, out, vt_1, virtual_inputs, real_inputs, flo, flo_back, 
        real_projections_t, real_projections_t_1, real_postion_anchor, 
        follow = True, undefine = True, optical = True, stay = False
        ):
        """Return a CUDA tensor with the 7 weighted loss terms for one step.

        Index layout: 0 follow, 1 angle, 2 smooth, 3 c2 smooth, 4 undefine,
        5 optical, 6 stay. A term stays 0 when its weight is not positive or
        its flag (``follow`` / ``undefine`` / ``optical`` / ``stay``) is off.
        """
        unit_size = self.net.unit_size
        mid = real_inputs.size()[1]//(2*unit_size)

        def real_at(index):
            # 4 values of the real input entry at position `index`.
            start = unit_size*index
            return real_inputs[:, start:start+4]

        last_virtual = virtual_inputs[:, -4:]
        Rt = real_at(mid)
        v_pos = torch_QuaternionProduct(out, last_virtual)
        r_pos = torch_QuaternionProduct(v_pos, real_postion_anchor)

        terms = torch.zeros(7).cuda()

        if self.loss_follow_w > 0 and follow:
            # Follow the real entries in a window of +-2 around the middle one.
            for offset in range(-2,3):
                terms[0] += self.loss_follow_w * self.loss_follow(v_pos, real_at(offset+mid), None)

        if self.loss_angle_w > 0 and follow:
            threshold = 6 / 180 * 3.1415926
            angle_term, _ = self.loss_angle(v_pos, Rt, threshold = threshold)
            terms[1] = self.loss_angle_w * angle_term

        if self.loss_smooth_w > 0:
            terms[2] = self.loss_smooth_w * self.loss_smooth(out)

        if self.loss_c2_smooth_w > 0:
            terms[3] = self.loss_c2_smooth_w * self.loss_c2_smooth(out, last_virtual, virtual_inputs[:, -8:-4])

        if self.loss_undefine_w > 0 and undefine:
            predicted = v_pos.clone()
            for offset in range(0, 10, 2):
                weight = self.loss_undefine_w * self.gaussian_weight[offset]
                terms[4] += weight * self.loss_undefine(predicted, real_at(mid+offset))
                # Offsets advance by two, so the output is applied twice.
                predicted = torch_QuaternionProduct(out, predicted)
                predicted = torch_QuaternionProduct(out, predicted)

        if self.loss_opt_w > 0 and optical:
            terms[5] = self.loss_opt_w * self.loss_optical(r_pos, vt_1, flo, flo_back, real_projections_t, real_projections_t_1)

        if self.loss_stay_w > 0 and stay:
            terms[6] = self.loss_stay_w * self.loss_stay(out)

        return terms


    def init_weights(self, cf):
        """Apply the Xavier initialisation named by ``cf["train"]["init"]`` to conv / linear weights.

        Both networks are visited, ``net`` first; any other init name leaves
        the weights as constructed.
        """
        weighted_layers = (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear)
        for network in (self.net, self.unet):
            for m in network.modules():
                if not isinstance(m, weighted_layers):
                    continue
                if cf["train"]["init"] == "xavier_uniform":
                    nn.init.xavier_uniform_(m.weight.data)
                elif cf["train"]["init"] == "xavier_normal":
                    nn.init.xavier_normal_(m.weight.data)

    def save_checkpoint(self, epoch = 0, optimizer=None):
        """Build (but do not write) the checkpoint dictionary for the current state."""
        package = {
                'cnn': self.net.cnn_param,
                'fc': self.net.fc_param,
                'state_dict': self.net.state_dict(),
                }
        if optimizer is not None:
            package['optim_dict'] = optimizer.state_dict()
        if self.unet is not None:
            package['unet'] = self.unet.state_dict()
        package["epoch"] = epoch
        return package


class UNet(nn.Module):
    """Encoder that reduces a channel-last flow map to a 64-dimensional vector.

    Only the contracting path is present: one ``DoubleConv`` followed by four
    ``Down`` stages and a fully connected layer. ``n_classes`` and
    ``bilinear`` are stored but not used by the layers built here.
    """

    def __init__(self, n_channels = 4, n_classes = 16, bilinear=True):
        super(UNet, self).__init__()
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.bilinear = bilinear

        self.inc = DoubleConv(n_channels, 8)
        self.down1 = Down(8, 16)
        self.down2 = Down(16, 32)
        self.down3 = Down(32, 64)
        self.down4 = Down(64, 128)
        # The FC layer expects the encoder output to flatten to 128 values
        # per sample (128 channels at 1x1 resolution).
        self._fc_input_size = 128 * 1 * 1
        self.fc = LayerFC(self._fc_input_size, 64, bias = True)

    def forward(self, x, x_back = None):
        """Encode ``x`` (optionally concatenated with ``x_back`` on the last dim)."""
        if x_back is not None:
            x = torch.cat((x,x_back), dim =3)
        # Move the last dimension to position 1 before the convolutions.
        x = x.permute(0,3,1,2)
        batch = x.size()[0]

        encoder = (self.inc, self.down1, self.down2, self.down3, self.down4)
        for stage in encoder:
            x = stage(x)

        flat = torch.reshape(x, (batch, -1))
        return self.fc(flat)


class DoubleConv(nn.Module):
    """A single 3x3 convolution (padding 1) followed by ReLU.

    Despite the name there is only one convolution and no batch norm. The
    number of output channels is ``mid_channels`` when it is given (truthy),
    otherwise ``out_channels``.
    """

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        width = mid_channels or out_channels
        conv = nn.Conv2d(in_channels, width, kernel_size=3, padding=1)
        self.double_conv = nn.Sequential(conv, nn.ReLU(inplace=True))

    def forward(self, x):
        """Apply the convolution and ReLU to ``x``."""
        out = self.double_conv(x)
        return out


class Down(nn.Module):
    """Downscaling stage: 4x4 max pooling followed by a ``DoubleConv``."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        pool = nn.MaxPool2d(4)
        conv = DoubleConv(in_channels, out_channels)
        self.maxpool_conv = nn.Sequential(pool, conv)

    def forward(self, x):
        """Pool ``x`` and apply the convolution block."""
        out = self.maxpool_conv(x)
        return out


class Up(nn.Module):
    """Upscaling stage: upsample ``x1``, pad it to ``x2``'s size, concatenate, convolve.

    With ``bilinear=True`` a bilinear ``nn.Upsample`` (factor 2) is used and
    the convolution block is built with ``mid_channels = in_channels // 2``;
    otherwise a stride-2 ``nn.ConvTranspose2d`` halves the channel count.
    """

    def __init__(self, in_channels, out_channels, bilinear=True):
        super().__init__()

        half = in_channels // 2
        if not bilinear:
            self.up = nn.ConvTranspose2d(in_channels , half, kernel_size=2, stride=2)
            self.conv = DoubleConv(in_channels, out_channels)
        else:
            # Bilinear upsampling keeps the channel count; the convolution
            # block is responsible for reducing it.
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
            self.conv = DoubleConv(in_channels, out_channels, half)


    def forward(self, x1, x2):
        """Upsample ``x1``, align it with ``x2`` and convolve their concatenation."""
        upsampled = self.up(x1)

        # Inputs are laid out as (N, C, H, W): pad height and width so the
        # upsampled map matches the skip connection.
        gap_h = x2.size()[2] - upsampled.size()[2]
        gap_w = x2.size()[3] - upsampled.size()[3]
        left, top = gap_w // 2, gap_h // 2
        upsampled = F.pad(upsampled, [left, gap_w - left, top, gap_h - top])

        # if you have padding issues, see
        # https://github.com/HaiyongJiang/U-Net-Pytorch-Unstructured-Buggy/commit/0e854509c2cea854e247a9c615f175f76fbb2e3a
        # https://github.com/xiaopeng-liao/Pytorch-UNet/commit/8ebac70e633bac59fc22bb5195e513d5832fb3bd
        merged = torch.cat([x2, upsampled], dim=1)
        return self.conv(merged)


class OutConv(nn.Module):
    """Output projection: a 1x1 convolution from ``in_channels`` to ``out_channels``."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        """Apply the 1x1 convolution to ``x``."""
        projected = self.conv(x)
        return projected

================================================
FILE: dvs/printer.py
================================================
import sys

class Printer(object):
    """Tee-like replacement for ``sys.stdout`` that writes to several files.

    Typical use::

        printer = Printer(sys.stdout, log_file).open()
        print("goes to the console and the log")
        printer.close()
    """

    def __init__(self, *files):
        # File-like objects that receive everything written to this printer.
        self.files = files

    #Redirect Printer
    def open(self):
        """Install this object as ``sys.stdout`` and return it.

        The stream being replaced is remembered in ``sys._stdout`` the first
        time any printer is opened, so nested printers restore the real one.
        """
        if not hasattr(sys, '_stdout'):
            sys._stdout = sys.stdout
        sys.stdout = self
        return self

    #Restore the Default Printer
    def close(self):
        """Close every target except the saved stdout and restore ``sys.stdout``.

        Calling this without a previous ``open()`` no longer raises: the
        current ``sys.stdout`` is then treated as the stream to keep.
        """
        stdout = getattr(sys, '_stdout', sys.stdout)
        for f in self.files:
            if f != stdout:
                f.close()
        sys.stdout = stdout

    #Overloading write() Function
    def write(self, obj):
        """Write ``obj`` to every target, flushing each one immediately."""
        for f in self.files:
            f.write(obj)
            f.flush()

    def flush(self):
        """No-op: ``write`` already flushes every target."""
        pass

if __name__ == '__main__':
    # Manual smoke test: the middle message goes to the console and to
    # ./test.txt, the other two to the console only.
    print("Start testing")
    log_file = open('./test.txt', 'w+')
    t = Printer(sys.stdout, log_file)
    t.open()
    print("In files")
    t.close()
    print("Not in files")

================================================
FILE: dvs/requirements.txt
================================================
colorama==0.4.4
ffmpeg==1.4
imageio==2.9.0
matplotlib==3.3.4
opencv-contrib-python==4.5.1.48
opencv-python==4.5.1.48
pytz==2021.1
PyYAML==5.4.1
scipy==1.5.4
tensorboardX==2.1
tqdm==4.59.0

================================================
FILE: dvs/train.py
================================================
import os
import sys
import torch
import torchvision
import torch.nn as nn
from torch.autograd import Variable

import time
import yaml
import argparse
import numpy as np
from printer import Printer
from dataset import get_data_loader
from model import Model
import datetime
import copy
from util import make_dir, get_optimizer, AverageMeter, save_train_info, norm_flow
from gyro import torch_QuaternionProduct, torch_QuaternionReciprocal, torch_norm_quat

# Set at import time: expose only GPU 0 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

def run_epoch(model, loader, cf, epoch, lr, optimizer=None, is_training=True, USE_CUDA=True, clip_norm=0):
    """Run one pass over ``loader`` and return the average summed loss.

    Each batch is processed step by step: the virtual inputs for the step are
    fetched from the dataset, the flow encoder and the recurrent net are
    evaluated, the loss terms are computed and (when training) one optimizer
    update is made per step. The network output is then fed back into the
    dataset's virtual queue.

    Args:
        model: Object exposing ``net``, ``unet`` and ``loss(...)``.
        loader: Data loader whose ``dataset`` provides
            ``random_init_virtual_queue``, ``get_virtual_data`` and
            ``update_virtual_queue``.
        cf: Configuration dictionary (``cf['data']`` is read).
        epoch: Current epoch; selects which loss terms are active.
        lr: Unused here; kept for interface compatibility.
        optimizer: Optimizer to step; required when ``is_training`` is true.
        is_training: Train (with updates) or evaluate (without gradients).
        USE_CUDA: Move the per-step tensors to the GPU when true.
        clip_norm: Max gradient norm; a falsy value disables clipping.

    Returns:
        The running average kept by ``AverageMeter`` over all steps.

    Raises:
        ValueError: If ``is_training`` is true but no optimizer is given.
    """
    if is_training and optimizer is None:
        raise ValueError("run_epoch: an optimizer is required when is_training=True")

    no_flo = False
    number_virtual, number_real = cf['data']["number_virtual"], cf['data']["number_real"]
    avg_loss = AverageMeter()
    if is_training:
        model.net.train()
        model.unet.train()
    else:
        model.net.eval()
        model.unet.eval()

    # Loss schedule (constant within an epoch): follow/angle terms up to
    # epoch 30, undefine term afterwards, optical and stay terms after 40.
    follow = epoch <= 30
    undefine = epoch > 30
    optical = epoch > 40

    def to_device(tensor):
        # Previously several step tensors were only bound on the CUDA path,
        # which made USE_CUDA=False fail with NameError.
        return tensor.cuda() if USE_CUDA else tensor

    for i, data in enumerate(loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        real_inputs, times, flo, flo_back, real_projections, real_postion, ois, real_queue_idx = data
        print("Fininsh Load data")

        real_inputs = real_inputs.type(torch.float) #[b,60,84=21*4]
        real_projections = real_projections.type(torch.float) 
        flo = flo.type(torch.float) 
        flo_back = flo_back.type(torch.float) 
        ois = ois.type(torch.float) 

        batch_size, step, dim = real_inputs.size()
        times = times.numpy()
        real_queue_idx = real_queue_idx.numpy()
        virtual_queue = loader.dataset.random_init_virtual_queue(batch_size, real_postion[:,0,:].numpy(), times[:,1]) # TODO
        # virtual_queue = [None] * batch_size
        model.net.init_hidden(batch_size)
        for j in range(step):
            virtual_inputs, vt_1 = loader.dataset.get_virtual_data(
                virtual_queue, real_queue_idx, times[:, j], times[:, j+1], times[:, 0], batch_size, number_virtual, real_postion[:,j]) 

            real_inputs_step = real_inputs[:,j,:]
            inputs = torch.cat((real_inputs_step,virtual_inputs), dim = 1) 

            real_inputs_step = to_device(real_inputs_step)
            virtual_inputs = to_device(virtual_inputs)
            inputs = to_device(inputs)
            vt_1 = to_device(vt_1)
            real_projections_t = to_device(real_projections[:,j+1])
            real_projections_t_1 = to_device(real_projections[:,j])
            real_postion_anchor = to_device(real_postion[:,j])
            ois_step = to_device(ois[:,j])

            if no_flo is False:
                flo_step = to_device(flo[:,j])
                flo_back_step = to_device(flo_back[:,j])
                b, h, w, _ = flo_step.size()
                flo_step = norm_flow(flo_step, h, w)
                flo_back_step = norm_flow(flo_back_step, h, w)
            else:
                flo_step = None
                flo_back_step = None

            # Gradients are tracked only while training.
            with torch.set_grad_enabled(is_training):
                if no_flo is False:
                    flo_out = model.unet(flo_step, flo_back_step)
                else:
                    flo_out = None

                # The very first step of a sequence is fed through the net
                # twice, which advances the recurrent state twice.
                passes = 2 if j < 1 else 1
                for _ in range(passes):
                    out = model.net(inputs, flo_out, ois_step)

            loss_step = model.loss(out, vt_1, virtual_inputs, real_inputs_step, \
                flo_step, flo_back_step, real_projections_t, real_projections_t_1, real_postion_anchor, \
                follow = follow, undefine = undefine, optical = optical, stay = optical)

            virtual_position = virtual_inputs[:, -4:]
            pos = torch_QuaternionProduct(virtual_position, real_postion_anchor)
            out = torch_QuaternionProduct(out, pos)

            # Always hand a detached numpy array to the dataset, regardless
            # of the device the tensors live on.
            out = out.detach().cpu().numpy()

            virtual_queue = loader.dataset.update_virtual_queue(batch_size, virtual_queue, out, times[:,j+1])

            if (j+1) % 10 == 0:
                print("Step: "+str(j+1)+"/"+str(step))
                print(loss_step)
            loss = torch.sum(loss_step)
            if is_training:
                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                if clip_norm:
                    nn.utils.clip_grad_norm_(model.net.parameters(), max_norm=clip_norm)
                    nn.utils.clip_grad_norm_(model.unet.parameters(), max_norm=clip_norm)
                optimizer.step()

            avg_loss.update(loss.item(), batch_size) 

    return avg_loss.avg


def train(args = None):
    """Train the stabilization model described by a YAML config file.

    Loads the config at ``args.config``, builds the model and optimizer,
    optionally resumes from the checkpoint named by
    ``cf["model"]["load_model"]``, then runs one training pass and one test
    pass per epoch. A checkpoint is written every ``cf["train"]["snapshot"]``
    epochs.

    Args:
        args: Parsed command-line namespace; only ``args.config`` (path to the
            YAML config file) is read.
    """
    torch.autograd.set_detect_anomaly(True)
    config_file = args.config
    # yaml.load() without an explicit Loader is deprecated and raises on
    # PyYAML >= 6; safe_load is enough for a plain config file. The context
    # manager also closes the file handle, which was previously leaked.
    with open(config_file, 'r') as config_stream:
        cf = yaml.safe_load(config_stream)

    USE_CUDA = cf['data']["use_cuda"]
    seed = cf['train']["seed"]

    torch.manual_seed(seed)
    if USE_CUDA:
        torch.cuda.manual_seed(seed)

    checkpoints_dir = cf['data']['checkpoints_dir']
    epochs = cf["train"]["epoch"]
    snapshot = cf["train"]["snapshot"]
    decay_epoch = cf['train']['decay_epoch']
    init_lr = cf["train"]["init_lr"]
    lr_decay = cf["train"]["lr_decay"]
    lr_step = cf["train"]["lr_step"]
    clip_norm = cf["train"]["clip_norm"]
    load_model = cf["model"]["load_model"]

    checkpoints_dir = make_dir(checkpoints_dir, cf)

    # Start a fresh log for a new run, append when resuming from a checkpoint.
    # The handle is handed to Printer and intentionally stays open.
    log_path = os.path.join(cf["data"]["log"], cf['data']['exp']+'.log')
    log_file = open(log_path, 'w+' if load_model is None else 'a')
    printer = Printer(sys.stdout, log_file).open()

    print('----Print Arguments Setting------')
    for key in cf:
        print('{}:'.format(key))
        for para in cf[key]:
            print('{:50}:{}'.format(para,cf[key][para]))
        print('\n')

    # Define the model
    model = Model(cf)
    optimizer = get_optimizer(cf["train"]["optimizer"], model, init_lr, cf)

    for idx, m in enumerate(model.net.children()):
        print('{}:{}'.format(idx,m))
    for idx, m in enumerate(model.unet.children()):
        print('{}:{}'.format(idx,m))

    if load_model is not None:
        print("------Load Pretrined Model--------")
        checkpoint = torch.load(load_model)
        model.net.load_state_dict(checkpoint['state_dict'])
        model.unet.load_state_dict(checkpoint['unet'])
        print("------Resume Training Process-----")
        optimizer.load_state_dict(checkpoint['optim_dict'])
        epoch_load = checkpoint['epoch']
        print("Epoch load: ", epoch_load)
    else:
        epoch_load = 0

    if USE_CUDA:
        model.net.cuda()
        model.unet.cuda()
        if load_model is not None:
            # The restored optimizer state must live on the same device as
            # the parameters it belongs to.
            for state in optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.cuda()

    if load_model is not None:
        # Continue from the learning rate stored in the checkpoint. This used
        # to happen only on the CUDA path, so a CPU resume reported the
        # config's initial rate instead of the optimizer's real one.
        for param in optimizer.param_groups:
            init_lr = param['lr']

    print("-----------Load Dataset----------")
    train_loader, test_loader = get_data_loader(cf, no_flo = False)

    print("----------Start Training----------")
    currentDT = datetime.datetime.now()
    print(currentDT.strftime(" %Y-%m-%d %H:%M:%S"))

    start_time = time.time()

    # A non-zero lr_step replaces the explicit decay_epoch list with a
    # regular schedule: decay every lr_step epochs.
    if lr_step:
        decay_epoch = list(range(1+lr_step, epochs+1, lr_step))

    lr = init_lr

    for count in range(epoch_load+1, epochs+1):
        if decay_epoch is not None and count in decay_epoch:
            lr *= lr_decay
            for param in optimizer.param_groups:
                param['lr'] *= lr_decay

        print("Epoch: %d, learning_rate: %.5f" % (count, lr))

        train_loss = run_epoch(model, train_loader, cf, count, lr, optimizer=optimizer, clip_norm=clip_norm, is_training=True, USE_CUDA=USE_CUDA)

        test_loss = run_epoch(model, test_loader, cf, count, lr, is_training=False, USE_CUDA=USE_CUDA)

        time_used = (time.time() - start_time) / 60
        print("Epoch %d done | TrLoss: %.4f | TestLoss: %.4f | Time_used: %.4f minutes" % (
            count, train_loss,  test_loss, time_used))

        if count % snapshot == 0:
            # Keep a per-epoch snapshot and refresh the rolling "last" one.
            save_train_info("epoch", checkpoints_dir, cf, model, count, optimizer)
            save_train_info("last", checkpoints_dir, cf, model, count, optimizer)
            print("Model stored at epoch %d"%count)

    currentDT = datetime.datetime.now()
    print(currentDT.strftime(" %Y-%m-%d %H:%M:%S"))
    print("------------End Training----------")
    return

if __name__ == '__main__':
    # Command-line entry point: the only option is the training config path.
    arg_parser = argparse.ArgumentParser("Training model")
    arg_parser.add_argument("--config", default="./conf/stabilzation_train.yaml", help="Config file.")
    train(args = arg_parser.parse_args())

================================================
FILE: dvs/util.py
================================================
import os
import torch
import cv2
from itertools import chain
from warp import load_video, save_video
import numpy as np
import matplotlib.pyplot as plt
from gyro import get_rotations
import shutil

def save_train_info(name, checkpoints_dir, cf, model, count, optimizer = None):
    """Serialize a checkpoint of ``model`` into ``checkpoints_dir``.

    The file name is the experiment name (``cf['data']['exp']``) followed by
    a suffix chosen by ``name``: ``"last"`` gives ``_last.checkpoint``,
    ``"best"`` gives ``_best.checkpoint`` and any other value gives
    ``_epoch<count>.checkpoint``. The payload is whatever
    ``model.save_checkpoint(epoch=count, optimizer=optimizer)`` returns.
    """
    if name == "last":
        suffix = '_last.checkpoint'
    elif name == "best":
        suffix = '_best.checkpoint'
    else:
        suffix = '_epoch%d.checkpoint'%count
    target = os.path.join(checkpoints_dir, cf['data']['exp'] + suffix)
    payload = model.save_checkpoint(epoch = count, optimizer=optimizer)
    torch.save(payload, target)

def make_dir(checkpoints_dir ,cf):
    """Create every output directory an experiment needs.

    Ensures that ``checkpoints_dir``, the log directory
    (``cf["data"]["log"]``), ``./test``, ``./test/<exp>`` and
    ``<checkpoints_dir>/<exp>`` exist, where ``<exp>`` is
    ``cf['data']['exp']``.

    Returns:
        The per-experiment checkpoint directory ``<checkpoints_dir>/<exp>``.
    """
    def ensure(directory):
        # Create the directory (with parents) unless the path already exists.
        if not os.path.exists(directory):
            os.makedirs(directory)
        return directory

    ensure(checkpoints_dir)
    ensure(cf["data"]["log"])
    ensure(os.path.join(ensure("./test"), cf['data']['exp']))
    return ensure(os.path.join(checkpoints_dir, cf['data']['exp']))

def get_optimizer(optimizer, model, init_lr, cf):
    """Build the optimizer named by ``optimizer`` over both sub-networks.

    ``"adam"`` yields ``torch.optim.Adam`` (weight decay taken from
    ``cf["train"]["weight_decay"]``) and ``"sgd"`` yields ``torch.optim.SGD``
    (momentum taken from ``cf["train"]["momentum"]``). Both optimize the
    parameters of ``model.net`` followed by those of ``model.unet``.
    Any other value is returned unchanged.
    """
    if optimizer not in ("adam", "sgd"):
        return optimizer
    trainable = chain(model.net.parameters(), model.unet.parameters())
    if optimizer == "adam":
        return torch.optim.Adam(trainable, lr=init_lr, weight_decay=cf["train"]["weight_decay"])
    return torch.optim.SGD(trainable, lr=init_lr, momentum=cf["train"]["momentum"])

def crop_video(in_path, out_path, crop_ratio):
    """Crop the border of every frame and scale the crop back to full size.

    Args:
        in_path: Video to read.
        out_path: Where the cropped video is written.
        crop_ratio: Fraction of each dimension that is kept, measured
            symmetrically around the frame centre (rows
            ``(1-crop_ratio)*H+1 .. crop_ratio*H-1``, same for columns).

    Raises:
        ValueError: If ``crop_ratio`` leaves an empty crop window.
    """
    frame_array, fps, size = load_video(in_path)
    # load_video reports size as (width, height). Use the real dimensions
    # instead of assuming 1920x1080 footage.
    width, height = size
    hs = int((1-crop_ratio)*height) + 1
    he = int(crop_ratio*height) - 1
    ws = int((1-crop_ratio)*width) + 1
    we = int(crop_ratio*width) - 1
    if hs >= he or ws >= we:
        # Without this check cv2.resize fails later with an opaque error.
        raise ValueError("crop_ratio %r leaves an empty crop window for a %dx%d video"
                         % (crop_ratio, width, height))
    for i in range(len(frame_array)):
        frame_array[i] = cv2.resize(frame_array[i][hs:he,ws:we,:], size, interpolation = cv2.INTER_LINEAR)
    save_video(out_path, frame_array, fps, size= size)

def norm_flow(flow, h, w):
    """Scale a flow field in place: channel 0 is divided by ``h``, channel 1 by ``w``.

    When the third axis of ``flow`` has size 2 the channels are read from
    that axis, otherwise they are read from the fourth axis.

    Returns:
        The same ``flow`` object that was passed in, after modification.
    """
    # Number of axes that precede the channel axis.
    leading_axes = 2 if flow.shape[2] == 2 else 3
    prefix = (slice(None),) * leading_axes
    flow[prefix + (0,)] /= h
    flow[prefix + (1,)] /= w
    return flow

class AverageMeter(object):
    """Running weighted average of a stream of values.

    ``sum`` accumulates ``val * n``, ``cnt`` accumulates ``n`` and ``avg`` is
    ``sum / cnt`` as of the last update that left ``cnt`` positive.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        self.avg = self.sum = self.cnt = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the average."""
        self.sum += val * n
        self.cnt += n
        if self.cnt <= 0:
            # Nothing to average over yet; leave avg untouched.
            return
        self.avg = self.sum / self.cnt

================================================
FILE: dvs/warp/__init__.py
================================================
from .warping import (
    warp_video
    )
from .read_write import (
    save_video,
    load_video,
    video2frame_one_seq
    )

================================================
FILE: dvs/warp/rasterizer.py
================================================
import numpy as np
import matplotlib.pyplot as plt
from numpy import array
import torch
import cv2
import time

# Device used by the rasterizer: the GPU when CUDA is available, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def Rasterization(image, grid, get_mesh_only = False):
    """Warp ``image`` with a mesh by rasterizing the mesh into a dense sampling grid.

    Every mesh cell is split into two triangles. Barycentric weights computed
    on the warped triangles (``triangle2mask``) interpolate the matching
    original vertex positions for each pixel of the cell's bounding-box
    patch. The patches are accumulated into one full-frame coordinate map,
    which is then passed to ``torch.nn.functional.grid_sample``.

    Args:
        image: Tensor whose dims 1 and 2 are read as height and width
            (presumably C x H x W -- TODO confirm against callers).
            NOTE: its top-left 2x2 corner is zeroed in place.
        grid: Mesh tensor; channels ``[:2]`` are used as the warped vertex
            positions and ``[2:]`` as the original ones. Presumably
            normalized to the frame size, since ``grid_size`` multiplies
            them by width/height -- TODO confirm.
        get_mesh_only: Unused; only the commented-out code below refers to it.

    Returns:
        The resampled image, moved back to the CPU.
    """
    # grid xy WH
    shape = image.size()
    height = shape[1]
    width = shape[2]
    # One upper and one lower triangle per mesh cell, for both vertex sets.
    wapper_upper_triangle, wapper_lower_triangle = grid_to_triangle(grid[:,:,:2])
    origin_upper_triangle, origin_lower_triangle = grid_to_triangle(grid[:,:,2:])


    # Per-cell pixel bounding boxes of the warped triangles; xlength/ylength
    # are the largest box extents, used as the common patch size for all cells.
    [xmax, xmin, ymax, ymin], xlength, ylength = grid_size(wapper_upper_triangle, wapper_lower_triangle, height, width)

    # Patch size expressed as a fraction of the frame.
    xratio = xlength / width
    yratio = ylength / height

    wapper_triangle = torch.stack((wapper_upper_triangle,wapper_lower_triangle),dim = 1).to(device) # grid * upper/lower * point * xy
    origin_triangle = torch.stack((origin_upper_triangle,origin_lower_triangle),dim = 1).to(device) # grid * upper/lower * point * xy

    tran_triangle = torch.zeros(wapper_triangle.size()).to(device)

    # Re-express the warped vertices relative to each cell's own patch:
    # subtract the patch origin and rescale by the patch size.
    tran_triangle[:,:,:,0] = (wapper_triangle[:,:,:,0] - xmin.view(-1,1,1).to(device)/width) / xratio
    tran_triangle[:,:,:,1] = (wapper_triangle[:,:,:,1] - ymin.view(-1,1,1).to(device)/height) / yratio

    # Barycentric weights of every patch pixel for every triangle
    # ([N x P x T x 3], zero outside the triangle).
    mask = triangle2mask(tran_triangle, ylength, xlength) # consuming

    mask = torch.unsqueeze(mask, 4)
    origin_triangle = torch.unsqueeze(origin_triangle, 1)

    # Weighted sum of the original vertex positions over the three vertices
    # (dim 3) and the two triangles (dim 2): one source coordinate per pixel.
    grid_sample = origin_triangle * mask # consuming
    grid_sample = torch.sum(torch.sum(grid_sample, dim = 3), dim = 2).view(-1,ylength,xlength,2) # consuming

    # Canvas that can hold every patch, including ones that start at negative
    # pixel positions (gxmin/gymin are <= 0).
    gxmin = min(0, int(torch.min(xmin)))
    gxmax = int(torch.max(xmin) + xlength)
    gymin = min(0, int(torch.min(ymin)))
    gymax = int(torch.max(ymin) + ylength)
    grid_merge = torch.zeros((max(gymax-gymin, height, height - gymin),max(gxmax - gxmin, width, width - gxmin),2)).to(device)
    # Paste (accumulate) each cell's patch at its bounding-box position.
    for i in range(grid_sample.size()[0]):
        x_s = int(xmin[i] - gxmin)
        x_e = int(xmin[i] + xlength - gxmin)
        y_s = int(ymin[i] - gymin)
        y_e = int(ymin[i] + ylength -gymin)
        grid_merge[ y_s:y_e, x_s:x_e, :] += grid_sample[i, :, :, :]

    # grid_merge = grid_merge[min(-gxmin,0):min(-gxmin,0)+height, min(-gymin,0):min(-gymin,0)+width, :] 
    # Crop the canvas back to the frame, dropping the negative margin.
    grid_merge = grid_merge[-gymin:-gymin+height, -gxmin:-gxmin+width, :] 
    # if get_mesh_only:
    #     grid_merge = grid_merge.cpu().numpy()
    #     mesh_grid = generate_mesh_grid(height, width)
    #     out = grid_merge - mesh_grid
    #     return np.concatenate((out[:,:,1:],out[:,:,:1]),2)
    
    # Add a half-pixel offset and map the coordinates to the [-1, 1] range
    # that grid_sample works in.
    # NOTE(review): channel 0 receives 0.5/height and channel 1 0.5/width --
    # confirm this matches the channel order of the grid.
    shift = torch.tensor([0.5/height,0.5/width])[None, None, :].to(device)
    grid_merge = (grid_merge + 1*shift) * 2 - 1

    # Zeroes the top-left 2x2 pixels of the first three channels, in place on
    # the tensor passed by the caller. Presumably so that pixels no triangle
    # covered (left at 0 on the canvas, i.e. about -1 after the rescale)
    # sample black -- TODO confirm.
    image[:3,:2,:2] = 0

    image = torch.unsqueeze(image, 0).to(device)
    grid_merge = torch.unsqueeze(grid_merge, 0)

    # align_corners is not passed, so the library default applies.
    image = torch.nn.functional.grid_sample(image, grid_merge) # default bilinear

    image = torch.squeeze(image, 0)
    return image.cpu()

def grid_to_triangle(grid):
    """Split every cell of a vertex grid into two triangles.

    ``grid`` is indexed as ``[i, j, xy]``. For the cell whose first corner is
    ``(i, j)`` the upper triangle is ``(i, j), (i+1, j), (i, j+1)`` and the
    lower triangle is ``(i, j+1), (i+1, j), (i+1, j+1)``.

    Returns:
        ``(upper, lower)``, each shaped ``cells x point x xy`` with cells
        enumerated in row-major order.
    """
    extent = grid.size()
    cell_count = (extent[0] - 1) * (extent[1] - 1)

    # The four corners of every cell, as shifted views of the vertex grid.
    v00 = grid[:-1, :-1, :]
    v10 = grid[1:, :-1, :]
    v01 = grid[:-1, 1:, :]
    v11 = grid[1:, 1:, :]

    def pack(first, second, third):
        # Put the three vertices on a new trailing axis, flatten the cells,
        # then swap to cells x point x xy.
        corners = torch.stack((first, second, third), dim = 3)
        return torch.transpose(corners.view(cell_count, 2, 3), 1, 2)

    return pack(v00, v10, v01), pack(v01, v10, v11) # grid * point * xy

def grid_size(upper_triangle, lower_triangle, height, width):
    """Compute the pixel bounding box of every mesh cell.

    The vertices of both triangles of a cell are scaled by ``width`` (x) and
    ``height`` (y). Minima are floored, maxima are floored and incremented
    by one.

    Returns:
        ``([xmax, xmin, ymax, ymin], xlength, ylength)`` where the four
        tensors hold one value per cell and ``xlength`` / ``ylength`` are the
        largest box width / height over all cells, as Python ints.
    """
    vertices = torch.cat((upper_triangle, lower_triangle), dim = 1)
    xs_px = vertices[:, :, 0] * width
    ys_px = vertices[:, :, 1] * height

    xmin = torch.floor(xs_px.min(dim = 1)[0])
    ymin = torch.floor(ys_px.min(dim = 1)[0])
    xmax = torch.floor(xs_px.max(dim = 1)[0]) + 1
    ymax = torch.floor(ys_px.max(dim = 1)[0]) + 1

    xlength = int((xmax - xmin).max())
    ylength = int((ymax - ymin).max())

    return [xmax, xmin, ymax, ymin], xlength, ylength

def generate_mesh_grid(height, width):
    """Return an H x W x 2 array of sampling positions in [0, 1).

    Entry ``[r, c]`` is ``(c / width, r / height)``: x first, then y.
    """
    # Evenly spaced positions along each axis, end point excluded.
    xs = np.linspace(0, 1, width, endpoint=False)
    ys = np.linspace(0, 1, height, endpoint=False)
    xmesh, ymesh = np.meshgrid(xs, ys)
    # Pair up x and y along a new last axis: H x W x 2.
    return np.stack((xmesh, ymesh), axis=-1)

def triangle2mask(d, height, width): # d: [N x T x 3 x 2]
    """Compute barycentric weights of every patch pixel for every triangle.

    Args:
        d: Triangle vertices, ``[N x T x 3 x 2]``, in the coordinate frame of
            the sample grid returned by ``generate_mesh_grid``.
        height: Patch height in pixels.
        width: Patch width in pixels.

    Returns:
        ``[N x P x T x 3]`` tensor with ``P = height * width``. Pixels
        strictly inside a triangle carry their three barycentric weights;
        all other entries are 0.
    """
    N = d.size()[0] # batch size
    T = d.size()[1] # triangle number
    P = height * width # The number of pixels in the output image.

    # Edge function of each triangle's own first vertex: the normalizer that
    # turns edge-function values into barycentric weights. Shape [N x T x 1].
    area = edgefunc(d[:, :, 1, :], d[:, :, 2, :], d[:, :, None, 0, :])

    gridcpu = np.reshape(generate_mesh_grid(height, width), (P, 2))

    # Follow the device of the input instead of a hard-coded CUDA device, and
    # move the small [P x 2] grid there before replicating it.
    grid = torch.Tensor(gridcpu).to(d.device)
    grid = grid.unsqueeze(0).repeat((N, T, 1, 1)) # [N x T x P x 2]

    # Evaluate the edge functions at every position.
    # Each result is [N x T x P].
    w0 = edgefunc(d[:, :, 1, :], d[:, :, 2, :], grid) / area
    w1 = edgefunc(d[:, :, 2, :], d[:, :, 0, :], grid) / area
    w2 = edgefunc(d[:, :, 0, :], d[:, :, 1, :], grid) / area

    # Only pixels strictly inside a triangle keep their weights.
    inside = ((w0 > 0) & (w1 > 0) & (w2 > 0)).unsqueeze(3) # [N x T x P x 1]

    w = torch.stack((w0,w1,w2),dim = 3) # [N x T x P x 3]
    # Select instead of multiplying by a 0/1 mask: a zero-area triangle yields
    # inf/nan weights, and nan * 0 would stay nan.
    w = torch.where(inside, w, torch.zeros_like(w))

    return torch.transpose(w, 1, 2) # [N x P x T x 3]
    

def edgefunc(v0, v1, p):
    """
    Evaluate the edge function of the directed edge v0 -> v1 at the points p.

    v0 and v1 hold one vertex per triangle, shaped [N x T x 2].
    p holds P sampling points per triangle, shaped [N x T x P x 2].
    Returns an [N x T x P] tensor with
        (v1.y - v0.y) * p.x + (v0.x - v1.x) * p.y + (v0.y * v1.x - v0.x * v1.y)
    """
    point_count = p.size()[2]

    def per_point(values):
        # Replicate one value per triangle across all of its sample points:
        # [N x T] -> [N x T x P]
        return values.unsqueeze(2).repeat((1, 1, point_count))

    px = p[:, :, :, 0]
    py = p[:, :, :, 1]

    dx = per_point(v0[:, :, 0] - v1[:, :, 0])
    dy = per_point(v1[:, :, 1] - v0[:, :, 1])
    offset = per_point(v0[:,:,1]*v1[:,:,0] - v0[:,:,0]*v1[:,:,1])

    return dy*px + dx*py + offset

if __name__ == '__main__':
    # Smoke check: show the sampling positions of a 2 x 3 grid.
    demo_grid = generate_mesh_grid(2,3)
    print(demo_grid)

================================================
FILE: dvs/warp/read_write.py
================================================
import numpy as np
import cv2
import os
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
import ffmpeg  
import json
import torch 
import argparse
    
def load_video(path, save_dir = None, resize = None, length = -1): # N x H x W x C
    """Decode a video into a list of frames.

    Args:
        path: Video file to read.
        save_dir: When given, every frame is also written there as
            ``frame_<4-digit index>.png``.
        resize: ``None`` keeps the native size, an int divides width and
            height by that factor, anything else is used as the target
            ``(width, height)``.
        length: Stop after this many frames when positive.

    Returns:
        ``(frames, fps, size)`` with ``size`` as ``(width, height)``.

    Raises:
        IOError: If not even the first frame can be read.
    """
    vidcap = cv2.VideoCapture(path)
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        success,image = vidcap.read()
        if not success:
            # Previously this surfaced as an AttributeError on None.shape.
            raise IOError("Could not read any frame from video: %s" % path)
        print(image.shape)
        height, width = image.shape[:2]
        if resize is None:
            size = (width,height)
        elif isinstance(resize, int):
            size = (width//resize,height//resize)
        else:
            size = resize
        count = 0
        frames = []
        while success:
            if resize is not None:
                image = cv2.resize(image, size, interpolation = cv2.INTER_LINEAR)
            if save_dir is not None:
                # Separate name so the ``path`` argument is not shadowed.
                frame_path = os.path.join(save_dir, "frame_" + str(count).zfill(4) + ".png")
                cv2.imwrite(frame_path, image)
            frames.append(image)
            success,image = vidcap.read()
            count += 1
            if length > 0 and count >= length:
                break
    finally:
        # Always free the decoder handle, also when decoding fails midway.
        vidcap.release()
    print("Video length: ", len(frames))
    return frames, fps, size

def video2frame(path, resize = None):
    """Extract the frames of the video found in every sub-folder of ``path``.

    Each sub-folder is expected to contain an ``.mp4`` file; its frames are
    written to ``<sub-folder>/frames`` via ``load_video``. Entries that are
    not directories and folders without an ``.mp4`` are reported and skipped.

    Args:
        path: Directory whose sub-folders hold one recording each.
        resize: Forwarded to ``load_video``.
    """
    data_name = sorted(os.listdir(path))
    for i in range(len(data_name)):
        print(str(i+1)+" / " + str(len(data_name)))
        data_folder = os.path.join(path, data_name[i])
        print(data_folder)
        if not os.path.isdir(data_folder):
            # Stray files next to the recordings used to crash os.listdir.
            print("Skipped (not a directory): " + data_folder)
            continue
        # Look the video up per folder. The name used to leak over from the
        # previous folder, or be undefined for the first one, when a folder
        # had no .mp4. As before, the last match wins.
        mp4_files = [f for f in os.listdir(data_folder) if f.endswith(".mp4")]
        if not mp4_files:
            print("Skipped (no .mp4 found): " + data_folder)
            continue
        video_path = os.path.join(data_folder, mp4_files[-1])
        frame_folder = os.path.join(data_folder, "frames")
        if not os.path.exists(frame_folder):
            os.makedirs(frame_folder)
        load_video(video_path, save_dir = frame_folder, resize=resize)

def video2frame_one_seq(path, save_dir = None, resize = None): # N x H x W x C
    """Dump the frames of one video to disk without keeping them in memory.

    Args:
        path: Video file to read.
        save_dir: When given, every frame is written there as
            ``frame_<5-digit index>.png``.
        resize: ``None`` keeps the native size, an int divides width and
            height by that factor, anything else is used as the target
            ``(width, height)``.

    Returns:
        ``(fps, size)`` with ``size`` as ``(width, height)``.

    Raises:
        IOError: If not even the first frame can be read.
    """
    vidcap = cv2.VideoCapture(path)
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        success,image = vidcap.read()
        print(path)
        if not success:
            # Previously this surfaced as an AttributeError on None.shape.
            raise IOError("Could not read any frame from video: %s" % path)
        print(image.shape)
        height, width = image.shape[:2]
        if resize is None:
            size = (width,height)
        elif isinstance(resize, int):
            size = (width//resize,height//resize)
        else:
            size = resize
        count = 0
        while success:
            if resize is not None:
                image = cv2.resize(image, size, interpolation = cv2.INTER_LINEAR)
            if save_dir is not None:
                # Separate name so the ``path`` argument is not shadowed.
                frame_path = os.path.join(save_dir, "frame_" + str(count).zfill(5) + ".png")
                cv2.imwrite(frame_path, image)
            success,image = vidcap.read()
            count += 1
    finally:
        # Always free the decoder handle, also when decoding fails midway.
        vidcap.release()
    return fps, size

def save_video(path,frame_array, fps, size, losses = None, frame_number = False, writer = None):
    """Write ``frame_array`` to a video, optionally stamping text on each frame.

    When ``writer`` is ``None`` a new ``cv2.VideoWriter`` is opened on
    ``path`` ('mp4v' codec if the path ends in "mp4", MJPG otherwise) and
    released at the end. A caller-supplied ``writer`` is written to and left
    open. With ``frame_number`` set, the frame index is drawn on each frame;
    with ``losses`` given, ``losses[i]`` is drawn as well. Stamped frames
    replace the originals inside ``frame_array``.
    """
    owns_writer = writer is None
    if owns_writer:
        if path[-3:] == "mp4":
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        else:
            fourcc = cv2.VideoWriter_fourcc('M','J','P','G')
        sink = cv2.VideoWriter(path, fourcc, fps, size)
    else:
        sink = writer

    for index in range(len(frame_array)):
        # Annotations are stored back into the caller's sequence.
        if frame_number:
            frame_array[index] = draw_number(np.asarray(frame_array[index]), index)
        if losses is not None:
            frame_array[index] = draw_number(np.asarray(frame_array[index]), losses[index], x = 900, message = "Loss: ")
        sink.write(frame_array[index])

    if owns_writer:
        sink.release()

def draw_number(frame, num, x = 10, y = 10, message = "Frame: "):
    """Stamp ``message + str(num)`` in black onto a BGR frame.

    The text is placed at ``(x, y)`` using the 45 pt font loaded from
    ``./data/arial.ttf``.

    Returns:
        The annotated frame as a new BGR array.
    """
    # PIL draws on RGB images, so convert from OpenCV's BGR and back again.
    canvas = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    label_font = ImageFont.truetype("./data/arial.ttf", 45)
    label = message + str(num)
    ImageDraw.Draw(canvas).text((x, y), label, fill='rgb(0, 0, 0)', font=label_font)
    return cv2.cvtColor(np.array(canvas), cv2.COLOR_RGB2BGR)

if __name__ == "__main__":
    # Extract quarter-resolution frames for every recording. The default
    # "./video" is processed directly; any other directory is expected to
    # hold "test" and "training" sub-directories.
    cli = argparse.ArgumentParser("FlowNet2 Preparation")
    cli.add_argument("--dir_path", default="./video")
    dir_path = cli.parse_args().dir_path
    if dir_path == "./video":
        targets = [dir_path]
    else:
        targets = [os.path.join(dir_path, "test"), os.path.join(dir_path, "training")]
    for target in targets:
        video2frame(target, resize = 4)

================================================
FILE: dvs/warp/warping.py
================================================
import numpy as np
from .read_write import load_video, save_video
import torch
import cv2
from .rasterizer import Rasterization
import time
import os

def warp_video(mesh_path, video_path, save_path, losses = None, frame_number = False, fps_fix = None):
    """Warp a video frame by frame with per-frame meshes and save the result.

    Args:
        mesh_path: The mesh data itself (indexable per frame, with a
            ``shape``); despite the name, a file path is not supported.
        video_path: Input video.
        save_path: Output video, written with the 'mp4v' codec.
        losses: Optional per-frame values drawn onto the output frames.
        frame_number: Draw frame indices onto the output frames.
        fps_fix: Overrides the frame rate of the input video when given.

    Raises:
        TypeError: If ``mesh_path`` is a string.
    """
    if isinstance(mesh_path, str):
        # This used to print "Error" and then fail on an undefined variable.
        raise TypeError("warp_video expects the mesh data itself, not a file path: %r" % (mesh_path,))
    grid_data = mesh_path

    frame_array, fps, size = load_video(video_path, length = grid_data.shape[0])
    if fps_fix is not None:
        fps = fps_fix
    length = min(grid_data.shape[0], len(frame_array))
    seq_length = 100
    writer = cv2.VideoWriter(save_path,cv2.VideoWriter_fourcc(*'mp4v'), fps, size)
    try:
        # Process the video in chunks of seq_length frames.
        for start in range(0, length, seq_length):
            stop = min(start + seq_length, length)
            print("Frame: "+str(start)+"/"+str(length))
            # save_video indexes losses relative to the chunk, so hand it the
            # matching slice; otherwise every chunk shows the first values.
            chunk_losses = None if losses is None else losses[start:stop]
            frame_array_save = warpping_rast(grid_data[start:stop], frame_array[start:stop], losses = chunk_losses)
            save_video(save_path,frame_array_save, fps, size, losses = chunk_losses, frame_number = frame_number, writer = writer)
    finally:
        # Finalize the output file even when warping a chunk fails.
        writer.release()

def warpping_rast(grid_data, frame_array, losses = None):
    """Warp each frame with the mesh of the same index.

    Only the first ``min(len(frame_array), grid_data.shape[0])`` pairs are
    processed. ``losses`` is accepted but not used.

    Returns:
        List of warped frames.
    """
    pair_count = min(len(frame_array), grid_data.shape[0])
    return [warpping_one_frame_rast(frame_array[k], grid_data[k]) for k in range(0, pair_count)]

def warpping_one_frame_rast(image, grid):
    """Warp a single H x W x C frame with one mesh via ``Rasterization``.

    The frame is scaled to [0, 1] and permuted to channels-first before
    rasterization. The result is permuted back, rescaled to [0, 255],
    clipped and returned as a ``uint8`` array.
    """
    chw_image = torch.Tensor(image).permute(2,0,1)/255
    mesh = torch.Tensor(grid)
    warped = Rasterization(chw_image, mesh)
    pixels = warped.permute(1,2,0).numpy() * 255
    return np.clip(pixels, 0, 255).astype("uint8")