Repository: lihxxx/DisPose
Branch: main
Commit: c38ab1329c75
Files: 101
Total size: 382.0 KB

Directory structure:
gitextract_7etsfbwj/

├── .gitignore
├── README.md
├── configs/
│   └── test.yaml
├── constants.py
├── inference_ctrl.py
├── mimicmotion/
│   ├── __init__.py
│   ├── dwpose/
│   │   ├── .gitignore
│   │   ├── __init__.py
│   │   ├── dwpose_detector.py
│   │   ├── onnxdet.py
│   │   ├── onnxpose.py
│   │   ├── preprocess.py
│   │   ├── util.py
│   │   └── wholebody.py
│   ├── modules/
│   │   ├── __init__.py
│   │   ├── attention.py
│   │   ├── cmp/
│   │   │   ├── experiments/
│   │   │   │   ├── rep_learning/
│   │   │   │   │   ├── alexnet_yfcc+youtube_voc_16gpu_140k/
│   │   │   │   │   │   ├── config.yaml
│   │   │   │   │   │   ├── resume.sh
│   │   │   │   │   │   ├── resume_slurm.sh
│   │   │   │   │   │   ├── train.sh
│   │   │   │   │   │   ├── train_slurm.sh
│   │   │   │   │   │   ├── validate.sh
│   │   │   │   │   │   └── validate_slurm.sh
│   │   │   │   │   ├── alexnet_yfcc_voc_16gpu_70k/
│   │   │   │   │   │   ├── config.yaml
│   │   │   │   │   │   ├── resume.sh
│   │   │   │   │   │   ├── resume_slurm.sh
│   │   │   │   │   │   ├── train.sh
│   │   │   │   │   │   ├── train_slurm.sh
│   │   │   │   │   │   ├── validate.sh
│   │   │   │   │   │   └── validate_slurm.sh
│   │   │   │   │   ├── alexnet_yfcc_voc_8gpu_140k/
│   │   │   │   │   │   ├── config.yaml
│   │   │   │   │   │   ├── resume.sh
│   │   │   │   │   │   ├── resume_slurm.sh
│   │   │   │   │   │   ├── train.sh
│   │   │   │   │   │   ├── train_slurm.sh
│   │   │   │   │   │   ├── validate.sh
│   │   │   │   │   │   └── validate_slurm.sh
│   │   │   │   │   ├── resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/
│   │   │   │   │   │   ├── config.yaml
│   │   │   │   │   │   ├── resume.sh
│   │   │   │   │   │   ├── resume_slurm.sh
│   │   │   │   │   │   ├── train.sh
│   │   │   │   │   │   ├── train_slurm.sh
│   │   │   │   │   │   ├── validate.sh
│   │   │   │   │   │   └── validate_slurm.sh
│   │   │   │   │   ├── resnet50_yfcc_coco_16gpu_42k/
│   │   │   │   │   │   ├── config.yaml
│   │   │   │   │   │   ├── resume.sh
│   │   │   │   │   │   ├── resume_slurm.sh
│   │   │   │   │   │   ├── train.sh
│   │   │   │   │   │   ├── train_slurm.sh
│   │   │   │   │   │   ├── validate.sh
│   │   │   │   │   │   └── validate_slurm.sh
│   │   │   │   │   └── resnet50_yfcc_voc_16gpu_42k/
│   │   │   │   │       ├── config.yaml
│   │   │   │   │       ├── resume.sh
│   │   │   │   │       ├── resume_slurm.sh
│   │   │   │   │       ├── train.sh
│   │   │   │   │       ├── train_slurm.sh
│   │   │   │   │       ├── validate.sh
│   │   │   │   │       └── validate_slurm.sh
│   │   │   │   └── semiauto_annot/
│   │   │   │       └── resnet50_vip+mpii_liteflow/
│   │   │   │           ├── config.yaml
│   │   │   │           ├── resume.sh
│   │   │   │           ├── resume_slurm.sh
│   │   │   │           ├── train.sh
│   │   │   │           ├── train_slurm.sh
│   │   │   │           ├── validate.sh
│   │   │   │           └── validate_slurm.sh
│   │   │   ├── losses.py
│   │   │   ├── models/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── backbone/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── alexnet.py
│   │   │   │   │   └── resnet.py
│   │   │   │   ├── cmp.py
│   │   │   │   ├── modules/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── cmp.py
│   │   │   │   │   ├── decoder.py
│   │   │   │   │   ├── others.py
│   │   │   │   │   ├── shallownet.py
│   │   │   │   │   └── warp.py
│   │   │   │   └── single_stage_model.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── common_utils.py
│   │   │       ├── data_utils.py
│   │   │       ├── distributed_utils.py
│   │   │       ├── flowlib.py
│   │   │       ├── scheduler.py
│   │   │       └── visualize_utils.py
│   │   ├── cmp_model.py
│   │   ├── controlnet.py
│   │   ├── point_adapter.py
│   │   ├── pose_net.py
│   │   └── unet.py
│   ├── pipelines/
│   │   ├── pipeline_ctrl.py
│   │   └── pipeline_mimicmotion.py
│   └── utils/
│       ├── __init__.py
│       ├── dift_utils.py
│       ├── flow_utils.py
│       ├── geglu_patch.py
│       ├── loader.py
│       ├── utils.py
│       └── visualizer.py
├── requirements.txt
└── scripts/
    └── test.sh

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
#/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# custom ignores
.DS_Store
_.*

# models and outputs
# models/
outputs/

# Replicate
.cog

================================================
FILE: README.md
================================================
# [ICLR2025] DisPose: Disentangling Pose Guidance for Controllable Human Image Animation
This repository is the official implementation of [DisPose](https://arxiv.org/abs/2412.09349).

[![arXiv](https://img.shields.io/badge/arXiv-2412.09349-b31b1b.svg)](https://arxiv.org/abs/2412.09349)
[![Project Page](https://img.shields.io/badge/Project-Website-green)](https://lihxxx.github.io/DisPose/)

## 🔥 News
- **`2025/01/23`**: DisPose is accepted to ICLR 2025.
- **`2024/12/13`**: We have released the inference code and the checkpoints for DisPose.
  
**📖 Table of Contents**
- [DisPose: Disentangling Pose Guidance for Controllable Human Image Animation](#dispose-disentangling-pose-guidance-for-controllable-human-image-animation)
  - [🎨 Gallery](#-gallery)
  - [🧙 Method Overview](#-method-overview)
  - [🔧 Preparations](#-preparations)
    - [Setup repository and conda environment](#setup-repository-and-conda-environment)
    - [Prepare model weights](#prepare-model-weights)
  - [💫 Inference](#-inference)
    - [Tips](#tips)
  - [📣 Disclaimer](#-disclaimer)
  - [💞 Acknowledgements](#-acknowledgements)
  - [🔍 Citation](#-citation)

## 🎨 Gallery
<table class="center">
<tr>
  <td><video src="https://github.com/user-attachments/assets/e2f5e263-3f86-4778-98b9-6d2d451b7516" autoplay></td>
  <td><video src="https://github.com/user-attachments/assets/f8e761e3-7a7a-4812-ad61-023b33034a42" autoplay></td>
  <td><video src="https://github.com/user-attachments/assets/9a6c7ea6-8c73-4a50-b594-f8eba239c405" autoplay></td>
  <td><video src="https://github.com/user-attachments/assets/a0f97ac4-429e-4ca9-a794-7c02b5dc5405" autoplay></td>
  <td><video src="https://github.com/user-attachments/assets/6e9d463c-f7c5-4de8-924b-1ad591e3a9a4" autoplay></td>
</tr>
</table>

## 🧙 Method Overview
We present **DisPose** to mine more generalizable and effective control signals without additional dense input, which disentangles the sparse skeleton pose in human image animation into motion field guidance and keypoint correspondence.
<div align='center'>
<img src="https://anonymous.4open.science/r/DisPose-AB1D/pipeline.png" class="interpolation-image" alt="comparison." height="80%" width="80%" />
</div>


## 🔧 Preparations
### Setup repository and conda environment
The code requires `python>=3.10`, as well as `torch>=2.0.1` and `torchvision>=0.15.2`. Please follow the instructions [here](https://pytorch.org/get-started/locally/) to install both PyTorch and TorchVision dependencies. The demo has been tested with CUDA 12.4.
```
conda create -n dispose python==3.10
conda activate dispose
pip install -r requirements.txt
```

### Prepare model weights
1. Download the weights of [DisPose](https://huggingface.co/lihxxx/DisPose) and put `DisPose.pth` into `./pretrained_weights/`.

2. Download the weights of other components and put them into `./pretrained_weights/`:
  - [stable-video-diffusion-img2vid-xt-1-1](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt-1-1/tree/main)
  - [stable-diffusion-v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/tree/main)
  - [dwpose](https://huggingface.co/yzd-v/DWPose/tree/main)
  - [MimicMotion](https://huggingface.co/tencent/MimicMotion/tree/main)
3. Download the weights of [CMP](https://huggingface.co/MyNiuuu/MOFA-Video-Hybrid/resolve/main/models/cmp/experiments/semiauto_annot/resnet50_vip%2Bmpii_liteflow/checkpoints/ckpt_iter_42000.pth.tar) and put it into `./mimicmotion/modules/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/checkpoints`

Finally, these weights should be organized in `./pretrained_weights/` as follows:


```
./pretrained_weights/
|-- MimicMotion_1-1.pth
|-- DisPose.pth
|-- DWPose
|   |-- dw-ll_ucoco_384.onnx
|   └── yolox_l.onnx
|-- stable-diffusion-v1-5
|-- stable-video-diffusion-img2vid-xt-1-1
```

## 💫 Inference

A sample configuration for testing is provided as `configs/test.yaml`. You can also easily modify the various configurations according to your needs.

```
bash scripts/test.sh 
```

### Tips
- If your GPU memory is limited, try setting `decode_chunk_size` in `test.yaml` to 1.
- If you want to enhance the quality of the generated video, you could try some post-processing such as face swapping ([insightface](https://github.com/deepinsight/insightface)) and frame interpolation ([IFRNet](https://github.com/ltkong218/IFRNet)).

## 📣 Disclaimer
This is the official code of DisPose.
The copyrights of all demo images and videos belong to community users.
Feel free to contact us if you would like them removed.

## 💞 Acknowledgements
We sincerely appreciate the code release of the following projects: [MimicMotion](https://github.com/Tencent/MimicMotion), [Moore-AnimateAnyone](https://github.com/MooreThreads/Moore-AnimateAnyone), [CMP](https://github.com/XiaohangZhan/conditional-motion-propagation).

## 🔍 Citation

```
@inproceedings{
li2025dispose,
title={DisPose: Disentangling Pose Guidance for Controllable Human Image Animation},
author={Hongxiang Li and Yaowei Li and Yuhang Yang and Junjie Cao and Zhihong Zhu and Xuxin Cheng and Long Chen},
booktitle={The Thirteenth International Conference on Learning Representations},
year={2025},
url={https://openreview.net/forum?id=AumOa10MKG}
}
```


================================================
FILE: configs/test.yaml
================================================
# base svd model path
base_model_path: ./pretrained_weights/stable-video-diffusion-img2vid-xt-1-1
# base dift model path
dift_model_path: ./pretrained_weights/stable-diffusion-v1-5

# checkpoint path
ckpt_path: ./pretrained_weights/MimicMotion_1-1.pth
controlnet_path: ./pretrained_weights/DisPose.pth

test_case:
  - ref_video_path: ./assets/example_data/videos/video1.mp4
    ref_image_path: ./assets/example_data/images/ref1.png
    num_frames: 16
    resolution: 576
    frames_overlap: 6
    num_inference_steps: 25
    noise_aug_strength: 0
    guidance_scale: 2.0
    sample_stride: 2
    decode_chunk_size: 8
    fps: 15
    seed: 42

  - ref_video_path: ./assets/example_data/videos/video2.mp4
    ref_image_path: ./assets/example_data/images/ref2.png
    num_frames: 16
    resolution: 576
    frames_overlap: 6
    num_inference_steps: 25
    noise_aug_strength: 0
    guidance_scale: 2.0
    sample_stride: 2
    decode_chunk_size: 8
    fps: 15
    seed: 42

  - ref_video_path: ./assets/example_data/videos/video3.mp4
    ref_image_path: ./assets/example_data/images/ref3.png
    num_frames: 16
    resolution: 576
    frames_overlap: 6
    num_inference_steps: 25
    noise_aug_strength: 0
    guidance_scale: 2.0
    sample_stride: 2
    decode_chunk_size: 8
    fps: 15
    seed: 42

================================================
FILE: constants.py
================================================
# w/h aspect ratio
ASPECT_RATIO = 9 / 16


================================================
FILE: inference_ctrl.py
================================================
import os
import argparse
import logging
import math
from omegaconf import OmegaConf
from datetime import datetime
import time
from pathlib import Path
import PIL.Image
import numpy as np
import torch.jit
from torchvision.datasets.folder import pil_loader
from torchvision.transforms.functional import pil_to_tensor, resize, center_crop
from torchvision.transforms.functional import to_pil_image
from torchvision import transforms
import torch.nn.functional as F
from torchvision.transforms import PILToTensor
import torchvision

import decord
from einops import rearrange, repeat
from mimicmotion.utils.dift_utils import SDFeaturizer
from mimicmotion.utils.utils import points_to_flows, bivariate_Gaussian, sample_inputs_flow, get_cmp_flow, pose2track
from  mimicmotion.utils.visualizer import Visualizer, vis_flow_to_video
import cv2


from mimicmotion.utils.geglu_patch import patch_geglu_inplace
patch_geglu_inplace()

from constants import ASPECT_RATIO
from mimicmotion.utils.loader import create_ctrl_pipeline
from mimicmotion.utils.utils import save_to_mp4
from mimicmotion.dwpose.preprocess import get_video_pose, get_image_pose
from mimicmotion.modules.cmp_model import CMP


import pdb
# Configure root logging once for the script: INFO level, timestamped messages.
logging.basicConfig(level=logging.INFO, format="%(asctime)s: [%(levelname)s] %(message)s")
# Module-level logger; set_logger() attaches an optional file handler to it.
logger = logging.getLogger(__name__)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def preprocess(video_path, image_path, dift_model_path, resolution=576, sample_stride=2):
    """Prepare every conditioning input for one (reference image, driving video) pair.

    The reference image is resized and center-cropped to the target size, poses are
    extracted from the image and the video, DIFT features are computed for the
    reference image, a blurred trajectory flow is built from the body keypoints, and
    a dense flow is predicted by the CMP model from the sparse keypoint motion.

    Args:
        video_path (str): input video pose path
        image_path (str): reference image path
        dift_model_path (str): model id/path handed to ``SDFeaturizer`` as ``sd_id``
        resolution (int, optional): length of the shorter target side. Defaults to 576.
        sample_stride (int, optional): forwarded to ``get_video_pose``. Defaults to 2.

    Returns:
        tuple: ``(pose_pixels, image_pixels, val_controlnet_flow, val_controlnet_image,
        body_point_list, dift_feats, traj_flow)`` where ``pose_pixels`` and
        ``image_pixels`` are tensors rescaled with ``x / 127.5 - 1``.
    """
    image_pixels = pil_loader(image_path)
    image_pixels = pil_to_tensor(image_pixels) # (c, h, w)
    h, w = image_pixels.shape[-2:]
    ############################ compute target h/w according to original aspect ratio ###############################
    # The shorter side becomes `resolution`; the longer side is derived from
    # ASPECT_RATIO and rounded down to a multiple of 64. Square inputs stay square.
    if h>w:
        w_target, h_target = resolution, int(resolution / ASPECT_RATIO // 64) * 64
    elif h==w:
        w_target, h_target = resolution, resolution
    else:
        w_target, h_target = int(resolution / ASPECT_RATIO // 64) * 64, resolution
    h_w_ratio = float(h) / float(w)
    # Resize so the image fully covers the target box, then center-crop the excess.
    if h_w_ratio < h_target / w_target:
        h_resize, w_resize = h_target, math.ceil(h_target / h_w_ratio)
    else:
        h_resize, w_resize = math.ceil(w_target * h_w_ratio), w_target
    image_pixels = resize(image_pixels, [h_resize, w_resize], antialias=None)
    image_pixels = center_crop(image_pixels, [h_target, w_target])
    # h_target, w_target = image_pixels.shape[-2:]
    # Back to an (h, w, c) numpy array for the pose detector and PIL.
    image_pixels = image_pixels.permute((1, 2, 0)).numpy()
    ##################################### get video flow #################################################
    transform = transforms.Compose(
        [
        
        transforms.Resize((h_target, w_target), antialias=None), 
        transforms.CenterCrop((h_target, w_target)), 
        transforms.ToTensor()
        ]
    )
    
    # Float tensor version of the cropped reference image, used for the CMP inputs below.
    ref_img = transform(PIL.Image.fromarray(image_pixels))

    ##################################### get image&video pose value #################################################
    image_pose, ref_point = get_image_pose(image_pixels)
    ref_point_body, ref_point_head = ref_point["bodies"], ref_point["faces"]
    video_pose, body_point, face_point = get_video_pose(video_path, image_pixels, sample_stride=sample_stride)
    # The reference-image keypoints are placed first, followed by the per-frame video keypoints.
    body_point_list = [ref_point_body] + body_point
    # NOTE(review): face_point_list is built but not used again in this function.
    face_point_list = [ref_point_head] + face_point

    # Stack the reference pose in front of the video poses; add a batch axis and
    # move channels first for the reference image.
    pose_pixels = np.concatenate([np.expand_dims(image_pose, 0), video_pose])
    image_pixels = np.transpose(np.expand_dims(image_pixels, 0), (0, 3, 1, 2))
    
    # Extract DIFT features of the reference image (rescaled from [0, 255] to [-1, 1]).
    dift_model = SDFeaturizer(sd_id = dift_model_path, weight_dtype=torch.float16)
    category="human"
    prompt = f'photo of a {category}'
    dift_ref_img = (image_pixels / 255.0 - 0.5) *2
    dift_ref_img = torch.from_numpy(dift_ref_img).to(device, torch.float16)
    dift_feats = dift_model.forward(dift_ref_img, prompt=prompt, t=[261,0], up_ft_index=[1,2], ensemble_size=8)


    # Build a trajectory flow from the body keypoints and smooth it with a Gaussian kernel.
    model_length = len(body_point_list)
    traj_flow = points_to_flows(body_point_list, model_length, h_target, w_target)
    blur_kernel = bivariate_Gaussian(kernel_size=199, sig_x=20, sig_y=20, theta=0, grid=None, isotropic=True)

    # Only the first model_length-1 entries are blurred.
    # NOTE(review): presumably points_to_flows returns model_length-1 flows — confirm.
    for i in range(0, model_length-1):
        traj_flow[i] = cv2.filter2D(traj_flow[i], -1, blur_kernel)

    traj_flow = rearrange(traj_flow, "f h w c -> f c h w") 
    traj_flow = torch.from_numpy(traj_flow)
    traj_flow = traj_flow.unsqueeze(0)

    # CMP model built from a hard-coded config path (relative to the working directory);
    # the second argument is presumably the checkpoint iteration to load — confirm in CMP.
    cmp = CMP(
        './mimicmotion/modules/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
        42000
    ).to(device)
    cmp.requires_grad_(False)

    pc, ph, pw = ref_img.shape
    poses, poses_subset = pose2track(body_point_list, ph, pw)
    # Swap the first two axes of the tracks before batching.
    poses = torch.from_numpy(poses).permute(1,0,2)
    poses_subset = torch.from_numpy(poses_subset).permute(1,0,2)

    # pdb.set_trace()
    val_controlnet_image, val_sparse_optical_flow, \
    val_mask, val_first_frame_384, \
        val_sparse_optical_flow_384, val_mask_384 = sample_inputs_flow(ref_img.unsqueeze(0).float(), poses.unsqueeze(0), poses_subset.unsqueeze(0))

    fb, fl, fc, fh, fw = val_sparse_optical_flow.shape

    # Predict a flow with CMP from the 384-sized sparse flow and mask; the first frame
    # is repeated fl times along dim 1.
    val_controlnet_flow = get_cmp_flow(
        cmp, 
        val_first_frame_384.unsqueeze(0).repeat(1, fl, 1, 1, 1).to(device), 
        val_sparse_optical_flow_384.to(device), 
        val_mask_384.to(device)
    )

    # Bring the CMP flow back to the working resolution and rescale its two channels:
    # channel 0 by the width ratio, channel 1 by the height ratio.
    if fh != 384 or fw != 384:
        scales = [fh / 384, fw / 384]
        val_controlnet_flow = F.interpolate(val_controlnet_flow.flatten(0, 1), (fh, fw), mode='nearest').reshape(fb, fl, 2, fh, fw)
        val_controlnet_flow[:, :, 0] *= scales[1]
        val_controlnet_flow[:, :, 1] *= scales[0]
    
    # NOTE(review): vis_flow is assigned but never used in this function.
    vis_flow = val_controlnet_flow[0]

    return torch.from_numpy(pose_pixels.copy()) / 127.5 - 1, torch.from_numpy(image_pixels) / 127.5 - 1, val_controlnet_flow, val_controlnet_image, body_point_list, dift_feats, traj_flow


def run_pipeline(pipeline, image_pixels, pose_pixels,
                controlnet_flow, controlnet_image, point_list, dift_feats, traj_flow,
                device, task_config):
    """Run the generation pipeline for one test case and return its frames.

    Args:
        pipeline: callable pipeline whose result exposes a ``frames`` tensor.
        image_pixels: reference image tensor(s) as returned by ``preprocess``
            (rescaled with ``x / 127.5 - 1``).
        pose_pixels: pose tensor; its first dimension is the number of frames and
            its last two dimensions give the output height and width.
        controlnet_flow, controlnet_image, point_list, dift_feats, traj_flow:
            conditioning inputs forwarded unchanged to the pipeline.
        device: device used for the random generator and passed to the pipeline.
        task_config: test-case config providing ``seed``, ``num_frames``,
            ``frames_overlap``, ``noise_aug_strength``, ``num_inference_steps``,
            ``guidance_scale`` and ``decode_chunk_size``.

    Returns:
        torch.Tensor: uint8 frames of the last video in the batch, without its
        first frame (which corresponds to the reference image).

    Raises:
        ValueError: if the pipeline returns an empty batch of videos.
    """
    # Undo the x / 127.5 - 1 scaling and convert each reference image to PIL.
    image_pixels = [to_pil_image(img.to(torch.uint8)) for img in (image_pixels + 1.0) * 127.5]
    generator = torch.Generator(device=device)
    generator.manual_seed(task_config.seed)
    with torch.autocast("cuda"):
        # NOTE(review): fps is fixed at 7 here, independent of task_config.fps
        # (which is only used when saving the video) — confirm this is intended.
        frames = pipeline(
            image_pixels, image_pose=pose_pixels, num_frames=pose_pixels.size(0),
            tile_size=task_config.num_frames, tile_overlap=task_config.frames_overlap,
            height=pose_pixels.shape[-2], width=pose_pixels.shape[-1], fps=7,
            controlnet_flow=controlnet_flow, controlnet_image=controlnet_image, point_list=point_list, dift_feats=dift_feats, traj_flow=traj_flow,
            noise_aug_strength=task_config.noise_aug_strength, num_inference_steps=task_config.num_inference_steps,
            generator=generator, min_guidance_scale=task_config.guidance_scale,
            max_guidance_scale=task_config.guidance_scale, decode_chunk_size=task_config.decode_chunk_size, output_type="pt", device=device
        ).frames.cpu()
    video_frames = (frames * 255.0).to(torch.uint8)

    # The previous loop only rebound one variable per iteration, so the last video
    # was the one returned and an empty batch left the name unbound. Make both
    # behaviours explicit.
    if video_frames.shape[0] == 0:
        raise ValueError("pipeline returned no video frames")

    # deprecated first frame because of ref image
    return video_frames[-1, 1:]


@torch.no_grad()
def main(args):
    """Run inference for every test case in the config and save each result as MP4.

    Args:
        args: parsed command-line arguments providing ``no_use_float16``,
            ``inference_config``, ``output_dir`` and ``name``.
    """
    if not args.no_use_float16:
        torch.set_default_dtype(torch.float16)

    infer_config = OmegaConf.load(args.inference_config)
    pipeline = create_ctrl_pipeline(infer_config, device)

    for task in infer_config.test_case:
        ############################################## Pre-process data ##############################################
        pose_pixels, image_pixels, controlnet_flow, controlnet_image, point_list, dift_feats, traj_flow = preprocess(
            task.ref_video_path, task.ref_image_path, infer_config.dift_model_path,
            resolution=task.resolution, sample_stride=task.sample_stride
        )
        ########################################### Run MimicMotion pipeline ###########################################
        _video_frames = run_pipeline(
            pipeline,
            image_pixels, pose_pixels, controlnet_flow, controlnet_image, point_list, dift_feats, traj_flow,
            device, task
        )
        ################################### save results to output folder. ###########################################
        # Take a single timestamp so the date folder and the time prefix always
        # describe the same instant (two separate now() calls could straddle midnight).
        now = datetime.now()
        # Names are cut at the first dot, exactly as before.
        image_name = os.path.basename(task.ref_image_path).split('.')[0]
        video_name = os.path.basename(task.ref_video_path).split('.')[0]
        save_to_mp4(
            _video_frames,
            f"{args.output_dir}/{now.strftime('%Y%m%d')}_{args.name}/{now.strftime('%H%M%S')}_{image_name}_to_{video_name}"
            f"_CFG{task.guidance_scale}_{task.num_frames}_{task.fps}.mp4",
            fps=task.fps,
        )

def set_logger(log_file=None, log_level=logging.INFO):
    """Attach a file handler to the module logger.

    Args:
        log_file (str, optional): path of the log file, opened in write mode
            (an existing file is overwritten). When ``None`` nothing is attached.
        log_level (int, optional): level of the file handler. Defaults to
            ``logging.INFO``.
    """
    # The default used to be passed straight to FileHandler, which raises
    # TypeError for None; treat "no file" as "no file logging" instead.
    if log_file is None:
        return
    log_handler = logging.FileHandler(log_file, "w")
    log_handler.setFormatter(
        logging.Formatter("[%(asctime)s][%(name)s][%(levelname)s]: %(message)s")
    )
    log_handler.setLevel(log_level)
    logger.addHandler(log_handler)


if __name__ == "__main__":
    # Command-line entry point: parse options, prepare outputs, run inference.
    parser = argparse.ArgumentParser()
    parser.add_argument("--log_file", type=str, default=None)
    parser.add_argument("--inference_config", type=str, default="configs/test.yaml") #ToDo
    parser.add_argument("--output_dir", type=str, default="outputs/", help="path to output")
    parser.add_argument("--name", type=str, default="")
    # store_true flag: passing it turns float16 OFF (main() checks `not args.no_use_float16`).
    parser.add_argument("--no_use_float16",
                        action="store_true",
                        help="Disable float16 inference (float16 is used by default to speed up inference)",
    )
    args = parser.parse_args()

    # --log_file used to be parsed and then ignored; honour it when it is given.
    if args.log_file is not None:
        Path(args.log_file).parent.mkdir(parents=True, exist_ok=True)
        set_logger(args.log_file)

    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    main(args)
    logger.info("--- Finished ---")


================================================
FILE: mimicmotion/__init__.py
================================================


================================================
FILE: mimicmotion/dwpose/.gitignore
================================================
*.pyc


================================================
FILE: mimicmotion/dwpose/__init__.py
================================================


================================================
FILE: mimicmotion/dwpose/dwpose_detector.py
================================================
import os

import numpy as np
import torch

from .wholebody import Wholebody

# Set KMP_DUPLICATE_LIB_OK for this process.
# NOTE(review): presumably a workaround for duplicate OpenMP runtime errors — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# Use the GPU when CUDA is available, otherwise the CPU; passed to the module-level detector.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class DWposeDetector:
    """
    Whole-body pose detector for image-like data with a lazily created backend.

    The underlying ``Wholebody`` estimator is only built on the first call, so
    constructing a detector is cheap.

    Parameters:
        model_det: (str) serialized ONNX format model path, 
                    such as https://huggingface.co/yzd-v/DWPose/blob/main/yolox_l.onnx
        model_pose: (str) serialized ONNX format model path, 
                    such as https://huggingface.co/yzd-v/DWPose/blob/main/dw-ll_ucoco_384.onnx
        device: (str) 'cpu' or 'cuda:{device_id}'
    """
    def __init__(self, model_det, model_pose, device='cpu'):
        # Only remember the constructor arguments; the estimator is built on demand.
        self.args = model_det, model_pose, device

    def release_memory(self):
        """Drop the loaded estimator, if there is one, and trigger garbage collection."""
        if not hasattr(self, 'pose_estimation'):
            return
        del self.pose_estimation
        import gc
        gc.collect()

    def __call__(self, oriImg):
        """Detect poses in an (H, W, C) image and return them as a dict.

        Keypoint x/y coordinates are divided by the image width/height. The
        result has the keys ``bodies`` (a dict with ``candidate``, ``subset``
        and ``score``), ``hands``, ``hands_score``, ``faces`` and ``faces_score``.
        """
        if not hasattr(self, 'pose_estimation'):
            self.pose_estimation = Wholebody(*self.args)

        image = oriImg.copy()
        height, width, _ = image.shape
        with torch.no_grad():
            keypoints, scores = self.pose_estimation(image)
            num_people, _, num_coords = keypoints.shape

            # Normalise coordinates by the image size (in place on the estimator output).
            keypoints[..., 0] /= float(width)
            keypoints[..., 1] /= float(height)

            # The first 18 keypoints of every person, flattened to one row per keypoint.
            body = keypoints[:, :18].copy().reshape(num_people * 18, num_coords)

            # For each body keypoint keep its index 18 * person + joint when the score
            # exceeds 0.3, otherwise mark it with -1; the score dtype is kept.
            body_scores = scores[:, :18]
            person_idx, joint_idx = np.indices(body_scores.shape)
            subset = np.where(
                body_scores > 0.3, 18 * person_idx + joint_idx, -1
            ).astype(body_scores.dtype)

            # un_visible = subset < 0.3
            # candidate[un_visible] = -1

            # foot = candidate[:, 18:24]

            faces = keypoints[:, 24:92]
            faces_score = scores[:, 24:92]

            # The two keypoint ranges 92:113 and 113: are stacked along the first axis.
            hands = np.vstack([keypoints[:, 92:113], keypoints[:, 113:]])
            hands_score = np.vstack([scores[:, 92:113], scores[:, 113:]])

            return dict(
                bodies=dict(candidate=body, subset=subset, score=scores[:, :18]),
                hands=hands,
                hands_score=hands_score,
                faces=faces,
                faces_score=faces_score,
            )

# Shared module-level detector instance. Construction only stores the paths and the
# device; the ONNX models are loaded on the first call. The paths are relative to the
# current working directory, and the `DWPose` directory name is case-sensitive on
# case-sensitive file systems.
dwpose_detector = DWposeDetector(
    model_det="./pretrained_weights/DWPose/yolox_l.onnx",
    model_pose="./pretrained_weights/DWPose/dw-ll_ucoco_384.onnx",
    device=device)


================================================
FILE: mimicmotion/dwpose/onnxdet.py
================================================
import cv2
import numpy as np


def nms(boxes, scores, nms_thr):
    """Greedy single-class non-maximum suppression in NumPy.

    Boxes are visited from highest to lowest score; a box is dropped when its
    IoU with an already kept box exceeds ``nms_thr``.

    Args:
        boxes (np.ndarray): shape=(N,4) as (x1, y1, x2, y2); N is number of boxes
        scores (np.ndarray): the score of bboxes
        nms_thr (float): the threshold in NMS

    Returns:
        List[int]: output bbox ids
    """
    left, top, right, bottom = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    # Areas use the inclusive-pixel convention (+1 on each side length).
    box_areas = (right - left + 1) * (bottom - top + 1)

    remaining = scores.argsort()[::-1]
    selected = []
    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        selected.append(best)

        # Intersection rectangle of the best box with every other remaining box.
        inter_left = np.maximum(left[best], left[rest])
        inter_top = np.maximum(top[best], top[rest])
        inter_right = np.minimum(right[best], right[rest])
        inter_bottom = np.minimum(bottom[best], bottom[rest])
        inter_w = np.maximum(0.0, inter_right - inter_left + 1)
        inter_h = np.maximum(0.0, inter_bottom - inter_top + 1)
        intersection = inter_w * inter_h

        iou = intersection / (box_areas[best] + box_areas[rest] - intersection)

        # Keep only the candidates that do not overlap the best box too much.
        remaining = rest[iou <= nms_thr]

    return selected

def multiclass_nms(boxes, scores, nms_thr, score_thr):
    """Class-aware multiclass NMS in NumPy.

    NMS is applied separately to each class column of ``scores``, using only
    the boxes whose class score exceeds ``score_thr``.

    Args:
        boxes (np.ndarray): shape=(N,4); N is number of boxes
        scores (np.ndarray): per-class scores, one column per class
        nms_thr (float): the threshold in NMS
        score_thr (float): the threshold of cls score

    Returns:
        np.ndarray: rows of (x1, y1, x2, y2, score, class index), or None when
        no box survives.
    """
    per_class_dets = []
    for cls_ind in range(scores.shape[1]):
        class_scores = scores[:, cls_ind]
        above_thr = class_scores > score_thr
        if not above_thr.any():
            continue

        kept_scores = class_scores[above_thr]
        kept_boxes = boxes[above_thr]
        keep = nms(kept_boxes, kept_scores, nms_thr)
        if len(keep) == 0:
            continue

        class_column = np.ones((len(keep), 1)) * cls_ind
        per_class_dets.append(
            np.concatenate(
                [kept_boxes[keep], kept_scores[keep, None], class_column], 1
            )
        )

    if not per_class_dets:
        return None
    return np.concatenate(per_class_dets, 0)

def demo_postprocess(outputs, img_size, p6=False):
    """Decode raw detector outputs in place using per-stride grids.

    For every stride a grid of cell coordinates is built. The first two
    channels of ``outputs`` are offset by the grid and multiplied by the
    stride; channels 2:4 are exponentiated and multiplied by the stride.

    Args:
        outputs (np.ndarray): raw predictions; modified in place.
        img_size (tuple): (height, width) of the network input.
        p6 (bool): also use stride 64 when True.

    Returns:
        np.ndarray: the same ``outputs`` array after decoding.
    """
    strides = [8, 16, 32, 64] if p6 else [8, 16, 32]

    cell_grids = []
    stride_maps = []
    for stride in strides:
        rows = img_size[0] // stride
        cols = img_size[1] // stride
        xs, ys = np.meshgrid(np.arange(cols), np.arange(rows))
        cells = np.stack((xs, ys), 2).reshape(1, -1, 2)
        cell_grids.append(cells)
        # One stride value per grid cell, broadcastable against the outputs.
        stride_maps.append(np.full((*cells.shape[:2], 1), stride))

    all_cells = np.concatenate(cell_grids, 1)
    all_strides = np.concatenate(stride_maps, 1)

    outputs[..., :2] = (outputs[..., :2] + all_cells) * all_strides
    outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * all_strides
    return outputs

def preprocess(img, input_size, swap=(2, 0, 1)):
    """Resize an image to fit ``input_size`` and pad it with the value 114.

    The aspect ratio is kept; the resized image is placed in the top-left
    corner of the padded canvas.

    Args:
        img (np.ndarray): input image, (H, W, 3) or (H, W).
        input_size (tuple): (height, width) of the padded output.
        swap (tuple): axis order applied to the padded image before returning.

    Returns:
        tuple: (padded float32 contiguous array with axes permuted by ``swap``,
        resize ratio).
    """
    if len(img.shape) == 3:
        canvas_shape = (input_size[0], input_size[1], 3)
    else:
        canvas_shape = input_size
    canvas = np.full(canvas_shape, 114, dtype=np.uint8)

    # Largest scale at which the whole image still fits inside input_size.
    ratio = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
    new_h = int(img.shape[0] * ratio)
    new_w = int(img.shape[1] * ratio)

    shrunk = cv2.resize(
        img,
        (new_w, new_h),
        interpolation=cv2.INTER_LINEAR,
    ).astype(np.uint8)
    canvas[:new_h, :new_w] = shrunk

    reordered = canvas.transpose(swap)
    return np.ascontiguousarray(reordered, dtype=np.float32), ratio

def inference_detector(session, oriImg):
    """Run the ONNX detector on one image and return the retained boxes.

    The image is letterboxed to 640x640, passed through ``session``, decoded
    with ``demo_postprocess``, converted from centre/size to corner format,
    mapped back to the original image scale and filtered with
    ``multiclass_nms``. Only detections with class index 0 and a score above
    0.3 are kept (class 0 is presumably "person" -- confirm against the
    detector's label map).

    Args:
        session: ONNX Runtime session of the detector.
        oriImg (np.ndarray): Input image.

    Returns:
        np.ndarray: Kept boxes as rows of (x0, y0, x1, y1) in the coordinate
        system of ``oriImg``, or an empty 1-D array when NMS returns None.
    """
    input_shape = (640, 640)
    img, ratio = preprocess(oriImg, input_shape)

    input_name = session.get_inputs()[0].name
    output = session.run(None, {input_name: img[None, :, :, :]})
    predictions = demo_postprocess(output[0], input_shape)[0]

    boxes = predictions[:, :4]
    # Per-class score = objectness * class probability.
    scores = predictions[:, 4:5] * predictions[:, 5:]

    # Centre/size -> corner format, then undo the letterbox scaling.
    centers = boxes[:, :2]
    half_sizes = boxes[:, 2:4] / 2.
    boxes_xyxy = np.ones_like(boxes)
    boxes_xyxy[:, :2] = centers - half_sizes
    boxes_xyxy[:, 2:4] = centers + half_sizes
    boxes_xyxy /= ratio

    dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
    if dets is None:
        return np.array([])

    final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
    keep = np.logical_and(final_scores > 0.3, final_cls_inds == 0)
    return final_boxes[keep]


================================================
FILE: mimicmotion/dwpose/onnxpose.py
================================================
from typing import List, Tuple

import cv2
import numpy as np
import onnxruntime as ort

def preprocess(
    img: np.ndarray, out_bbox, input_size: Tuple[int, int] = (192, 256)
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Crop and normalize one model input per bounding box.

    Args:
        img (np.ndarray): Input image; its first two axes are height and
            width.
        out_bbox: Sequence of boxes, each indexable as (x0, y0, x1, y1).
            When empty, a single box covering the whole image is used.
        input_size (tuple): Model input size in shape (w, h).

    Returns:
        tuple: Three lists with one entry per box.
        - out_img: Affine-warped crops, normalized with a fixed mean/std.
        - out_center: Box centers as returned by ``bbox_xyxy2cs``.
        - out_scale: Box scales as returned by ``top_down_affine``.
    """
    frame_hw = img.shape[:2]
    if len(out_bbox) == 0:
        # No detection: fall back to the full frame.
        out_bbox = [[0, 0, frame_hw[1], frame_hw[0]]]

    # Per-channel normalization constants.
    mean = np.array([123.675, 116.28, 103.53])
    std = np.array([58.395, 57.12, 57.375])

    out_img, out_center, out_scale = [], [], []
    for box in out_bbox:
        bbox = np.array([box[0], box[1], box[2], box[3]])

        # Box -> (center, scale) with 25% padding around the box.
        center, scale = bbox_xyxy2cs(bbox, padding=1.25)

        # Warp the padded box region to the model input size.
        crop, scale = top_down_affine(input_size, scale, center, img)

        out_img.append((crop - mean) / std)
        out_center.append(center)
        out_scale.append(scale)

    return out_img, out_center, out_scale


def inference(sess: ort.InferenceSession, img: np.ndarray) -> np.ndarray:
    """Run the pose model once per preprocessed crop.

    Args:
        sess (ort.InferenceSession): ONNXRuntime session.
        img: Sequence of crops; each crop is transposed with (2, 0, 1)
            before being fed to the session.

    Returns:
        list: One entry per crop, each being the list of outputs returned by
        ``sess.run`` for that crop.
    """
    all_out = []
    for crop in img:
        # The crop is wrapped in a list, which adds the leading batch axis.
        feed = {sess.get_inputs()[0].name: [crop.transpose(2, 0, 1)]}

        # Request every output the model exposes.
        output_names = [node.name for node in sess.get_outputs()]

        all_out.append(sess.run(output_names, feed))

    return all_out


def postprocess(outputs: List[np.ndarray],
                model_input_size: Tuple[int, int],
                center: Tuple[int, int],
                scale: Tuple[int, int],
                simcc_split_ratio: float = 2.0
                ) -> Tuple[np.ndarray, np.ndarray]:
    """Decode per-crop model outputs and map keypoints back to the image.

    Args:
        outputs: Sequence with one (simcc_x, simcc_y) pair per crop.
        model_input_size (tuple): Model input size used to normalize the
            decoded keypoints.
        center: Per-crop box centers, indexed like ``outputs``.
        scale: Per-crop box scales, indexed like ``outputs``.
        simcc_split_ratio (float): Split ratio of simcc.

    Returns:
        tuple:
        - keypoints (np.ndarray): Rescaled keypoints, first instance of each
          crop stacked along a new leading axis.
        - scores (np.ndarray): Matching scores, stacked the same way.
    """
    all_key, all_score = [], []
    for idx, (simcc_x, simcc_y) in enumerate(outputs):
        keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio)

        # Model-input coordinates -> original image coordinates.
        box_scale = scale[idx]
        keypoints = keypoints / model_input_size * box_scale + center[idx] - box_scale / 2

        # Each crop was run on its own, so only the first instance is kept.
        all_key.append(keypoints[0])
        all_score.append(scores[0])

    return np.array(all_key), np.array(all_score)


def bbox_xyxy2cs(bbox: np.ndarray,
                 padding: float = 1.) -> Tuple[np.ndarray, np.ndarray]:
    """Convert corner-format boxes (x1, y1, x2, y2) into (center, scale).

    Args:
        bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted
            as (left, top, right, bottom)
        padding (float): Factor the box width and height are multiplied by
            to obtain the scale. Default: 1.0

    Returns:
        tuple: A tuple containing center and scale.
        - np.ndarray: Center (x, y) of the bbox in shape (2,) or (n, 2)
        - np.ndarray: Scale (w, h) of the bbox in shape (2,) or (n, 2)
    """
    # A single box is processed as a batch of one and unwrapped at the end.
    is_single = bbox.ndim == 1
    boxes = bbox[None, :] if is_single else bbox

    left, top, right, bottom = np.hsplit(boxes, [1, 2, 3])
    center = np.hstack([left + right, top + bottom]) * 0.5
    scale = np.hstack([right - left, bottom - top]) * padding

    if is_single:
        return center[0], scale[0]
    return center, scale


def _fix_aspect_ratio(bbox_scale: np.ndarray,
                      aspect_ratio: float) -> np.ndarray:
    """Grow one side of the scale so that w / h equals ``aspect_ratio``.

    The scale is only ever enlarged: if the box is too wide the height is
    increased, otherwise the width is increased.

    Args:
        bbox_scale (np.ndarray): The scale (w, h) in shape (2, )
        aspect_ratio (float): The target ratio of ``w/h``

    Returns:
        np.ndarray: The adjusted scale in (2, )
    """
    width, height = np.hsplit(bbox_scale, [1])
    too_wide = width > height * aspect_ratio
    keep_width = np.hstack([width, width / aspect_ratio])
    keep_height = np.hstack([height * aspect_ratio, height])
    return np.where(too_wide, keep_width, keep_height)


def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray:
    """Rotate a 2D point about the origin.

    Args:
        pt (np.ndarray): 2D point coordinates (x, y) in shape (2, )
        angle_rad (float): rotation angle in radian

    Returns:
        np.ndarray: Rotated point in shape (2, )
    """
    sin_a = np.sin(angle_rad)
    cos_a = np.cos(angle_rad)
    # Standard 2x2 rotation matrix.
    rotation = np.array([
        [cos_a, -sin_a],
        [sin_a, cos_a],
    ])
    return rotation @ pt


def _get_3rd_point(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Derive a third reference point from two 2D points.

    An affine transform needs three point pairs. The third point is
    ``b`` plus the vector ``a - b`` turned by 90 degrees, i.e. (dx, dy)
    becomes (-dy, dx).

    Args:
        a (np.ndarray): The 1st point (x,y) in shape (2, )
        b (np.ndarray): The 2nd point (x,y) in shape (2, )

    Returns:
        np.ndarray: The 3rd point.
    """
    offset = a - b
    perpendicular = np.r_[-offset[1], offset[0]]
    return b + perpendicular


def get_warp_matrix(center: np.ndarray,
                    scale: np.ndarray,
                    rot: float,
                    output_size: Tuple[int, int],
                    shift: Tuple[float, float] = (0., 0.),
                    inv: bool = False) -> np.ndarray:
    """Build the affine matrix that maps a bbox region onto the output size.

    Three reference points are placed in the source image (box center, a
    point half a box width away in the rotated direction, and a third point
    perpendicular to those two) and matched with the corresponding three
    points in the destination image.

    Args:
        center (np.ndarray[2, ]): Center of the bounding box (x, y).
        scale (np.ndarray[2, ]): Scale of the bounding box
            wrt [width, height].
        rot (float): Rotation angle (degree).
        output_size (np.ndarray[2, ] | list(2,)): Destination size as
            (width, height).
        shift (0-100%): Shift translation ratio wrt the width/height.
            Default (0., 0.).
        inv (bool): Option to inverse the affine transform direction.
            (inv=False: src->dst or inv=True: dst->src)

    Returns:
        np.ndarray: A 2x3 transformation matrix
    """
    shift = np.array(shift)
    src_w = scale[0]
    dst_w, dst_h = output_size[0], output_size[1]

    # Direction vectors of length half a width, in source and destination.
    rot_rad = np.deg2rad(rot)
    src_dir = _rotate_point(np.array([0., src_w * -0.5]), rot_rad)
    dst_dir = np.array([0., dst_w * -0.5])

    # Three reference points in the original image.
    src = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale * shift
    src[1, :] = center + src_dir + scale * shift
    src[2, :] = _get_3rd_point(src[0, :], src[1, :])

    # Matching three reference points in the output image.
    dst_center = np.array([dst_w * 0.5, dst_h * 0.5])
    dst = np.zeros((3, 2), dtype=np.float32)
    dst[0, :] = dst_center
    dst[1, :] = dst_center + dst_dir
    dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])

    src_pts = np.float32(src)
    dst_pts = np.float32(dst)
    if inv:
        return cv2.getAffineTransform(dst_pts, src_pts)
    return cv2.getAffineTransform(src_pts, dst_pts)


def top_down_affine(input_size: dict, bbox_scale: dict, bbox_center: dict,
                    img: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Warp the bbox region of an image to the model input size.

    Note: despite the ``dict`` annotations, ``input_size`` is unpacked as a
    (w, h) pair and ``bbox_scale`` / ``bbox_center`` are forwarded to helpers
    that treat them as arrays.

    Args:
        input_size: The input size of the model as (w, h).
        bbox_scale: The bbox scale (w, h) of the img.
        bbox_center: The bbox center (x, y) of the img.
        img (np.ndarray): The original image.

    Returns:
        tuple:
        - np.ndarray: img after affine transform.
        - np.ndarray: bbox scale after it was extended to the model's
          aspect ratio.
    """
    w, h = input_size
    warp_size = (int(w), int(h))

    # Extend the box so it has the same aspect ratio as the model input.
    bbox_scale = _fix_aspect_ratio(bbox_scale, aspect_ratio=w / h)

    # No rotation is applied.
    warp_mat = get_warp_matrix(bbox_center, bbox_scale, 0, output_size=(w, h))

    warped = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR)
    return warped, bbox_scale


def get_simcc_maximum(simcc_x: np.ndarray,
                      simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Get maximum response location and value from simcc representations.

    Both unbatched ``(K, W)`` and batched ``(N, K, W)`` inputs are accepted;
    the outputs carry the same leading dimensions as the input.

    Note:
        instance number: N
        num_keypoints: K

    Args:
        simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx)
        simcc_y (np.ndarray): y-axis SimCC in shape (K, Wy) or (N, K, Wy)

    Returns:
        tuple:
        - locs (np.ndarray): float32 (x, y) bin indices of the maximum
            responses in shape (K, 2) or (N, K, 2). Entries whose value is
            not positive are set to -1.
        - vals (np.ndarray): per-keypoint value in shape (K,) or (N, K),
            taken as the smaller of the x-axis and y-axis peak responses.

    Raises:
        ValueError: If ``simcc_x`` is neither 2-D nor 3-D.
    """
    batched = simcc_x.ndim == 3
    if batched:
        # Flatten instances and keypoints into one axis for the reductions.
        N, K, _ = simcc_x.shape
        simcc_x = simcc_x.reshape(N * K, -1)
        simcc_y = simcc_y.reshape(N * K, -1)
    elif simcc_x.ndim != 2:
        raise ValueError(
            f'simcc_x must be 2-D (K, Wx) or 3-D (N, K, Wx), got ndim={simcc_x.ndim}')

    # Peak bin along each axis.
    x_locs = np.argmax(simcc_x, axis=1)
    y_locs = np.argmax(simcc_y, axis=1)
    locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
    max_val_x = np.amax(simcc_x, axis=1)
    max_val_y = np.amax(simcc_y, axis=1)

    # The reported value is the weaker of the two per-axis peaks.
    vals = np.where(max_val_x > max_val_y, max_val_y, max_val_x)
    # Mark keypoints without a positive response as invalid.
    locs[vals <= 0.] = -1

    if batched:
        locs = locs.reshape(N, K, 2)
        vals = vals.reshape(N, K)

    return locs, vals


def decode(simcc_x: np.ndarray, simcc_y: np.ndarray,
           simcc_split_ratio) -> Tuple[np.ndarray, np.ndarray]:
    """Turn SimCC responses into keypoint coordinates and scores.

    The peak bin indices found by ``get_simcc_maximum`` are divided by
    ``simcc_split_ratio`` to convert them from bins to coordinates.

    Args:
        simcc_x (np.ndarray): model predicted simcc in x.
        simcc_y (np.ndarray): model predicted simcc in y.
        simcc_split_ratio: The split ratio of simcc (bins per coordinate
            unit).

    Returns:
        tuple:
        - np.ndarray[float32]: keypoint coordinates, (x, y) on the last axis
        - np.ndarray: matching scores
    """
    locs, scores = get_simcc_maximum(simcc_x, simcc_y)
    # In-place division: bin index -> coordinate.
    np.divide(locs, simcc_split_ratio, out=locs)
    return locs, scores


def inference_pose(session, out_bbox, oriImg):
    """Estimate keypoints for every box: preprocess, run the model, decode.

    Args:
        session (ort.InferenceSession): ONNXRuntime session.
        out_bbox (np.ndarray): bbox list
        oriImg (np.ndarray): Input image in shape.

    Returns:
        tuple:
        - keypoints (np.ndarray): Rescaled keypoints.
        - scores (np.ndarray): Model predict scores.
    """
    # The last two entries of the session's input shape are read as (h, w).
    net_h, net_w = session.get_inputs()[0].shape[2:]
    model_input_size = (net_w, net_h)

    crops, centers, scales = preprocess(oriImg, out_bbox, model_input_size)
    raw_outputs = inference(session, crops)
    keypoints, scores = postprocess(raw_outputs, model_input_size, centers, scales)

    return keypoints, scores


================================================
FILE: mimicmotion/dwpose/preprocess.py
================================================
from tqdm import tqdm
import decord
import numpy as np

from .util import draw_pose
from .dwpose_detector import dwpose_detector as dwprocessor

def get_video_pose(
        video_path: str, 
        ref_image: np.ndarray, 
        sample_stride: int=1):
    """Detect poses in a video and linearly align them to a reference image.

    A linear map (scale + offset per axis) is fitted between the body
    keypoints detected in the video and those of the reference image, then
    applied to every detected body, face and hand keypoint before drawing.

    Args:
        video_path (str): video pose path
        ref_image (np.ndarray): reference image
        sample_stride (int, optional): Frame sampling stride; it is further
            multiplied by ``max(1, int(fps / 24))``. Defaults to 1.

    Returns:
        tuple:
        - np.ndarray: stacked pose drawings, one per sampled frame
        - list: the rescaled ``'bodies'`` entry of every sampled frame
        - list: the rescaled ``'faces'`` entry of every sampled frame
    """
    # Pick the body keypoints of the reference pose used for the fit: only
    # ids whose entry in the first subset row is non-negative are kept.
    ref_pose = dwprocessor(ref_image)
    ref_bodies = ref_pose['bodies']
    tracked_ids = [0, 1, 2, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
    if len(ref_bodies['subset']) > 0:
        first_person = ref_bodies['subset'][0]
        ref_keypoint_id = [k for k in tracked_ids if first_person[k] >= .0]
    else:
        ref_keypoint_id = []
    ref_body = ref_bodies['candidate'][ref_keypoint_id]

    height, width, _ = ref_image.shape

    # Read and subsample the input video.
    vr = decord.VideoReader(video_path, ctx=decord.cpu(0))
    sample_stride *= max(1, int(vr.get_avg_fps() / 24))

    frame_ids = list(range(0, len(vr), sample_stride))
    frames = vr.get_batch(frame_ids).asnumpy()
    detected_poses = []
    for frm in tqdm(frames, desc="DWPose"):
        detected_poses.append(dwprocessor(frm))
    dwprocessor.release_memory()

    # Only frames with exactly 18 body candidates contribute to the fit.
    fit_candidates = [
        p['bodies']['candidate'] for p in detected_poses
        if p['bodies']['candidate'].shape[0] == 18
    ]
    detected_bodies = np.stack(fit_candidates)[:, ref_keypoint_id]
    n_fit = len(detected_bodies)

    # Fit y with least squares; derive the x scale from it using the video
    # and reference aspect ratios, then pick the x offset as the mean residual.
    ay, by = np.polyfit(detected_bodies[:, :, 1].flatten(), np.tile(ref_body[:, 1], n_fit), 1)
    fh, fw, _ = vr[0].shape
    ax = ay / (fh / fw / height * width)
    bx = np.mean(np.tile(ref_body[:, 0], n_fit) - detected_bodies[:, :, 0].flatten() * ax)
    a = np.array([ax, ay])
    b = np.array([bx, by])

    # Apply the fitted map to every frame (the pose dicts are updated in
    # place) and render it.
    output_pose, body_point, face_point = [], [], []
    for detected_pose in detected_poses:
        bodies = detected_pose['bodies']
        bodies['candidate'] = bodies['candidate'] * a + b
        detected_pose['faces'] = detected_pose['faces'] * a + b
        detected_pose['hands'] = detected_pose['hands'] * a + b
        rendered = draw_pose(detected_pose, height, width)
        output_pose.append(np.array(rendered))
        body_point.append(detected_pose['bodies'])
        face_point.append(detected_pose['faces'])
    return np.stack(output_pose), body_point, face_point


def get_image_pose(ref_image):
    """Detect the pose in a single image and draw it.

    Args:
        ref_image (np.ndarray): reference image pixel value, with three axes
            of which the first two are height and width

    Returns:
        tuple:
        - np.ndarray: the drawing produced by ``draw_pose``
        - the raw detector output for ``ref_image``
    """
    img_h, img_w, _ = ref_image.shape
    detected = dwprocessor(ref_image)
    rendered = np.array(draw_pose(detected, img_h, img_w))
    return rendered, detected


================================================
FILE: mimicmotion/dwpose/util.py
================================================
import math
import numpy as np
import matplotlib
import cv2
import pdb

eps = 0.01

def alpha_blend_color(color, alpha):
    """Scale every channel of ``color`` by ``alpha`` and truncate to int.
    """
    blended = []
    for channel in color:
        blended.append(int(channel * alpha))
    return blended

def draw_bodypose(canvas, candidate, subset, score):
    """Draw body limbs and joints onto a canvas.

    Limbs are drawn first as filled ellipses on ``canvas`` itself; the canvas
    is then dimmed to 60% into a new array, on which the joints are drawn as
    filled circles. Colors are scaled by the keypoint confidences.

    Args:
        canvas (np.ndarray): Image with three axes (height, width, channels).
        candidate: Keypoint coordinates, (x, y) per row, multiplied by the
            canvas width/height before drawing.
        subset: Per-person rows of indices into ``candidate``; -1 marks a
            joint that is not drawn.
        score: Per-person rows of keypoint confidences.

    Returns:
        np.ndarray: A new uint8 canvas with limbs and joints drawn.
    """
    H, W, C = canvas.shape
    candidate = np.array(candidate)
    subset = np.array(subset)

    stickwidth = 4

    limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
               [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
               [1, 16], [16, 18], [3, 17], [6, 18]]

    colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
              [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
              [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]

    num_people = len(subset)

    # Limbs: only the first 17 entries of limbSeq are drawn.
    for limb_id in range(17):
        # limbSeq is 1-based; shift to 0-based joint ids.
        joint_pair = np.array(limbSeq[limb_id]) - 1
        limb_color = colors[limb_id]
        for person in range(num_people):
            index = subset[person][joint_pair]
            conf = score[person][joint_pair]
            # Skip the limb unless both end points are confident enough.
            if conf[0] < 0.3 or conf[1] < 0.3:
                continue
            rows = index.astype(int)
            xs = candidate[rows, 0] * float(W)
            ys = candidate[rows, 1] * float(H)
            mid_x = np.mean(xs)
            mid_y = np.mean(ys)
            length = ((ys[0] - ys[1]) ** 2 + (xs[0] - xs[1]) ** 2) ** 0.5
            angle = math.degrees(math.atan2(ys[0] - ys[1], xs[0] - xs[1]))
            polygon = cv2.ellipse2Poly(
                (int(mid_x), int(mid_y)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
            cv2.fillConvexPoly(canvas, polygon, alpha_blend_color(limb_color, conf[0] * conf[1]))

    # Dim the limbs so the joints drawn next stand out.
    canvas = (canvas * 0.6).astype(np.uint8)

    # Joints.
    for joint_id in range(18):
        for person in range(num_people):
            index = int(subset[person][joint_id])
            if index == -1:
                continue
            x, y = candidate[index][0:2]
            conf = score[person][joint_id]
            center = (int(x * W), int(y * H))
            cv2.circle(canvas, center, 4, alpha_blend_color(colors[joint_id], conf), thickness=-1)

    return canvas

def draw_handpose(canvas, all_hand_peaks, all_hand_scores):
    """Draw hand skeletons (edges and keypoints) onto the canvas in place.

    Args:
        canvas (np.ndarray): Image with three axes (height, width, channels).
        all_hand_peaks: One sequence of (x, y) keypoints per hand; the
            coordinates are multiplied by the canvas width/height.
        all_hand_scores: One sequence of keypoint confidences per hand.

    Returns:
        np.ndarray: The same ``canvas`` object.
    """
    H, W, C = canvas.shape

    edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
             [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
    num_edges = float(len(edges))

    for peaks, scores in zip(all_hand_peaks, all_hand_scores):

        # Edges: each edge gets its own hue, with brightness scaled by the
        # product of its two end-point confidences.
        for ie, (start, end) in enumerate(edges):
            x1, y1 = peaks[start]
            x2, y2 = peaks[end]
            x1 = int(x1 * W)
            y1 = int(y1 * H)
            x2 = int(x2 * W)
            y2 = int(y2 * H)
            score = int(scores[start] * scores[end] * 255)
            # Skip edges with an end point at (or before) the origin.
            if not (x1 > eps and y1 > eps and x2 > eps and y2 > eps):
                continue
            edge_color = matplotlib.colors.hsv_to_rgb([ie / num_edges, 1.0, 1.0]) * score
            cv2.line(canvas, (x1, y1), (x2, y2), edge_color, thickness=2)

        # Keypoints.
        for i, keypoint in enumerate(peaks):
            x, y = keypoint
            x = int(x * W)
            y = int(y * H)
            score = int(scores[i] * 255)
            if not (x > eps and y > eps):
                continue
            cv2.circle(canvas, (x, y), 4, (0, 0, score), thickness=-1)
    return canvas

def draw_facepose(canvas, all_lmks, all_scores):
    """Draw face landmarks as gray dots onto the canvas in place.

    Args:
        canvas (np.ndarray): Image with three axes (height, width, channels).
        all_lmks: One sequence of (x, y) landmarks per face; the coordinates
            are multiplied by the canvas width/height.
        all_scores: One sequence of landmark confidences per face; a
            confidence sets the gray level of its dot.

    Returns:
        np.ndarray: The same ``canvas`` object.
    """
    H, W, C = canvas.shape
    for lmks, scores in zip(all_lmks, all_scores):
        for (x, y), score in zip(lmks, scores):
            px = int(x * W)
            py = int(y * H)
            conf = int(score * 255)
            # Skip landmarks at (or before) the origin.
            if not (px > eps and py > eps):
                continue
            cv2.circle(canvas, (px, py), 3, (conf, conf, conf), thickness=-1)
    return canvas

def draw_pose(pose, H, W, ref_w=2160):
    """Render body, hand and face keypoints of one pose into an image.

    Drawing happens on an enlarged canvas whose shorter side equals
    ``ref_w``; the result is resized back to (H, W) afterwards.

    Args:
        pose (dict): Detector output with keys ``'bodies'`` (holding
            ``'candidate'``, ``'subset'`` and ``'score'``), ``'faces'``,
            ``'faces_score'``, ``'hands'`` and ``'hands_score'``.
        H (int): height
        W (int): width
        ref_w (int, optional): Length of the shorter canvas side while
            drawing. Defaults to 2160.

    Returns:
        np.ndarray: Channel-first image of shape (3, H, W) after a
        BGR-to-RGB conversion.
    """
    bodies = pose['bodies']
    faces = pose['faces']
    hands = pose['hands']
    candidate = bodies['candidate']
    subset = bodies['subset']

    # Upscale factor that brings the shorter side to ref_w.
    short_side = min(H, W)
    if short_side != ref_w:
        sr = ref_w / short_side
    else:
        sr = 1

    # Start from a black canvas and draw body, hands and face in that order.
    canvas = np.zeros(shape=(int(H*sr), int(W*sr), 3), dtype=np.uint8)
    canvas = draw_bodypose(canvas, candidate, subset, score=bodies['score'])
    canvas = draw_handpose(canvas, hands, pose['hands_score'])
    canvas = draw_facepose(canvas, faces, pose['faces_score'])

    # Back to the requested size, swap channel order, move channels first.
    resized = cv2.resize(canvas, (W, H))
    converted = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    return converted.transpose(2, 0, 1)


================================================
FILE: mimicmotion/dwpose/wholebody.py
================================================
import numpy as np
import onnxruntime as ort

from .onnxdet import inference_detector
from .onnxpose import inference_pose


class Wholebody:
    """Two-stage pose estimator: an ONNX detector followed by an ONNX pose model.
    """
    def __init__(self, model_det, model_pose, device="cpu"):
        """Create the two ONNX Runtime sessions.

        Args:
            model_det: Detector model, passed to ``ort.InferenceSession`` as
                ``path_or_bytes``.
            model_pose: Pose model, passed the same way.
            device (str): ``'cpu'`` selects the CPU provider; any other value
                selects the CUDA provider on device 0.
        """
        if device == 'cpu':
            providers = ['CPUExecutionProvider']
            provider_options = None
        else:
            providers = ['CUDAExecutionProvider']
            provider_options = [{'device_id': 0}]

        self.session_det = ort.InferenceSession(
            path_or_bytes=model_det, providers=providers, provider_options=provider_options
        )
        self.session_pose = ort.InferenceSession(
            path_or_bytes=model_pose, providers=providers, provider_options=provider_options
        )

    def __call__(self, oriImg):
        """Detect boxes, estimate keypoints and reorder them.

        A neck joint is synthesized as the mean of joints 5 and 6
        (presumably the two shoulders -- confirm against the pose model's
        keypoint order), inserted at index 17, and a fixed index mapping is
        applied to the first 18 joints.

        Args:
            oriImg (np.ndarray): detected image

        Returns:
            tuple:
            - keypoints (np.ndarray): (x, y) coordinates on the last axis.
            - scores (np.ndarray): matching confidences.
        """
        boxes = inference_detector(self.session_det, oriImg)
        keypoints, scores = inference_pose(self.session_pose, boxes, oriImg)

        # Pack coordinates and score together so they are reordered jointly.
        packed = np.concatenate((keypoints, scores[..., None]), axis=-1)

        # Neck = midpoint of joints 5 and 6; its score is 1 only when both
        # of those joints score above 0.3, otherwise 0.
        neck = np.mean(packed[:, [5, 6]], axis=1)
        both_confident = np.logical_and(
            packed[:, 5, 2:4] > 0.3,
            packed[:, 6, 2:4] > 0.3)
        neck[:, 2:4] = both_confident.astype(int)
        extended = np.insert(packed, 17, neck, axis=1)

        # Copy joints from their source positions to their target positions.
        mmpose_idx = [
            17, 6, 8, 10, 7, 9, 12, 14, 16, 13, 15, 2, 1, 4, 3
        ]
        openpose_idx = [
            1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17
        ]
        extended[:, openpose_idx] = extended[:, mmpose_idx]

        return extended[..., :2], extended[..., 2]


================================================
FILE: mimicmotion/modules/__init__.py
================================================


================================================
FILE: mimicmotion/modules/attention.py
================================================
from dataclasses import dataclass
from typing import Any, Dict, Optional

import torch
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.models.attention import BasicTransformerBlock, TemporalBasicTransformerBlock
from diffusers.models.embeddings import TimestepEmbedding, Timesteps
from diffusers.models.modeling_utils import ModelMixin
from diffusers.models.resnet import AlphaBlender
from diffusers.utils import BaseOutput
from inspect import isfunction
import math
import torch.nn.functional as F
from torch import nn, einsum
from einops import rearrange, repeat

@dataclass
class TransformerTemporalModelOutput(BaseOutput):
    """
    Output container returned by [`TransformerTemporalModel`] when `return_dict=True`.

    Args:
        sample (`torch.FloatTensor` of shape `(batch_size x num_frames, num_channels, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input.
    """

    # The transformed hidden states; the only field of this output.
    sample: torch.FloatTensor


class TransformerTemporalModel(ModelMixin, ConfigMixin):
    """
    A Transformer model for video-like data.

    The forward pass regroups the input so that every spatial location becomes a sequence over frames, runs the
    transformer blocks on those sequences, and adds the result back to the input (residual connection).

    Parameters:
        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*):
            The number of channels in the input and output (specify if the input is **continuous**).
        out_channels (`int`, *optional*):
            Accepted by the constructor but not referenced in its body; the output always has `in_channels`
            channels.
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups of the input `GroupNorm`.
        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
        attention_bias (`bool`, *optional*):
            Configure if the `TransformerBlock` attention should contain a bias parameter.
        sample_size (`int`, *optional*):
            Accepted by the constructor but not referenced in its body.
        activation_fn (`str`, *optional*, defaults to `"geglu"`):
            Activation function to use in feed-forward. See `diffusers.models.activations.get_activation` for supported
            activation functions.
        norm_elementwise_affine (`bool`, *optional*):
            Configure if the `TransformerBlock` should use learnable elementwise affine parameters for normalization.
        double_self_attention (`bool`, *optional*):
            Configure if each `TransformerBlock` should contain two self-attention layers.
        positional_embeddings: (`str`, *optional*):
            The type of positional embeddings to apply to the sequence input before passing use.
        num_positional_embeddings: (`int`, *optional*):
            The maximum length of the sequence over which to apply positional embeddings.
    """

    @register_to_config
    def __init__(
            self,
            num_attention_heads: int = 16,
            attention_head_dim: int = 88,
            in_channels: Optional[int] = None,
            out_channels: Optional[int] = None,
            num_layers: int = 1,
            dropout: float = 0.0,
            norm_num_groups: int = 32,
            cross_attention_dim: Optional[int] = None,
            attention_bias: bool = False,
            sample_size: Optional[int] = None,
            activation_fn: str = "geglu",
            norm_elementwise_affine: bool = True,
            double_self_attention: bool = True,
            positional_embeddings: Optional[str] = None,
            num_positional_embeddings: Optional[int] = None,
    ):
        super().__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        # Width of the token embeddings processed by the transformer blocks.
        inner_dim = num_attention_heads * attention_head_dim

        self.in_channels = in_channels

        # 1./2. Input normalization and projection from channels to inner_dim.
        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
        self.proj_in = nn.Linear(in_channels, inner_dim)

        # 3. Define transformers blocks
        self.transformer_blocks = nn.ModuleList(
            [
                BasicTransformerBlock(
                    inner_dim,
                    num_attention_heads,
                    attention_head_dim,
                    dropout=dropout,
                    cross_attention_dim=cross_attention_dim,
                    activation_fn=activation_fn,
                    attention_bias=attention_bias,
                    double_self_attention=double_self_attention,
                    norm_elementwise_affine=norm_elementwise_affine,
                    positional_embeddings=positional_embeddings,
                    num_positional_embeddings=num_positional_embeddings,
                )
                for d in range(num_layers)
            ]
        )

        # 4. Output projection back from inner_dim to the input channel count.
        self.proj_out = nn.Linear(inner_dim, in_channels)

    def forward(
            self,
            hidden_states: torch.FloatTensor,
            encoder_hidden_states: Optional[torch.LongTensor] = None,
            timestep: Optional[torch.LongTensor] = None,
            class_labels: torch.LongTensor = None,
            num_frames: int = 1,
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            return_dict: bool = True,
    ) -> TransformerTemporalModelOutput:
        """
        The [`TransformerTemporal`] forward method.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch size x num_frames, channel, height, width)`):
                Input hidden_states. The shape is unpacked as four dimensions, and the first one is split into
                `(batch size, num_frames)`.
            encoder_hidden_states (*optional*):
                Conditional embeddings for cross attention layer, forwarded unchanged to every transformer block.
                If not given, cross-attention defaults to self-attention.
            timestep ( `torch.LongTensor`, *optional*):
                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
                `AdaLayerZeroNorm`.
            num_frames (`int`, *optional*, defaults to 1):
                The number of frames to be processed per batch. This is used to reshape the hidden states.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in [diffusers.models.attention_processor](
                https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`TransformerTemporalModelOutput`] instead of a plain tuple.

        Returns:
            [`TransformerTemporalModelOutput`] or `tuple`:
                If `return_dict` is True, a [`TransformerTemporalModelOutput`] is returned, otherwise a `tuple`
                where the first element is the sample tensor. The sample has the same shape as `hidden_states`.
        """
        # 1. Input
        batch_frames, channel, height, width = hidden_states.shape
        batch_size = batch_frames // num_frames

        # Kept for the residual connection at the end.
        residual = hidden_states

        # (batch*frames, C, H, W) -> (batch, frames, C, H, W) -> (batch, C, frames, H, W) for GroupNorm over C.
        hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, channel, height, width)
        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)

        hidden_states = self.norm(hidden_states)
        # -> (batch, H, W, frames, C) -> (batch*H*W, frames, C): one sequence over frames per spatial location.
        hidden_states = hidden_states.permute(0, 3, 4, 2, 1).reshape(batch_size * height * width, num_frames, channel)

        hidden_states = self.proj_in(hidden_states)

        # 2. Blocks
        for block in self.transformer_blocks:
            hidden_states = block(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                timestep=timestep,
                cross_attention_kwargs=cross_attention_kwargs,
                class_labels=class_labels,
            )

        # 3. Output
        hidden_states = self.proj_out(hidden_states)
        # Undo the regrouping: (batch*H*W, frames, C) -> (batch, H, W, frames, C) -> (batch, frames, C, H, W).
        hidden_states = (
            hidden_states[None, None, :]
            .reshape(batch_size, height, width, num_frames, channel)
            .permute(0, 3, 4, 1, 2)
            .contiguous()
        )
        # Merge batch and frames again to match the input layout.
        hidden_states = hidden_states.reshape(batch_frames, channel, height, width)

        output = hidden_states + residual

        if not return_dict:
            return (output,)

        return TransformerTemporalModelOutput(sample=output)


class TransformerSpatioTemporalModel(nn.Module):
    """
    A Transformer model for video-like data.

    Every layer pairs a spatial `BasicTransformerBlock` with a `TemporalBasicTransformerBlock`; the two results
    are blended by an `AlphaBlender` that is driven by `image_only_indicator`.

    Parameters:
        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*, defaults to 320):
            The number of channels in the input (specify if the input is **continuous**). The group norm uses
            32 groups.
        out_channels (`int`, *optional*):
            Stored as `self.out_channels` (falls back to `in_channels`). The output projection itself always maps
            back to `in_channels` (see the TODO in `__init__`).
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
    """

    def __init__(
            self,
            num_attention_heads: int = 16,
            attention_head_dim: int = 88,
            in_channels: int = 320,
            out_channels: Optional[int] = None,
            num_layers: int = 1,
            cross_attention_dim: Optional[int] = None,
    ):
        super().__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim

        inner_dim = num_attention_heads * attention_head_dim
        self.inner_dim = inner_dim

        # 2. Define input layers
        self.in_channels = in_channels
        self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6)
        self.proj_in = nn.Linear(in_channels, inner_dim)

        # 3. Define transformers blocks
        # NOTE(review): a class named `BasicTransformerBlock` taking `context_dim` (not `cross_attention_dim`)
        # is defined further down in this module; confirm which definition this name resolves to when the
        # model is constructed.
        self.transformer_blocks = nn.ModuleList(
            [
                BasicTransformerBlock(
                    inner_dim,
                    num_attention_heads,
                    attention_head_dim,
                    cross_attention_dim=cross_attention_dim,
                )
                for _ in range(num_layers)
            ]
        )

        time_mix_inner_dim = inner_dim
        self.temporal_transformer_blocks = nn.ModuleList(
            [
                TemporalBasicTransformerBlock(
                    inner_dim,
                    time_mix_inner_dim,
                    num_attention_heads,
                    attention_head_dim,
                    cross_attention_dim=cross_attention_dim,
                )
                for _ in range(num_layers)
            ]
        )

        time_embed_dim = in_channels * 4
        self.time_pos_embed = TimestepEmbedding(in_channels, time_embed_dim, out_dim=in_channels)
        self.time_proj = Timesteps(in_channels, True, 0)
        self.time_mixer = AlphaBlender(alpha=0.5, merge_strategy="learned_with_images")

        # 4. Define output layers
        self.out_channels = in_channels if out_channels is None else out_channels
        # TODO: should use out_channels for continuous projections
        self.proj_out = nn.Linear(inner_dim, in_channels)

        # Controls checkpointing of the transformer blocks only; `proj_in` / `proj_out` are always checkpointed.
        self.gradient_checkpointing = False

    def forward(
            self,
            hidden_states: torch.Tensor,
            encoder_hidden_states: Optional[torch.Tensor] = None,
            image_only_indicator: Optional[torch.Tensor] = None,
            return_dict: bool = True,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch size * num_frames, channel, height, width)`):
                Input hidden_states.
            encoder_hidden_states (`torch.Tensor`):
                Conditional embeddings for the cross attention layers. Required: the context of the first frame
                of every sample is also reused as the context of the temporal blocks.
            image_only_indicator (`torch.LongTensor` of shape `(batch size, num_frames)`):
                A tensor indicating whether the input contains only images. 1 indicates that the input contains only
                images, 0 indicates that the input contains video frames. Required: its last dimension defines
                the number of frames used to split `hidden_states` into samples.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`]
                instead of a plain tuple.

        Returns:
            [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is
                returned, otherwise a `tuple` where the first element is the sample tensor.

        Raises:
            ValueError: If `encoder_hidden_states` or `image_only_indicator` is `None`.
        """
        # Both tensors default to None in the signature but are dereferenced unconditionally below;
        # fail early with an explicit message instead of an opaque attribute/subscript error.
        if encoder_hidden_states is None:
            raise ValueError("`encoder_hidden_states` must be provided to `TransformerSpatioTemporalModel.forward`.")
        if image_only_indicator is None:
            raise ValueError("`image_only_indicator` must be provided to `TransformerSpatioTemporalModel.forward`.")

        # 1. Input
        batch_frames, _, height, width = hidden_states.shape
        num_frames = image_only_indicator.shape[-1]
        batch_size = batch_frames // num_frames

        # Temporal context: take the context of the first frame of each sample and repeat it for every
        # spatial position, giving one context row per (position, sample) pair.
        time_context = encoder_hidden_states
        time_context_first_timestep = time_context[None, :].reshape(
            batch_size, num_frames, -1, time_context.shape[-1]
        )[:, 0]
        time_context = time_context_first_timestep[None, :].broadcast_to(
            height * width, batch_size, 1, time_context.shape[-1]
        )
        time_context = time_context.reshape(height * width * batch_size, 1, time_context.shape[-1])

        residual = hidden_states

        hidden_states = self.norm(hidden_states)
        # Channel count of the (normalised) input; reused to restore the image layout after `proj_out`.
        inner_dim = hidden_states.shape[1]
        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch_frames, height * width, inner_dim)
        # The projections are always checkpointed, independent of `self.gradient_checkpointing`.
        # `use_reentrant` is passed explicitly to match the block checkpointing below.
        hidden_states = torch.utils.checkpoint.checkpoint(self.proj_in, hidden_states, use_reentrant=False)

        # Frame-index embedding: indices 0..num_frames-1 repeated for every sample.
        num_frames_emb = torch.arange(num_frames, device=hidden_states.device)
        num_frames_emb = num_frames_emb.repeat(batch_size, 1)
        num_frames_emb = num_frames_emb.reshape(-1)
        t_emb = self.time_proj(num_frames_emb)

        # `Timesteps` does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=hidden_states.dtype)

        emb = self.time_pos_embed(t_emb)
        emb = emb[:, None, :]

        # 2. Blocks
        for block, temporal_block in zip(self.transformer_blocks, self.temporal_transformer_blocks):
            if self.gradient_checkpointing:
                hidden_states = torch.utils.checkpoint.checkpoint(
                    block,
                    hidden_states,
                    None,
                    encoder_hidden_states,
                    None,
                    use_reentrant=False,
                )
            else:
                hidden_states = block(
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                )

            # The temporal branch sees the spatial output plus the frame-index embedding.
            hidden_states_mix = hidden_states + emb

            if self.gradient_checkpointing:
                hidden_states_mix = torch.utils.checkpoint.checkpoint(
                    temporal_block,
                    hidden_states_mix,
                    num_frames,
                    time_context,
                    use_reentrant=False,
                )
            else:
                hidden_states_mix = temporal_block(
                    hidden_states_mix,
                    num_frames=num_frames,
                    encoder_hidden_states=time_context,
                )

            # Blend the spatial and temporal branches (identical call in both checkpointing modes).
            hidden_states = self.time_mixer(
                x_spatial=hidden_states,
                x_temporal=hidden_states_mix,
                image_only_indicator=image_only_indicator,
            )

        # 3. Output
        hidden_states = torch.utils.checkpoint.checkpoint(self.proj_out, hidden_states, use_reentrant=False)
        hidden_states = hidden_states.reshape(batch_frames, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()

        output = hidden_states + residual

        if not return_dict:
            return (output,)

        return TransformerTemporalModelOutput(sample=output)


# from ldm.modules.diffusionmodules.util import checkpoint


def exists(val):
    """Return True for any value other than `None` (falsy values such as 0 or "" still count)."""
    return not (val is None)


def uniq(arr):
    """Return a `dict_keys` view of the distinct (hashable) items of `arr`, in first-seen order."""
    seen = dict.fromkeys(arr, True)
    return seen.keys()


def default(val, d):
    """
    Return `val` unless it is `None`, in which case fall back to `d`.

    When `d` is a plain Python function (per `inspect.isfunction`) it is called with no
    arguments and its result is used; any other `d` is returned as-is.
    """
    if not exists(val):
        if isfunction(d):
            return d()
        return d
    return val


def max_neg_value(t):
    """Return the most negative finite value representable in the floating dtype of tensor `t`."""
    dtype_info = torch.finfo(t.dtype)
    return -dtype_info.max


def init_(tensor):
    """
    Fill `tensor` in place with samples from U(-b, b), where b = 1 / sqrt(size of the last dim).

    Returns the same tensor object so the call can be chained.
    """
    bound = 1 / math.sqrt(tensor.shape[-1])
    tensor.uniform_(-bound, bound)
    return tensor


# feedforward
class GEGLU(nn.Module):
    """Gated projection: one linear layer emits a value half and a gate half; output is value * GELU(gate)."""

    def __init__(self, dim_in, dim_out):
        super().__init__()
        # Twice the requested width: the result is split in two along the last dimension in forward().
        self.proj = nn.Linear(dim_in, 2 * dim_out)

    def forward(self, x):
        projected = self.proj(x)
        value, gate = projected.chunk(2, dim=-1)
        return value * F.gelu(gate)


class FeedForward(nn.Module):
    """
    Position-wise feed-forward network: input projection -> dropout -> output projection.

    The hidden width is `int(dim * mult)`. With `glu=True` the input projection is a `GEGLU`,
    otherwise it is `Linear` followed by `GELU`. `dim_out` falls back to `dim` when not given.
    """

    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
        super().__init__()
        hidden_dim = int(dim * mult)
        dim_out = default(dim_out, dim)

        if glu:
            project_in = GEGLU(dim, hidden_dim)
        else:
            project_in = nn.Sequential(nn.Linear(dim, hidden_dim), nn.GELU())

        layers = [project_in, nn.Dropout(dropout), nn.Linear(hidden_dim, dim_out)]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)


def zero_module(module):
    """
    Set every parameter of `module` to zero in place and hand the module back.

    The zeroing goes through a detached view, so it is not recorded by autograd.
    """
    for param in module.parameters():
        detached = param.detach()
        detached.zero_()
    return module


def Normalize(in_channels):
    """Build the affine `GroupNorm` used in this module: 32 groups over `in_channels`, eps 1e-6."""
    return torch.nn.GroupNorm(32, in_channels, eps=1e-6, affine=True)


class LinearAttention(nn.Module):
    """
    Multi-head linear attention over a 2D feature map.

    Keys are soft-maxed over the flattened spatial positions, combined with the values into a
    per-head context matrix, and that context is then applied to the queries.
    """

    def __init__(self, dim, heads=4, dim_head=32):
        super().__init__()
        self.heads = heads
        inner_dim = dim_head * heads
        # A single bias-free 1x1 convolution yields queries, keys and values at once.
        self.to_qkv = nn.Conv2d(dim, inner_dim * 3, 1, bias=False)
        self.to_out = nn.Conv2d(inner_dim, dim, 1)

    def forward(self, x):
        _, _, height, width = x.shape
        q, k, v = rearrange(
            self.to_qkv(x),
            'b (qkv heads c) h w -> qkv b heads c (h w)',
            heads=self.heads,
            qkv=3,
        )
        # Normalise the keys across the flattened (h w) axis.
        k = k.softmax(dim=-1)
        context = torch.einsum('bhdn,bhen->bhde', k, v)
        attended = torch.einsum('bhde,bhdn->bhen', context, q)
        merged = rearrange(attended, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=height, w=width)
        return self.to_out(merged)


class SpatialSelfAttention(nn.Module):
    """
    Single-head self-attention across the spatial positions of a feature map.

    The input is group-normalised, projected to queries/keys/values with 1x1 convolutions,
    attended over the flattened (h w) positions, projected back and added to the input.
    """

    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
        # All four projections are channel-preserving 1x1 convolutions.
        conv_kwargs = dict(kernel_size=1, stride=1, padding=0)
        self.q = torch.nn.Conv2d(in_channels, in_channels, **conv_kwargs)
        self.k = torch.nn.Conv2d(in_channels, in_channels, **conv_kwargs)
        self.v = torch.nn.Conv2d(in_channels, in_channels, **conv_kwargs)
        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, **conv_kwargs)

    def forward(self, x):
        normed = self.norm(x)
        q = self.q(normed)
        k = self.k(normed)
        v = self.v(normed)

        # Attention scores between every pair of positions: (b, query position, key position).
        _, channels, height, _ = q.shape
        q = rearrange(q, 'b c h w -> b (h w) c')
        k = rearrange(k, 'b c h w -> b c (h w)')
        scores = torch.einsum('bij,bjk->bik', q, k)

        scores = scores * (int(channels) ** (-0.5))
        weights = torch.nn.functional.softmax(scores, dim=2)

        # Weighted sum of the values; the weights are transposed to (b, key position, query position).
        v = rearrange(v, 'b c h w -> b c (h w)')
        weights = rearrange(weights, 'b i j -> b j i')
        attended = torch.einsum('bij,bjk->bik', v, weights)
        attended = rearrange(attended, 'b c (h w) -> b c h w', h=height)
        update = self.proj_out(attended)

        # Residual connection around the whole attention block.
        return x + update


class CrossAttention(nn.Module):
    """
    Multi-head scaled dot-product attention from `x` (queries) to `context` (keys/values).

    When no `context` is passed to `forward`, `x` attends to itself. `context_dim` falls back
    to `query_dim`. An optional boolean `mask` marks the context positions that may be attended.
    """

    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        context_dim = default(context_dim, query_dim)
        inner_dim = heads * dim_head

        self.scale = dim_head ** -0.5
        self.heads = heads

        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        out_proj = nn.Linear(inner_dim, query_dim)
        self.to_out = nn.Sequential(out_proj, nn.Dropout(dropout))

    def forward(self, x, context=None, mask=None):
        n_heads = self.heads

        def split_heads(t):
            # Fold the head axis into the batch axis: (b, n, h*d) -> (b*h, n, d).
            return rearrange(t, 'b n (h d) -> (b h) n d', h=n_heads)

        queries = self.to_q(x)
        source = default(context, x)
        keys = self.to_k(source)
        values = self.to_v(source)

        q = split_heads(queries)
        k = split_heads(keys)
        v = split_heads(values)

        sim = einsum('b i d, b j d -> b i j', q, k) * self.scale

        if exists(mask):
            # Flatten the mask per sample, repeat it for every head, and push masked-out
            # positions to the most negative finite value before the softmax.
            flat_mask = rearrange(mask, 'b ... -> b (...)')
            fill_value = -torch.finfo(sim.dtype).max
            flat_mask = repeat(flat_mask, 'b j -> (b h) () j', h=n_heads)
            sim.masked_fill_(~flat_mask, fill_value)

        # Normalise over the context positions.
        attn = sim.softmax(dim=-1)

        attended = einsum('b i j, b j d -> b i d', attn, v)
        merged = rearrange(attended, '(b h) n d -> b n (h d)', h=n_heads)
        return self.to_out(merged)


class BasicTransformerBlock(nn.Module):
    """
    Pre-norm transformer block: self-attention, cross-attention, feed-forward.

    Each sub-layer is applied to a `LayerNorm`-ed input and added back to its input.
    The second attention also acts as self-attention when no `context` is given.
    """

    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True):
        super().__init__()
        # attn1 never receives a context, so it is always a self-attention.
        self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout)
        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
        # attn2 attends to `context` when one is supplied to forward().
        self.attn2 = CrossAttention(
            query_dim=dim,
            context_dim=context_dim,
            heads=n_heads,
            dim_head=d_head,
            dropout=dropout,
        )
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)

    def forward(self, x, context=None):
        self_attended = self.attn1(self.norm1(x))
        x = self_attended + x

        cross_attended = self.attn2(self.norm2(x), context=context)
        x = cross_attended + x

        fed_forward = self.ff(self.norm3(x))
        x = fed_forward + x
        return x


class SpatialTransformer(nn.Module):
    """
    Transformer block for image-like data.

    The input is group-normalised, projected with a 1x1 convolution and flattened to
    (b, h*w, c) tokens, run through `depth` transformer blocks, reshaped back to an image
    and projected to `in_channels` by a zero-initialised 1x1 convolution.

    `forward` returns a pair: the projected update added to the input, and the projected
    update on its own.
    """

    def __init__(self, in_channels, n_heads=8, d_head=64,
                 depth=1, dropout=0., context_dim=None):
        super().__init__()
        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = Normalize(in_channels)

        self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)

        blocks = [
            BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
            for _ in range(depth)
        ]
        self.transformer_blocks = nn.ModuleList(blocks)

        # Zero-initialised output projection.
        out_conv = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
        self.proj_out = zero_module(out_conv)

    def forward(self, x, context=None):
        # note: if no context is given, cross-attention defaults to self-attention
        _, _, height, width = x.shape
        x_in = x

        tokens = self.proj_in(self.norm(x))
        tokens = rearrange(tokens, 'b c h w -> b (h w) c')
        for block in self.transformer_blocks:
            tokens = block(tokens, context=context)

        feature_map = rearrange(tokens, 'b (h w) c -> b c h w', h=height, w=width)
        update = self.proj_out(feature_map)
        return update + x_in, update

================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/config.yaml
================================================
# CMP experiment config (alexnet_fcn_32x image encoder, 140000 iterations).
# Three sections: `model` (architecture and optimisation schedule), `data` (loading,
# guidance sampling, augmentation, list files) and `trainer` (logging / validation / checkpoint cadence).
model:
    arch: CMP
    total_iter: 140000
    # presumably the learning rate is multiplied by lr_mults[i] at iteration lr_steps[i] - verify against the trainer
    lr_steps: [80000, 120000]
    lr_mults: [0.1, 0.1]
    lr: 0.1
    optim: SGD
    warmup_lr: []
    warmup_steps: []
    module:
        arch: CMP
        image_encoder: alexnet_fcn_32x
        sparse_encoder: shallownet32x
        flow_decoder: MotionDecoderPlain
        skip_layer: False
        img_enc_dim: 256
        sparse_enc_dim: 16
        # output_dim equals 2 * nbins (2 * 99); presumably one set of bins per flow component - TODO confirm
        output_dim: 198
        decoder_combo: [1,2,4]
        pretrained_image_encoder: False
        flow_criterion: "DiscreteLoss"
        nbins: 99
        fmax: 50
data:
    workers: 2
    batch_size: 12
    batch_size_test: 1
    data_mean: [123.675, 116.28, 103.53] # RGB
    data_div: [58.395, 57.12, 57.375]
    short_size: 416
    crop_size: [384, 384]
    sample_strategy: ['grid', 'watershed']
    sample_bg_ratio: 0.000025
    nms_ks: 81
    max_num_guide: 150

    flow_file_type: "jpg"
    image_flow_aug:
        flip: False
    flow_aug:
        reverse: False
        scale: False
        rotate: False
    # Training draws from two list files; validation uses the yfcc list only.
    train_source:
        - data/yfcc/lists/train.txt
        - data/youtube9000/lists/train.txt
    val_source:
        - data/yfcc/lists/val.txt
    memcached: False
trainer:
    initial_val: True
    print_freq: 100
    val_freq: 10000
    save_freq: 10000
    val_iter: -1
    val_disp_start_iter: 0
    val_disp_end_iter: 16
    loss_record: ['loss_flow']
    tensorboard: False


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/resume.sh
================================================
#!/bin/bash
# Relaunch main.py with --resume and --load-iter 10000 using this directory's config.yaml,
# as 2 nodes x 8 processes per node through torch.distributed.launch.
# Usage: resume.sh <node_rank>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Abort with a usage message instead of handing an empty --node_rank to the launcher.
node_rank=${1:?"usage: $0 <node_rank>"}
python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=2 --node_rank="$node_rank" \
    --master_addr="192.168.1.1" main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter 10000 \
    --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/resume_slurm.sh
================================================
#!/bin/bash
# Relaunch main.py with --resume and --load-iter 10000 through srun:
# 16 tasks, 8 tasks per node, 8 GPUs per node, using this directory's config.yaml.
# Usage: resume_slurm.sh <partition>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Without this check a missing argument would make srun read "-n16" as the partition name.
partition=${1:?"usage: $0 <partition>"}
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n16 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm \
        --load-iter 10000 \
        --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/train.sh
================================================
#!/bin/bash
# Launch main.py with this directory's config.yaml as 2 nodes x 8 processes per node
# through torch.distributed.launch.
# Usage: train.sh <node_rank>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Abort with a usage message instead of handing an empty --node_rank to the launcher.
node_rank=${1:?"usage: $0 <node_rank>"}
python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=2 --node_rank="$node_rank" \
    --master_addr="192.168.1.1" main.py \
    --config "$work_path/config.yaml" --launcher pytorch


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/train_slurm.sh
================================================
#!/bin/bash
# Launch main.py through srun: 16 tasks, 8 tasks per node, 8 GPUs per node,
# using this directory's config.yaml.
# Usage: train_slurm.sh <partition>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Without this check a missing argument would make srun read "-n16" as the partition name.
partition=${1:?"usage: $0 <partition>"}
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n16 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/validate.sh
================================================
#!/bin/bash
# Run main.py with --validate and --load-iter 70000 on 8 local processes through
# torch.distributed.launch, using this directory's config.yaml.
# Usage: validate.sh   (main.py is resolved relative to the current directory)
# NOTE(review): config.yaml in this directory sets total_iter to 140000; confirm that
# iteration 70000 is the checkpoint intended for validation.
# Paths are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
python -m torch.distributed.launch --nproc_per_node=8 main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter 70000 \
    --validate


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/validate_slurm.sh
================================================
#!/bin/bash
# Run main.py with --validate and --load-iter 70000 through srun (8 tasks, 8 GPUs per node),
# using this directory's config.yaml.
# Usage: validate_slurm.sh <partition>   (main.py is resolved relative to the current directory)
# NOTE(review): config.yaml in this directory sets total_iter to 140000; confirm that
# iteration 70000 is the checkpoint intended for validation.
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Without this check a missing argument would make srun read "-n8" as the partition name.
partition=${1:?"usage: $0 <partition>"}
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n8 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py --config "$work_path/config.yaml" --launcher slurm \
        --load-iter 70000 \
        --validate


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/config.yaml
================================================
# CMP experiment config (alexnet_fcn_32x image encoder, 70000 iterations).
# Three sections: `model` (architecture and optimisation schedule), `data` (loading,
# guidance sampling, augmentation, list files) and `trainer` (logging / validation / checkpoint cadence).
model:
    arch: CMP
    total_iter: 70000
    # presumably the learning rate is multiplied by lr_mults[i] at iteration lr_steps[i] - verify against the trainer
    lr_steps: [40000, 60000]
    lr_mults: [0.1, 0.1]
    lr: 0.1
    optim: SGD
    warmup_lr: []
    warmup_steps: []
    module:
        arch: CMP
        image_encoder: alexnet_fcn_32x
        sparse_encoder: shallownet32x
        flow_decoder: MotionDecoderPlain
        skip_layer: False
        img_enc_dim: 256
        sparse_enc_dim: 16
        # output_dim equals 2 * nbins (2 * 99); presumably one set of bins per flow component - TODO confirm
        output_dim: 198
        decoder_combo: [1,2,4]
        pretrained_image_encoder: False
        flow_criterion: "DiscreteLoss"
        nbins: 99
        fmax: 50
data:
    workers: 2
    batch_size: 12
    batch_size_test: 1
    data_mean: [123.675, 116.28, 103.53] # RGB
    data_div: [58.395, 57.12, 57.375]
    short_size: 416
    crop_size: [384, 384]
    sample_strategy: ['grid', 'watershed']
    sample_bg_ratio: 0.00015625
    nms_ks: 41
    max_num_guide: 150

    flow_file_type: "jpg"
    image_flow_aug:
        flip: False
    flow_aug:
        reverse: False
        scale: False
        rotate: False
    # Training and validation both read yfcc list files.
    train_source:
        - data/yfcc/lists/train.txt
    val_source:
        - data/yfcc/lists/val.txt
    memcached: False
trainer:
    initial_val: True
    print_freq: 100
    val_freq: 10000
    save_freq: 10000
    val_iter: -1
    val_disp_start_iter: 0
    val_disp_end_iter: 16
    loss_record: ['loss_flow']
    tensorboard: False


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/resume.sh
================================================
#!/bin/bash
# Relaunch main.py with --resume and --load-iter 10000 using this directory's config.yaml,
# as 2 nodes x 8 processes per node through torch.distributed.launch.
# Usage: resume.sh <node_rank>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Abort with a usage message instead of handing an empty --node_rank to the launcher.
node_rank=${1:?"usage: $0 <node_rank>"}
python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=2 --node_rank="$node_rank" \
    --master_addr="192.168.1.1" main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter 10000 \
    --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/resume_slurm.sh
================================================
#!/bin/bash
# Relaunch main.py with --resume and --load-iter 10000 through srun:
# 16 tasks, 8 tasks per node, 8 GPUs per node, using this directory's config.yaml.
# Usage: resume_slurm.sh <partition>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Without this check a missing argument would make srun read "-n16" as the partition name.
partition=${1:?"usage: $0 <partition>"}
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n16 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm \
        --load-iter 10000 \
        --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/train.sh
================================================
#!/bin/bash
# Launch main.py with this directory's config.yaml as 2 nodes x 8 processes per node
# through torch.distributed.launch.
# Usage: train.sh <node_rank>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Abort with a usage message instead of handing an empty --node_rank to the launcher.
node_rank=${1:?"usage: $0 <node_rank>"}
python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=2 --node_rank="$node_rank" \
    --master_addr="192.168.1.1" main.py \
    --config "$work_path/config.yaml" --launcher pytorch


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/train_slurm.sh
================================================
#!/bin/bash
# Launch main.py through srun: 16 tasks, 8 tasks per node, 8 GPUs per node,
# using this directory's config.yaml.
# Usage: train_slurm.sh <partition>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Without this check a missing argument would make srun read "-n16" as the partition name.
partition=${1:?"usage: $0 <partition>"}
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n16 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/validate.sh
================================================
#!/bin/bash
# Run main.py with --validate and --load-iter 70000 on 8 local processes through
# torch.distributed.launch, using this directory's config.yaml.
# Usage: validate.sh   (main.py is resolved relative to the current directory)
# Paths are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
python -m torch.distributed.launch --nproc_per_node=8 main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter 70000 \
    --validate


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/validate_slurm.sh
================================================
#!/bin/bash
# Run main.py with --validate and --load-iter 70000 through srun (8 tasks, 8 GPUs per node),
# using this directory's config.yaml.
# Usage: validate_slurm.sh <partition>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Without this check a missing argument would make srun read "-n8" as the partition name.
partition=${1:?"usage: $0 <partition>"}
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n8 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py --config "$work_path/config.yaml" --launcher slurm \
        --load-iter 70000 \
        --validate


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/config.yaml
================================================
# CMP experiment config (alexnet_fcn_32x image encoder, 140000 iterations).
# Three sections: `model` (architecture and optimisation schedule), `data` (loading,
# guidance sampling, augmentation, list files) and `trainer` (logging / validation / checkpoint cadence).
model:
    arch: CMP
    total_iter: 140000
    # presumably the learning rate is multiplied by lr_mults[i] at iteration lr_steps[i] - verify against the trainer
    lr_steps: [80000, 120000]
    lr_mults: [0.1, 0.1]
    lr: 0.1
    optim: SGD
    warmup_lr: []
    warmup_steps: []
    module:
        arch: CMP
        image_encoder: alexnet_fcn_32x
        sparse_encoder: shallownet32x
        flow_decoder: MotionDecoderPlain
        skip_layer: False
        img_enc_dim: 256
        sparse_enc_dim: 16
        # output_dim equals 2 * nbins (2 * 99); presumably one set of bins per flow component - TODO confirm
        output_dim: 198
        decoder_combo: [1,2,4]
        pretrained_image_encoder: False
        flow_criterion: "DiscreteLoss"
        nbins: 99
        fmax: 50
data:
    workers: 2
    batch_size: 12
    batch_size_test: 1
    data_mean: [123.675, 116.28, 103.53] # RGB
    data_div: [58.395, 57.12, 57.375]
    short_size: 416
    crop_size: [384, 384]
    sample_strategy: ['grid', 'watershed']
    sample_bg_ratio: 0.00015625
    nms_ks: 41
    max_num_guide: 150

    flow_file_type: "jpg"
    image_flow_aug:
        flip: False
    flow_aug:
        reverse: False
        scale: False
        rotate: False
    # Training and validation both read yfcc list files.
    train_source:
        - data/yfcc/lists/train.txt
    val_source:
        - data/yfcc/lists/val.txt
    memcached: False
trainer:
    initial_val: True
    print_freq: 100
    val_freq: 10000
    save_freq: 10000
    val_iter: -1
    val_disp_start_iter: 0
    val_disp_end_iter: 16
    loss_record: ['loss_flow']
    tensorboard: False


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/resume.sh
================================================
#!/bin/bash
# Relaunch main.py with --resume and --load-iter 10000 on 8 local processes through
# torch.distributed.launch, using this directory's config.yaml.
# Usage: resume.sh   (main.py is resolved relative to the current directory)
# Paths are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
python -m torch.distributed.launch --nproc_per_node=8 main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter 10000 \
    --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/resume_slurm.sh
================================================
#!/bin/bash
# Relaunch main.py with --resume and --load-iter 10000 through srun (8 tasks, 8 GPUs per node),
# using this directory's config.yaml.
# Usage: resume_slurm.sh <partition>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Without this check a missing argument would make srun read "-n8" as the partition name.
partition=${1:?"usage: $0 <partition>"}
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n8 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm \
        --load-iter 10000 \
        --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/train.sh
================================================
#!/bin/bash
# Launch main.py with this directory's config.yaml on 8 local processes through
# torch.distributed.launch.
# Usage: train.sh   (main.py is resolved relative to the current directory)
# Paths are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
python -m torch.distributed.launch --nproc_per_node=8 main.py \
    --config "$work_path/config.yaml" --launcher pytorch


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/train_slurm.sh
================================================
#!/bin/bash
# Launch main.py through srun (8 tasks, 8 GPUs per node), using this directory's config.yaml.
# Usage: train_slurm.sh <partition>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Without this check a missing argument would make srun read "-n8" as the partition name.
partition=${1:?"usage: $0 <partition>"}
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n8 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/validate.sh
================================================
#!/bin/bash
# Run main.py with --validate and --load-iter 70000 on 8 local processes through
# torch.distributed.launch, using this directory's config.yaml.
# Usage: validate.sh   (main.py is resolved relative to the current directory)
# NOTE(review): config.yaml in this directory sets total_iter to 140000; confirm that
# iteration 70000 is the checkpoint intended for validation.
# Paths are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
python -m torch.distributed.launch --nproc_per_node=8 main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter 70000 \
    --validate


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/validate_slurm.sh
================================================
#!/bin/bash
# Run main.py with --validate and --load-iter 70000 through srun (8 tasks, 8 GPUs per node),
# using this directory's config.yaml.
# Usage: validate_slurm.sh <partition>   (main.py is resolved relative to the current directory)
# NOTE(review): config.yaml in this directory sets total_iter to 140000; confirm that
# iteration 70000 is the checkpoint intended for validation.
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Without this check a missing argument would make srun read "-n8" as the partition name.
partition=${1:?"usage: $0 <partition>"}
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n8 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py --config "$work_path/config.yaml" --launcher slurm \
        --load-iter 70000 \
        --validate


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/config.yaml
================================================
# CMP experiment config (resnet50 image encoder, 70000 iterations).
# Three sections: `model` (architecture and optimisation schedule), `data` (loading,
# guidance sampling, augmentation, list files) and `trainer` (logging / validation / checkpoint cadence).
model:
    arch: CMP
    total_iter: 70000
    # presumably the learning rate is multiplied by lr_mults[i] at iteration lr_steps[i] - verify against the trainer
    lr_steps: [40000, 60000]
    lr_mults: [0.1, 0.1]
    lr: 0.1
    optim: SGD
    warmup_lr: []
    warmup_steps: []
    module:
        arch: CMP
        image_encoder: resnet50
        sparse_encoder: shallownet8x
        flow_decoder: MotionDecoderPlain
        skip_layer: False
        img_enc_dim: 256
        sparse_enc_dim: 16
        # output_dim equals 2 * nbins (2 * 99); presumably one set of bins per flow component - TODO confirm
        output_dim: 198
        decoder_combo: [1,2,4]
        pretrained_image_encoder: False
        flow_criterion: "DiscreteLoss"
        nbins: 99
        fmax: 50
data:
    workers: 2
    batch_size: 10
    batch_size_test: 1
    data_mean: [123.675, 116.28, 103.53] # RGB
    data_div: [58.395, 57.12, 57.375]
    short_size: 416
    crop_size: [320, 320]
    sample_strategy: ['grid', 'watershed']
    sample_bg_ratio: 0.00015625
    nms_ks: 15
    # presumably -1 means "no cap on the number of guidance points" - TODO confirm against the data loader
    max_num_guide: -1

    flow_file_type: "jpg"
    image_flow_aug:
        flip: False
    flow_aug:
        reverse: False
        scale: False
        rotate: False
    # Training draws from four list files; validation uses the yfcc list only.
    train_source:
        - data/yfcc/lists/train.txt
        - data/youtube9000/lists/train.txt
        - data/VIP/lists/train.txt
        - data/MPII/lists/train.txt
    val_source:
        - data/yfcc/lists/val.txt
    memcached: False
trainer:
    initial_val: True
    print_freq: 100
    val_freq: 10000
    save_freq: 10000
    val_iter: -1
    val_disp_start_iter: 0
    val_disp_end_iter: 16
    loss_record: ['loss_flow']
    tensorboard: False


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/resume.sh
================================================
#!/bin/bash
# Relaunch main.py with --resume and --load-iter 10000 using this directory's config.yaml,
# as 2 nodes x 8 processes per node through torch.distributed.launch.
# Usage: resume.sh <node_rank>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Abort with a usage message instead of handing an empty --node_rank to the launcher.
node_rank=${1:?"usage: $0 <node_rank>"}
python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=2 --node_rank="$node_rank" \
    --master_addr="192.168.1.1" main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter 10000 \
    --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/resume_slurm.sh
================================================
#!/bin/bash
# Relaunch main.py with --resume and --load-iter 10000 through srun:
# 16 tasks, 8 tasks per node, 8 GPUs per node, using this directory's config.yaml.
# Usage: resume_slurm.sh <partition>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Without this check a missing argument would make srun read "-n16" as the partition name.
partition=${1:?"usage: $0 <partition>"}
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n16 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm \
        --load-iter 10000 \
        --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/train.sh
================================================
#!/bin/bash
# Launch main.py with this directory's config.yaml as 2 nodes x 8 processes per node
# through torch.distributed.launch.
# Usage: train.sh <node_rank>   (main.py is resolved relative to the current directory)
# Paths and arguments are quoted so a checkout path containing spaces is not word-split.
work_path=$(dirname "$0")
# Abort with a usage message instead of handing an empty --node_rank to the launcher.
node_rank=${1:?"usage: $0 <node_rank>"}
python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=2 --node_rank="$node_rank" \
    --master_addr="192.168.1.1" main.py \
    --config "$work_path/config.yaml" --launcher pytorch


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/train_slurm.sh
================================================
#!/bin/bash
# Start training through SLURM: 16 tasks in total, 8 tasks (and 8 GPUs) per node.
# Usage: train_slurm.sh <partition>   (run from the directory that contains main.py)
# Expansions are quoted so paths containing spaces survive word splitting.
work_path="$(dirname "$0")"
partition="${1:?usage: $0 <partition>}"
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n16 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/validate.sh
================================================
#!/bin/bash
# Run validation with 8 local processes.
# Usage: validate.sh [load_iter]   (run from the directory that contains main.py)
# load_iter is the checkpoint iteration handed to --load-iter; it defaults to
# 70000, the value that was previously hard-coded here.
work_path="$(dirname "$0")"
load_iter="${1:-70000}"
python -m torch.distributed.launch --nproc_per_node=8 main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter "$load_iter" \
    --validate


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/validate_slurm.sh
================================================
#!/bin/bash
# Run validation through SLURM with 8 tasks on 8 GPUs.
# Usage: validate_slurm.sh <partition> [load_iter]
#        (run from the directory that contains main.py)
# load_iter is the checkpoint iteration handed to --load-iter; it defaults to
# 70000, the value that was previously hard-coded here.
work_path="$(dirname "$0")"
partition="${1:?usage: $0 <partition> [load_iter]}"
load_iter="${2:-70000}"
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n8 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py --config "$work_path/config.yaml" --launcher slurm \
        --load-iter "$load_iter" \
        --validate


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/config.yaml
================================================
# CMP training configuration for this experiment directory; the sibling
# launcher scripts hand it to main.py through --config.
# NOTE(review): main.py and the trainer are not visible from this file, so key
# meanings below are inferred from their names unless a comment says otherwise.
model:
    arch: CMP
    # Presumably the training length plus a step schedule in which lr is
    # scaled by lr_mults[i] at iteration lr_steps[i] — TODO confirm.
    total_iter: 42000
    lr_steps: [24000, 36000]
    lr_mults: [0.1, 0.1]
    lr: 0.1
    optim: SGD
    # Empty lists: presumably no warm-up phase.
    warmup_lr: []
    warmup_steps: []
    module:
        arch: CMP
        image_encoder: resnet50
        sparse_encoder: shallownet8x
        flow_decoder: MotionDecoderPlain
        skip_layer: False
        img_enc_dim: 256
        sparse_enc_dim: 16
        # 198 = 2 * nbins: DiscreteLoss (losses.py) asserts that the prediction
        # has nbins * 2 channels, the first nbins scored against the x-flow
        # bins and the remaining nbins against the y-flow bins.
        output_dim: 198
        decoder_combo: [1,2,4]
        pretrained_image_encoder: False
        flow_criterion: "DiscreteLoss"
        # DiscreteLoss clamps flow to (-fmax, fmax) and quantizes it into
        # nbins uniform bins; it requires nbins to be odd.
        nbins: 99
        fmax: 50
data:
    workers: 2
    # NOTE(review): probably a per-process batch size — confirm in the trainer.
    batch_size: 16
    batch_size_test: 1
    data_mean: [123.675, 116.28, 103.53] # RGB
    data_div: [58.395, 57.12, 57.375]
    short_size: 333
    crop_size: [256, 256]
    # Guidance-point sampling settings (semantics live in the data loader,
    # which is not shown here).
    sample_strategy: ['grid', 'watershed']
    sample_bg_ratio: 0.00005632
    nms_ks: 49
    max_num_guide: -1

    flow_file_type: "jpg"
    # All augmentations are switched off in this experiment.
    image_flow_aug:
        flip: False
    flow_aug:
        reverse: False
        scale: False
        rotate: False
    # List files are given relative to the working directory.
    train_source:
        - data/yfcc/lists/train.txt
    val_source:
        - data/yfcc/lists/val.txt
    memcached: False
trainer:
    initial_val: True
    # Frequencies are expressed in iterations (presumably).
    print_freq: 100
    val_freq: 10000
    save_freq: 10000
    val_iter: -1
    val_disp_start_iter: 0
    val_disp_end_iter: 16
    loss_record: ['loss_flow']
    tensorboard: False


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/resume.sh
================================================
#!/bin/bash
# Resume training on one node of a 2-node x 8-process torch.distributed job.
# Usage: resume.sh <node_rank>   (run from the directory that contains main.py)
# Passes --load-iter 10000 --resume to main.py; the master address is fixed below.
# Expansions are quoted so paths containing spaces survive word splitting.
work_path="$(dirname "$0")"
node_rank="${1:?usage: $0 <node_rank>}"
python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=2 --node_rank="$node_rank" \
    --master_addr="192.168.1.1" main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter 10000 \
    --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/resume_slurm.sh
================================================
#!/bin/bash
# Resume training through SLURM: 16 tasks in total, 8 tasks (and 8 GPUs) per node.
# Usage: resume_slurm.sh <partition>   (run from the directory that contains main.py)
# Passes --load-iter 10000 --resume to main.py.
# Expansions are quoted so paths containing spaces survive word splitting.
work_path="$(dirname "$0")"
partition="${1:?usage: $0 <partition>}"
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n16 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm \
        --load-iter 10000 \
        --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/train.sh
================================================
#!/bin/bash
# Start training on one node of a 2-node x 8-process torch.distributed job.
# Usage: train.sh <node_rank>   (run from the directory that contains main.py)
# The master address is fixed below.
# Expansions are quoted so paths containing spaces survive word splitting.
work_path="$(dirname "$0")"
node_rank="${1:?usage: $0 <node_rank>}"
python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=2 --node_rank="$node_rank" \
    --master_addr="192.168.1.1" main.py \
    --config "$work_path/config.yaml" --launcher pytorch


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/train_slurm.sh
================================================
#!/bin/bash
# Start training through SLURM: 16 tasks in total, 8 tasks (and 8 GPUs) per node.
# Usage: train_slurm.sh <partition>   (run from the directory that contains main.py)
# Expansions are quoted so paths containing spaces survive word splitting.
work_path="$(dirname "$0")"
partition="${1:?usage: $0 <partition>}"
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n16 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/validate.sh
================================================
#!/bin/bash
# Run validation with 8 local processes.
# Usage: validate.sh [load_iter]   (run from the directory that contains main.py)
# load_iter is the checkpoint iteration handed to --load-iter. The default is
# 42000 to match total_iter in this directory's config.yaml; the previously
# hard-coded 70000 lies beyond the end of this 42k-iteration schedule.
# NOTE(review): assumes the trainer writes a checkpoint at the final
# iteration — otherwise pass a saved iteration (e.g. 40000) explicitly.
work_path="$(dirname "$0")"
load_iter="${1:-42000}"
python -m torch.distributed.launch --nproc_per_node=8 main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter "$load_iter" \
    --validate


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/validate_slurm.sh
================================================
#!/bin/bash
# Run validation through SLURM with 8 tasks on 8 GPUs.
# Usage: validate_slurm.sh <partition> [load_iter]
#        (run from the directory that contains main.py)
# load_iter is the checkpoint iteration handed to --load-iter. The default is
# 42000 to match total_iter in this directory's config.yaml; the previously
# hard-coded 70000 lies beyond the end of this 42k-iteration schedule.
# NOTE(review): assumes the trainer writes a checkpoint at the final
# iteration — otherwise pass a saved iteration (e.g. 40000) explicitly.
work_path="$(dirname "$0")"
partition="${1:?usage: $0 <partition> [load_iter]}"
load_iter="${2:-42000}"
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n8 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py --config "$work_path/config.yaml" --launcher slurm \
        --load-iter "$load_iter" \
        --validate


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/config.yaml
================================================
# CMP training configuration for this experiment directory; the sibling
# launcher scripts hand it to main.py through --config.
# NOTE(review): main.py and the trainer are not visible from this file, so key
# meanings below are inferred from their names unless a comment says otherwise.
model:
    arch: CMP
    # Presumably the training length plus a step schedule in which lr is
    # scaled by lr_mults[i] at iteration lr_steps[i] — TODO confirm.
    total_iter: 42000
    lr_steps: [24000, 36000]
    lr_mults: [0.1, 0.1]
    lr: 0.1
    optim: SGD
    # Empty lists: presumably no warm-up phase.
    warmup_lr: []
    warmup_steps: []
    module:
        arch: CMP
        image_encoder: resnet50
        sparse_encoder: shallownet8x
        flow_decoder: MotionDecoderPlain
        skip_layer: False
        img_enc_dim: 256
        sparse_enc_dim: 16
        # 198 = 2 * nbins: DiscreteLoss (losses.py) asserts that the prediction
        # has nbins * 2 channels, the first nbins scored against the x-flow
        # bins and the remaining nbins against the y-flow bins.
        output_dim: 198
        decoder_combo: [1,2,4]
        pretrained_image_encoder: False
        flow_criterion: "DiscreteLoss"
        # DiscreteLoss clamps flow to (-fmax, fmax) and quantizes it into
        # nbins uniform bins; it requires nbins to be odd.
        nbins: 99
        fmax: 50
data:
    workers: 2
    # NOTE(review): probably a per-process batch size — confirm in the trainer.
    batch_size: 10
    batch_size_test: 1
    data_mean: [123.675, 116.28, 103.53] # RGB
    data_div: [58.395, 57.12, 57.375]
    short_size: 416
    crop_size: [320, 320]
    # Guidance-point sampling settings (semantics live in the data loader,
    # which is not shown here).
    sample_strategy: ['grid', 'watershed']
    sample_bg_ratio: 0.00003629
    nms_ks: 67
    max_num_guide: -1

    flow_file_type: "jpg"
    # All augmentations are switched off in this experiment.
    image_flow_aug:
        flip: False
    flow_aug:
        reverse: False
        scale: False
        rotate: False
    # List files are given relative to the working directory.
    train_source:
        - data/yfcc/lists/train.txt
    val_source:
        - data/yfcc/lists/val.txt
    memcached: False
trainer:
    initial_val: True
    # Frequencies are expressed in iterations (presumably).
    print_freq: 100
    val_freq: 10000
    save_freq: 10000
    val_iter: -1
    val_disp_start_iter: 0
    val_disp_end_iter: 16
    loss_record: ['loss_flow']
    tensorboard: False


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/resume.sh
================================================
#!/bin/bash
# Resume training on one node of a 2-node x 8-process torch.distributed job.
# Usage: resume.sh <node_rank>   (run from the directory that contains main.py)
# Passes --load-iter 10000 --resume to main.py; the master address is fixed below.
# Expansions are quoted so paths containing spaces survive word splitting.
work_path="$(dirname "$0")"
node_rank="${1:?usage: $0 <node_rank>}"
python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=2 --node_rank="$node_rank" \
    --master_addr="192.168.1.1" main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter 10000 \
    --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/resume_slurm.sh
================================================
#!/bin/bash
# Resume training through SLURM: 16 tasks in total, 8 tasks (and 8 GPUs) per node.
# Usage: resume_slurm.sh <partition>   (run from the directory that contains main.py)
# Passes --load-iter 10000 --resume to main.py.
# Expansions are quoted so paths containing spaces survive word splitting.
work_path="$(dirname "$0")"
partition="${1:?usage: $0 <partition>}"
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n16 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm \
        --load-iter 10000 \
        --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/train.sh
================================================
#!/bin/bash
# Start training on one node of a 2-node x 8-process torch.distributed job.
# Usage: train.sh <node_rank>   (run from the directory that contains main.py)
# The master address is fixed below.
# Expansions are quoted so paths containing spaces survive word splitting.
work_path="$(dirname "$0")"
node_rank="${1:?usage: $0 <node_rank>}"
python -m torch.distributed.launch --nproc_per_node=8 \
    --nnodes=2 --node_rank="$node_rank" \
    --master_addr="192.168.1.1" main.py \
    --config "$work_path/config.yaml" --launcher pytorch


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/train_slurm.sh
================================================
#!/bin/bash
# Start training through SLURM: 16 tasks in total, 8 tasks (and 8 GPUs) per node.
# Usage: train_slurm.sh <partition>   (run from the directory that contains main.py)
# Expansions are quoted so paths containing spaces survive word splitting.
work_path="$(dirname "$0")"
partition="${1:?usage: $0 <partition>}"
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n16 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/validate.sh
================================================
#!/bin/bash
# Run validation with 8 local processes.
# Usage: validate.sh [load_iter]   (run from the directory that contains main.py)
# load_iter is the checkpoint iteration handed to --load-iter. The default is
# 42000 to match total_iter in this directory's config.yaml; the previously
# hard-coded 70000 lies beyond the end of this 42k-iteration schedule.
# NOTE(review): assumes the trainer writes a checkpoint at the final
# iteration — otherwise pass a saved iteration (e.g. 40000) explicitly.
work_path="$(dirname "$0")"
load_iter="${1:-42000}"
python -m torch.distributed.launch --nproc_per_node=8 main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter "$load_iter" \
    --validate


================================================
FILE: mimicmotion/modules/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/validate_slurm.sh
================================================
#!/bin/bash
# Run validation through SLURM with 8 tasks on 8 GPUs.
# Usage: validate_slurm.sh <partition> [load_iter]
#        (run from the directory that contains main.py)
# load_iter is the checkpoint iteration handed to --load-iter. The default is
# 42000 to match total_iter in this directory's config.yaml; the previously
# hard-coded 70000 lies beyond the end of this 42k-iteration schedule.
# NOTE(review): assumes the trainer writes a checkpoint at the final
# iteration — otherwise pass a saved iteration (e.g. 40000) explicitly.
work_path="$(dirname "$0")"
partition="${1:?usage: $0 <partition> [load_iter]}"
load_iter="${2:-42000}"
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n8 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py --config "$work_path/config.yaml" --launcher slurm \
        --load-iter "$load_iter" \
        --validate


================================================
FILE: mimicmotion/modules/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml
================================================
# CMP training configuration for the semi-automatic annotation experiment; the
# sibling launcher scripts hand it to main.py through --config.
# NOTE(review): main.py and the trainer are not visible from this file, so key
# meanings below are inferred from their names unless a comment says otherwise.
model:
    arch: CMP
    # Presumably the training length plus a step schedule in which lr is
    # scaled by lr_mults[i] at iteration lr_steps[i] — TODO confirm.
    total_iter: 42000
    lr_steps: [24000, 36000]
    lr_mults: [0.1, 0.1]
    lr: 0.1
    optim: SGD
    # Empty lists: presumably no warm-up phase.
    warmup_lr: []
    warmup_steps: []
    module:
        arch: CMP
        image_encoder: resnet50
        sparse_encoder: shallownet8x
        # This experiment selects the skip-layer decoder and sets skip_layer
        # to True.
        flow_decoder: MotionDecoderSkipLayer
        skip_layer: True
        img_enc_dim: 256
        sparse_enc_dim: 16
        # 198 = 2 * nbins: DiscreteLoss (losses.py) asserts that the prediction
        # has nbins * 2 channels, the first nbins scored against the x-flow
        # bins and the remaining nbins against the y-flow bins.
        output_dim: 198
        decoder_combo: [1,2,4]
        pretrained_image_encoder: False
        flow_criterion: "DiscreteLoss"
        # DiscreteLoss clamps flow to (-fmax, fmax) and quantizes it into
        # nbins uniform bins; it requires nbins to be odd.
        nbins: 99
        fmax: 50
data:
    workers: 2
    # NOTE(review): probably a per-process batch size — confirm in the trainer.
    batch_size: 8
    batch_size_test: 1
    data_mean: [123.675, 116.28, 103.53] # RGB
    data_div: [58.395, 57.12, 57.375]
    short_size: 416
    crop_size: [384, 384]
    # Guidance-point sampling settings (semantics live in the data loader,
    # which is not shown here).
    sample_strategy: ['grid', 'watershed']
    sample_bg_ratio: 5.74e-5
    nms_ks: 41
    max_num_guide: -1

    flow_file_type: "jpg"
    # All augmentations are switched off in this experiment.
    image_flow_aug:
        flip: False
    flow_aug:
        reverse: False
        scale: False
        rotate: False
    # List files are given relative to the working directory.
    train_source:
        - data/VIP/lists/train.txt
        - data/MPII/lists/train.txt
    val_source:
        - data/VIP/lists/randval.txt
    memcached: False
trainer:
    initial_val: True
    # Frequencies are expressed in iterations (presumably).
    print_freq: 100
    val_freq: 5000
    save_freq: 5000
    val_iter: -1
    val_disp_start_iter: 0
    val_disp_end_iter: 16
    loss_record: ['loss_flow']
    tensorboard: True


================================================
FILE: mimicmotion/modules/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/resume.sh
================================================
#!/bin/bash
# Resume training with 8 local processes.
# Usage: resume.sh   (run from the directory that contains main.py)
# Passes --load-iter 10000 --resume to main.py.
# Expansions are quoted so paths containing spaces survive word splitting.
work_path="$(dirname "$0")"
python -m torch.distributed.launch --nproc_per_node=8 main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter 10000 \
    --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/resume_slurm.sh
================================================
#!/bin/bash
# Resume training through SLURM with 8 tasks on 8 GPUs.
# Usage: resume_slurm.sh <partition>   (run from the directory that contains main.py)
# Passes --load-iter 10000 --resume to main.py.
# Expansions are quoted so paths containing spaces survive word splitting.
work_path="$(dirname "$0")"
partition="${1:?usage: $0 <partition>}"
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n8 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm \
        --load-iter 10000 \
        --resume


================================================
FILE: mimicmotion/modules/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/train.sh
================================================
#!/bin/bash
# Start training with 8 local processes.
# Usage: train.sh   (run from the directory that contains main.py)
# Expansions are quoted so paths containing spaces survive word splitting.
work_path="$(dirname "$0")"
python -m torch.distributed.launch --nproc_per_node=8 main.py \
    --config "$work_path/config.yaml" --launcher pytorch


================================================
FILE: mimicmotion/modules/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/train_slurm.sh
================================================
#!/bin/bash
# Start training through SLURM with 8 tasks on 8 GPUs.
# Usage: train_slurm.sh <partition>   (run from the directory that contains main.py)
# Expansions are quoted so paths containing spaces survive word splitting.
work_path="$(dirname "$0")"
partition="${1:?usage: $0 <partition>}"
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" -n8 \
    --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py \
        --config "$work_path/config.yaml" --launcher slurm


================================================
FILE: mimicmotion/modules/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/validate.sh
================================================
#!/bin/bash
# Run validation with 8 local processes.
# Usage: validate.sh [load_iter]   (run from the directory that contains main.py)
# load_iter is the checkpoint iteration handed to --load-iter. The default is
# 42000 to match total_iter in this directory's config.yaml; the previously
# hard-coded 70000 lies beyond the end of this 42k-iteration schedule.
# NOTE(review): assumes the trainer writes a checkpoint at the final
# iteration — otherwise pass a saved iteration (e.g. 40000) explicitly.
work_path="$(dirname "$0")"
load_iter="${1:-42000}"
python -m torch.distributed.launch --nproc_per_node=8 main.py \
    --config "$work_path/config.yaml" --launcher pytorch \
    --load-iter "$load_iter" \
    --validate


================================================
FILE: mimicmotion/modules/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/validate_slurm.sh
================================================
#!/bin/bash
# Run validation through SLURM with 8 tasks on 8 GPUs.
# Usage: validate_slurm.sh <partition> [load_iter]
#        (run from the directory that contains main.py)
# load_iter is the checkpoint iteration handed to --load-iter. The default is
# 42000 to match total_iter in this directory's config.yaml; the previously
# hard-coded 70000 lies beyond the end of this 42k-iteration schedule.
# NOTE(review): assumes the trainer writes a checkpoint at the final
# iteration — otherwise pass a saved iteration (e.g. 40000) explicitly.
work_path="$(dirname "$0")"
partition="${1:?usage: $0 <partition> [load_iter]}"
load_iter="${2:-42000}"
GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p "$partition" \
    -n8 --gres=gpu:8 --ntasks-per-node=8 \
    python -u main.py --config "$work_path/config.yaml" --launcher slurm \
        --load-iter "$load_iter" \
        --validate


================================================
FILE: mimicmotion/modules/cmp/losses.py
================================================
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import random
import math

def MultiChannelSoftBinaryCrossEntropy(input, target, reduction='mean'):
    '''
    Soft-target cross entropy computed over pairs of logit channels.

    input:  N x 2C x H x W logits, regrouped as CN x 2 x H x W
            (N x 38 x H x W --> 19N x 2 x H x W in the original use case)
    target: N x C x H x W soft labels, regrouped as CN x 1 x H x W
    reduction: 'mean' averages the per-pixel terms, any other value sums them.

    The single target channel is broadcast against both log-probabilities of
    its pair before the two products are added together.
    '''
    height, width = input.size(2), input.size(3)
    paired_logits = input.view(-1, 2, height, width)
    soft_labels = target.view(-1, 1, height, width)

    log_probs = nn.LogSoftmax(dim=1)(paired_logits)
    per_pixel = torch.sum(-soft_labels * log_probs, dim=1)
    reducer = torch.mean if reduction == 'mean' else torch.sum
    return reducer(per_pixel)

class EdgeAwareLoss():
    """Loss between Sobel edge-magnitude maps of a prediction and a target.

    Args:
        nc: number of channels of the tensors passed to ``get_edge``.
        loss_type: 'L1' (SmoothL1 on the edge maps) or 'BCE'. The 'BCE' call
            path is not implemented and raises ``NotImplementedError``.
        reduction: reduction forwarded to the underlying loss.

    The Sobel kernels are stored as plain CPU tensors and moved to the device
    and dtype of the incoming tensor on every call, so the loss works on CPU
    as well as on any GPU.
    """
    def __init__(self, nc=2, loss_type="L1", reduction='mean'):
        assert loss_type in ['L1', 'BCE'], "Undefined loss type: {}".format(loss_type)
        self.nc = nc
        self.loss_type = loss_type
        # One 3x3 Sobel kernel per channel, shaped (nc, 1, 3, 3) for a grouped conv.
        self.kernelx = torch.Tensor([[1,0,-1],[2,0,-2],[1,0,-1]]).repeat(nc,1,1,1)
        self.kernely = torch.Tensor([[1,2,1],[0,0,0],[-1,-2,-1]]).repeat(nc,1,1,1)
        self.bias = torch.zeros(nc)
        self.reduction = reduction
        if loss_type == 'L1':
            self.loss = nn.SmoothL1Loss(reduction=reduction)
        elif loss_type == 'BCE':
            self.loss = self.bce2d

    def bce2d(self, input, target):
        """Class-balanced binary cross entropy; weights derive from mean(target)."""
        assert not target.requires_grad
        beta = 1 - torch.mean(target)
        # weight is (1 - beta) where target == 0 and beta where target == 1
        weights = 1 - beta + (2 * beta - 1)  * target
        loss = nn.functional.binary_cross_entropy(input, target, weights, reduction=self.reduction)
        return loss

    def get_edge(self, var):
        """Return the channel-averaged Sobel gradient magnitude, shape N x 1 x H x W."""
        assert var.size(1) == self.nc, \
            "input size at dim 1 should be consistent with nc, {} vs {}".format(var.size(1), self.nc)
        # Match the input's device and dtype instead of assuming CUDA.
        kernelx = self.kernelx.to(var)
        kernely = self.kernely.to(var)
        bias = self.bias.to(var)
        outputx = nn.functional.conv2d(var, kernelx, bias=bias, padding=1, groups=self.nc)
        outputy = nn.functional.conv2d(var, kernely, bias=bias, padding=1, groups=self.nc)
        eps=1e-05  # keeps sqrt differentiable where both gradients are zero
        return torch.sqrt(outputx.pow(2) + outputy.pow(2) + eps).mean(dim=1, keepdim=True)

    def __call__(self, input, target):
        """Resize ``input`` to the target's spatial size and compare edge maps.

        Raises:
            NotImplementedError: if the loss was built with loss_type='BCE'.
        """
        size = target.shape[2:4]
        input = nn.functional.interpolate(input, size=size, mode="bilinear", align_corners=True)
        target_edge = self.get_edge(target)
        if self.loss_type == 'L1':
            return self.loss(self.get_edge(input), target_edge)
        elif self.loss_type == 'BCE':
            # `raise NotImplemented` is a TypeError at runtime; use the exception class.
            raise NotImplementedError("EdgeAwareLoss with loss_type='BCE' is not implemented")
            #target_edge = torch.sign(target_edge - 0.1)
            #pred = self.get_edge(nn.functional.sigmoid(input))
            #return self.loss(pred, target_edge)

def KLD(mean, logvar):
    """Return -0.5 * sum(1 + logvar - mean^2 - exp(logvar)) over all elements."""
    per_element = 1 + logvar - mean.pow(2) - logvar.exp()
    return -0.5 * torch.sum(per_element)

class DiscreteLoss(nn.Module):
    """Cross-entropy loss on flow quantized into ``nbins`` uniform bins per axis.

    Args:
        nbins: number of bins per flow component; must be odd.
        fmax: flow magnitude limit; targets are clamped to (-fmax, fmax).

    The prediction must have ``2 * nbins`` channels: the first ``nbins`` are
    scored against the x-component bins, the remaining ones against y.
    """
    def __init__(self, nbins, fmax):
        super().__init__()
        self.loss = nn.CrossEntropyLoss()
        assert nbins % 2 == 1, "nbins should be odd"
        self.nbins = nbins
        self.fmax = fmax
        self.step = 2 * fmax / float(nbins)

    def tobin(self, target):
        """Quantize a flow tensor into integer bin indices (same device as the input)."""
        target = torch.clamp(target, -self.fmax + 1e-3, self.fmax - 1e-3)
        quantized_target = torch.floor((target + self.fmax) / self.step)
        # .long() keeps the tensor on its current device instead of forcing CUDA.
        return quantized_target.long()

    def forward(self, input, target):
        """Return CE(x-logits, x-bins) + CE(y-logits, y-bins).

        ``input`` is N x 2*nbins x h x w logits and is bilinearly resized when
        its spatial size differs from ``target`` (N x 2 x H x W flow).
        """
        size = target.shape[2:4]
        if input.shape[2] != size[0] or input.shape[3] != size[1]:
            input = nn.functional.interpolate(input, size=size, mode="bilinear", align_corners=True)
        target = self.tobin(target)
        assert input.size(1) == self.nbins * 2
        # Safety cap kept from the original code (which hard-coded 99 / 98 for
        # nbins = 99): any index that reaches nbins is folded into the last bin.
        target[target >= self.nbins] = self.nbins - 1
        return self.loss(input[:,:self.nbins,...], target[:,0,...]) + self.loss(input[:,self.nbins:,...], target[:,1,...])

class MultiDiscreteLoss():
    """Discretized-flow cross entropy for one prediction or a list of predictions.

    Args:
        nbins: number of bins per flow component; must be odd.
        fmax: flow magnitude limit; targets are clamped to (-fmax, fmax).
        reduction: reduction forwarded to ``nn.CrossEntropyLoss``.
        xy_weight: (x_weight, y_weight) multipliers of the two component losses.
        quantize_strategy: 'linear' (uniform bins) or 'quadratic'
            (square-root spacing around zero).
    """
    def __init__(self, nbins=19, fmax=47.5, reduction='mean', xy_weight=(1., 1.), quantize_strategy='linear'):
        self.loss = nn.CrossEntropyLoss(reduction=reduction)
        assert nbins % 2 == 1, "nbins should be odd"
        self.nbins = nbins
        self.fmax = fmax
        self.step = 2 * fmax / float(nbins)
        self.x_weight, self.y_weight = xy_weight
        self.quantize_strategy = quantize_strategy

    def tobin(self, target):
        """Quantize flow into integer bin indices on the input's own device.

        Raises:
            ValueError: if ``quantize_strategy`` is neither 'linear' nor 'quadratic'.
        """
        target = torch.clamp(target, -self.fmax + 1e-3, self.fmax - 1e-3)
        if self.quantize_strategy == "linear":
            quantized_target = torch.floor((target + self.fmax) / self.step)
        elif self.quantize_strategy == "quadratic":
            ind = target.data > 0
            quantized_target = target.clone()
            quantized_target[ind] = torch.floor(self.nbins * torch.sqrt(target[ind] / (4 * self.fmax)) + self.nbins / 2.)
            quantized_target[~ind] = torch.floor(-self.nbins * torch.sqrt(-target[~ind] / (4 * self.fmax)) + self.nbins / 2.)
        else:
            # Previously fell through to an UnboundLocalError; fail with the
            # same message JointDiscreteLoss uses.
            raise ValueError("No such quantize strategy: {}".format(self.quantize_strategy))
        # .long() keeps the tensor on its current device instead of forcing CUDA.
        return quantized_target.long()

    def _xy_loss(self, logits, target):
        # Weighted sum of the x-component and y-component cross entropies.
        loss_x = self.loss(logits[:,:self.nbins,...], target[:,0,...])
        loss_y = self.loss(logits[:,self.nbins:,...], target[:,1,...])
        return self.x_weight * loss_x + self.y_weight * loss_y

    def __call__(self, input, target):
        """Return the loss; a list of predictions yields the mean of their losses.

        Every prediction is bilinearly resized to the target's spatial size.
        """
        size = target.shape[2:4]
        target = self.tobin(target)
        if isinstance(input, list):
            resized = [nn.functional.interpolate(ip, size=size, mode="bilinear", align_corners=True) for ip in input]
            return sum([self._xy_loss(ip, target) for ip in resized]) / float(len(input))
        input = nn.functional.interpolate(input, size=size, mode="bilinear", align_corners=True)
        return self._xy_loss(input, target)

class MultiL1Loss():
    """SmoothL1 loss for a single prediction or a list of predictions.

    Each prediction is bilinearly resized to the target's spatial size; for a
    list, the per-prediction losses are averaged.
    """
    def __init__(self, reduction='mean'):
        self.loss = nn.SmoothL1Loss(reduction=reduction)

    def __call__(self, input, target):
        out_hw = target.shape[2:4]

        def _loss_at_target_size(pred):
            # Resize one prediction to the target resolution, then score it.
            resized = nn.functional.interpolate(pred, size=out_hw, mode="bilinear", align_corners=True)
            return self.loss(resized, target)

        if not isinstance(input, list):
            return _loss_at_target_size(input)
        per_scale = [_loss_at_target_size(pred) for pred in input]
        return sum(per_scale) / float(len(input))

class MultiMSELoss():
    """Sum of MSE losses over paired sequences of predictions and targets."""
    def __init__(self):
        self.loss = nn.MSELoss()

    def __call__(self, predicts, targets):
        # Pairs are matched positionally; an empty input yields the integer 0.
        return sum(self.loss(pred, tgt) for pred, tgt in zip(predicts, targets))
        
class JointDiscreteLoss():
    """Cross entropy over the joint (x, y) flow bin, i.e. ``nbins ** 2`` classes.

    Args:
        nbins: number of bins per flow component; must be odd.
        fmax: flow magnitude limit; targets are clamped to (-fmax, fmax).
        reduction: reduction forwarded to ``nn.CrossEntropyLoss``.
        quantize_strategy: 'linear' (uniform bins) or 'quadratic'
            (square-root spacing around zero).
    """
    def __init__(self, nbins=19, fmax=47.5, reduction='mean', quantize_strategy='linear'):
        self.loss = nn.CrossEntropyLoss(reduction=reduction)
        assert nbins % 2 == 1, "nbins should be odd"
        self.nbins = nbins
        self.fmax = fmax
        self.step = 2 * fmax / float(nbins)
        self.quantize_strategy = quantize_strategy

    def tobin(self, target):
        """Map N x 2 x H x W flow to N x H x W joint class indices (x_bin * nbins + y_bin).

        Raises:
            ValueError: if ``quantize_strategy`` is neither 'linear' nor 'quadratic'.
        """
        target = torch.clamp(target, -self.fmax + 1e-3, self.fmax - 1e-3)
        if self.quantize_strategy == "linear":
            quantized_target = torch.floor((target + self.fmax) / self.step)
        elif self.quantize_strategy == "quadratic":
            ind = target.data > 0
            quantized_target = target.clone()
            quantized_target[ind] = torch.floor(self.nbins * torch.sqrt(target[ind] / (4 * self.fmax)) + self.nbins / 2.)
            quantized_target[~ind] = torch.floor(-self.nbins * torch.sqrt(-target[~ind] / (4 * self.fmax)) + self.nbins / 2.)
        else:
            # ValueError is a subclass of Exception, so existing handlers still match.
            raise ValueError("No such quantize strategy: {}".format(self.quantize_strategy))
        joint_target = quantized_target[:,0,:,:] * self.nbins + quantized_target[:,1,:,:]
        # .long() keeps the tensor on its current device instead of forcing CUDA.
        return joint_target.long()

    def __call__(self, input, target):
        """Return the cross entropy of N x nbins**2 x H x W logits against the joint bins."""
        target = self.tobin(target)
        assert input.size(1) == self.nbins ** 2
        return self.loss(input, target)

class PolarDiscreteLoss():
    """Cross entropy on flow quantized in polar form (angle bins and radius bins).

    Args:
        abins: number of angle bins.
        rbins: number of radius bins.
        fmax: radius limit; radii are clamped to [0, fmax).
        reduction: reduction forwarded to ``nn.CrossEntropyLoss``.
        ar_weight: (angle_weight, radius_weight) multipliers of the two losses.
        quantize_strategy: 'linear' or 'quadratic' spacing of the radius bins.

    The prediction must have ``abins + rbins`` channels: angle logits first,
    radius logits after.
    """
    def __init__(self, abins=30, rbins=20, fmax=50., reduction='mean', ar_weight=(1., 1.), quantize_strategy='linear'):
        self.loss = nn.CrossEntropyLoss(reduction=reduction)
        self.fmax = fmax
        self.rbins = rbins
        self.abins = abins
        self.a_weight, self.r_weight = ar_weight
        self.quantize_strategy = quantize_strategy

    def tobin(self, target):
        """Map N x 2 x H x W flow to N x 2 x H x W integer (angle_bin, radius_bin).

        The result lives on the same device as ``target``.

        Raises:
            Exception: if ``quantize_strategy`` is neither 'linear' nor 'quadratic'.
        """
        flow_x = target.data[:,0,:,:]
        flow_y = target.data[:,1,:,:]
        indxneg = flow_x < 0
        # Nudge exact zeros in x so the division below stays finite.
        eps = torch.zeros_like(flow_x)
        eps[flow_x == 0] += 1e-5
        angle = torch.atan(flow_y / (flow_x + eps))
        angle[indxneg] += np.pi
        angle += np.pi / 2 # 0 to 2pi
        angle = torch.clamp(angle, 0, 2 * np.pi - 1e-3)
        radius = torch.sqrt(flow_x ** 2 + flow_y ** 2)
        radius = torch.clamp(radius, 0, self.fmax - 1e-3)
        quantized_angle = torch.floor(self.abins * angle / (2 * np.pi))
        if self.quantize_strategy == 'linear':
            quantized_radius = torch.floor(self.rbins * radius / self.fmax)
        elif self.quantize_strategy == 'quadratic':
            quantized_radius = torch.floor(self.rbins * torch.sqrt(radius / self.fmax))
        else:
            raise Exception("No such quantize strategy: {}".format(self.quantize_strategy))
        quantized_target = torch.cat([torch.unsqueeze(quantized_angle, 1), torch.unsqueeze(quantized_radius, 1)], dim=1)
        # .long() keeps the tensor on its current device instead of forcing CUDA.
        return quantized_target.long()

    def __call__(self, input, target):
        """Return a_weight * CE(angle logits) + r_weight * CE(radius logits)."""
        target = self.tobin(target)
        assert (target >= 0).all() and (target[:,0,:,:] < self.abins).all() and (target[:,1,:,:] < self.rbins).all()
        return self.a_weight * self.loss(input[:,:self.abins,...], target[:,0,...]) + self.r_weight * self.loss(input[:,self.abins:,...], target[:,1,...])

class WeightedDiscreteLoss():
    """Discretized-flow cross entropy with a down-weighted centre bin.

    Args:
        nbins: number of bins per flow component; must be odd.
        fmax: flow magnitude limit; targets are clamped to (-fmax, fmax).
        reduction: reduction forwarded to ``CrossEntropy2d``.

    The class-weight vector is all ones except for the centre bin (index
    ``int(fmax / step)``), which is weighted 0.01.
    """
    def __init__(self, nbins=19, fmax=47.5, reduction='mean'):
        self.loss = CrossEntropy2d(reduction=reduction)
        assert nbins % 2 == 1, "nbins should be odd"
        self.nbins = nbins
        self.fmax = fmax
        self.step = 2 * fmax / float(nbins)
        self.weight = np.ones((nbins), dtype=np.float32)
        self.weight[int(self.fmax / self.step)] = 0.01
        # Stored on the CPU; moved to the prediction's device/dtype at call time.
        self.weight = torch.from_numpy(self.weight)

    def tobin(self, target):
        """Quantize flow into integer bin indices on the input's own device."""
        target = torch.clamp(target, -self.fmax + 1e-3, self.fmax - 1e-3)
        return torch.floor((target + self.fmax) / self.step).long()

    def __call__(self, input, target):
        """Return CE(x-logits, x-bins) + weighted CE(y-logits, y-bins)."""
        target = self.tobin(target)
        assert (target >= 0).all() and (target < self.nbins).all()
        weight = self.weight.to(input)
        # NOTE(review): only the y-component term receives the class weights;
        # this mirrors the original code — confirm whether x should be weighted too.
        return self.loss(input[:,:self.nbins,...], target[:,0,...]) + self.loss(input[:,self.nbins:,...], target[:,1,...], weight)


class CrossEntropy2d(nn.Module):
    """Per-pixel cross entropy that skips negative and ignored labels."""
    def __init__(self, reduction='mean', ignore_label=-1):
        super(CrossEntropy2d, self).__init__()
        self.ignore_label = ignore_label
        self.reduction = reduction

    def forward(self, predict, target, weight=None):
        """
            Args:
                predict:(n, c, h, w)
                target:(n, h, w)
                weight (Tensor, optional): a manual rescaling weight given to each class.
                                           If given, has to be a Tensor of size "nclasses"
            Returns:
                The cross entropy over all pixels whose label is >= 0 and
                differs from ``ignore_label``, reduced according to ``reduction``.
        """
        assert not target.requires_grad
        assert predict.dim() == 4
        assert target.dim() == 3
        assert predict.size(0) == target.size(0), "{0} vs {1} ".format(predict.size(0), target.size(0))
        assert predict.size(2) == target.size(1), "{0} vs {1} ".format(predict.size(2), target.size(1))
        # target is 3-D, so its width is dim 2 (size(3) raised IndexError while
        # building this message instead of reporting the mismatch).
        assert predict.size(3) == target.size(2), "{0} vs {1} ".format(predict.size(3), target.size(2))
        target_mask = (target >= 0) * (target != self.ignore_label)
        target = target[target_mask]
        # Channels-last layout lets the (n, h, w) mask pick one c-vector per kept pixel.
        predict = predict.permute(0, 2, 3, 1).contiguous()
        predict = predict[target_mask].view(-1, predict.size(3))
        loss = F.cross_entropy(predict, target, weight=weight, reduction=self.reduction)
        return loss

#class CrossPixelSimilarityLoss():
#    '''
#        Modified from: https://github.com/lppllppl920/Challenge2018/blob/master/loss.py
#    '''
#    def __init__(self, sigma=0.0036, sampling_size=512):
#        self.sigma = sigma
#        self.sampling_size = sampling_size
#        self.epsilon = 1.0e-15
#        self.embed_norm = True # loss does not decrease no matter it is true or false.
#
#    def __call__(self, embeddings, flows):
#        '''
#            embedding: Variable Nx256xHxW (not hyper-column)
#            flows: Variable Nx2xHxW
#        '''
#        assert flows.size(1) == 2
#
#        # flow normalization
#        positive_mask = (flows > 0)
#        flows = -torch.clamp(torch.log(torch.abs(flows) + 1) / math.log(50. + 1), max=1.)
#        flows[positive_mask] = -flows[positive_mask]
#
#        # embedding normalization
#        if self.embed_norm:
#            embeddings /= torch.norm(embeddings, p=2, dim=1, keepdim=True)
#
#        # Spatially random sampling (512 samples)
#        flows_flatten = flows.view(flows.shape[0], 2, -1)
#        random_locations = Variable(torch.from_numpy(np.array(random.sample(range(flows_flatten.shape[2]), self.sampling_size))).long().cuda())
#        flows_sample = torch.index_select(flows_flatten, 2, random_locations)
#
#        # K_f
#        k_f = self.epsilon + torch.norm(torch.unsqueeze(flows_sample, dim=-1).permute(0, 3, 2, 1) -
#                                        torch.unsqueeze(flows_sample, dim=-1).permute(0, 2, 3, 1), p=2, dim=3,
#                                        keepdim=False) ** 2
#        exp_k_f = torch.exp(-k_f / 2. / self.sigma)
#
#        
#        # mask
#        eye = Variable(torch.unsqueeze(torch.eye(k_f.shape[1]), dim=0).cuda())
#        mask = torch.ones_like(exp_k_f) - eye
#
#        # S_f
#        masked_exp_k_f = torch.mul(mask, exp_k_f) + eye
#        s_f = masked_exp_k_f / torch.sum(masked_exp_k_f, dim=1, keepdim=True)
#
#        # K_theta
#        embeddings_flatten = embeddings.view(embeddings.shape[0], embeddings.shape[1], -1)
#        embeddings_sample = torch.index_select(embeddings_flatten, 2, random_locations)
#        embeddings_sample_norm = torch.norm(embeddings_sample, p=2, dim=1, keepdim=True)
#        k_theta = 0.25 * (torch.matmul(embeddings_sample.permute(0, 2, 1), embeddings_sample)) / (self.epsilon + torch.matmul(embeddings_sample_norm.permute(0, 2, 1), embeddings_sample_norm))
#        exp_k_theta = torch.exp(k_theta)
#
#        # S_theta
#        masked_exp_k_theta = torch.mul(mask, exp_k_theta) + math.exp(-0.75) * eye
#        s_theta = masked_exp_k_theta / torch.sum(masked_exp_k_theta, dim=1, keepdim=True)
#
#        # loss
#        loss = -torch.mean(torch.mul(s_f, torch.log(s_theta)))
#
#        return loss

class CrossPixelSimilarityLoss():
    '''
        Cross-pixel similarity loss between embeddings and optical flow,
        evaluated on a random subset of pixel locations.

        Modified from: https://github.com/lppllppl920/Challenge2018/blob/master/loss.py

        Args:
            sigma: bandwidth of the Gaussian kernel applied to flow differences.
            sampling_size: number of pixel locations sampled per call; capped
                at the number of available pixels.
    '''
    def __init__(self, sigma=0.01, sampling_size=512):
        self.sigma = sigma
        self.sampling_size = sampling_size
        self.epsilon = 1.0e-15
        self.embed_norm = True # loss does not decrease no matter it is true or false.

    def __call__(self, embeddings, flows):
        '''
            embedding: Variable Nx256xHxW (not hyper-column)
            flows: Variable Nx2xHxW

            Returns the scalar loss -mean(S_f * log(S_theta)), where S_f and
            S_theta are the column-normalized similarity matrices of the
            sampled flows and embeddings. Neither argument is modified.
        '''
        assert flows.size(1) == 2

        # flow normalization: sign-preserving log scaling, magnitude capped at 1
        positive_mask = (flows > 0)
        flows = -torch.clamp(torch.log(torch.abs(flows) + 1) / math.log(50. + 1), max=1.)
        flows[positive_mask] = -flows[positive_mask]

        # embedding normalization (out of place: the caller's tensor is left
        # untouched and the input that torch.norm saved for backward stays valid)
        if self.embed_norm:
            embeddings = embeddings / torch.norm(embeddings, p=2, dim=1, keepdim=True)

        # Spatially random sampling (at most sampling_size locations)
        flows_flatten = flows.view(flows.shape[0], 2, -1)
        num_pixels = flows_flatten.shape[2]
        num_samples = min(self.sampling_size, num_pixels)
        random_locations = torch.tensor(random.sample(range(num_pixels), num_samples),
                                        dtype=torch.long, device=flows.device)
        flows_sample = torch.index_select(flows_flatten, 2, random_locations)

        # K_f: squared pairwise distance between the sampled flow vectors
        k_f = self.epsilon + torch.norm(torch.unsqueeze(flows_sample, dim=-1).permute(0, 3, 2, 1) -
                                        torch.unsqueeze(flows_sample, dim=-1).permute(0, 2, 3, 1), p=2, dim=3,
                                        keepdim=False) ** 2
        exp_k_f = torch.exp(-k_f / 2. / self.sigma)

        # mask: zero the diagonal, then put ones on it (created on the inputs' device)
        eye = torch.eye(k_f.shape[1], device=k_f.device, dtype=k_f.dtype).unsqueeze(0)
        mask = torch.ones_like(exp_k_f) - eye

        # S_f
        masked_exp_k_f = torch.mul(mask, exp_k_f) + eye
        s_f = masked_exp_k_f / torch.sum(masked_exp_k_f, dim=1, keepdim=True)

        # K_theta: scaled cosine similarity between the sampled embeddings
        embeddings_flatten = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1)
        embeddings_sample = torch.index_select(embeddings_flatten, 2, random_locations)
        embeddings_sample_norm = torch.norm(embeddings_sample, p=2, dim=1, keepdim=True)
        k_theta = 0.25 * (torch.matmul(embeddings_sample.permute(0, 2, 1), embeddings_sample)) / (self.epsilon + torch.matmul(embeddings_sample_norm.permute(0, 2, 1), embeddings_sample_norm))
        exp_k_theta = torch.exp(k_theta)

        # S_theta
        masked_exp_k_theta = torch.mul(mask, exp_k_theta) + eye
        s_theta = masked_exp_k_theta / torch.sum(masked_exp_k_theta, dim=1, keepdim=True)

        # loss
        loss = -torch.mean(torch.mul(s_f, torch.log(s_theta)))

        return loss


class CrossPixelSimilarityFullLoss():
    '''
        Cross-pixel similarity loss computed over every spatial location
        (no random sub-sampling).

        A pairwise affinity matrix is built from the (normalized) flow vectors
        and another one from the embeddings; the loss is the cross-entropy
        between the two column-normalized matrices.

        Modified from: https://github.com/lppllppl920/Challenge2018/blob/master/loss.py
    '''
    def __init__(self, sigma=0.01):
        self.sigma = sigma
        self.epsilon = 1.0e-15
        self.embed_norm = True # loss does not decrease no matter it is true or false.

    def __call__(self, embeddings, flows):
        '''
            embeddings: N x C x h x w tensor (not hyper-column); left untouched
            flows: N x 2 x H x W tensor, H is expected to be a multiple of h

            Returns a scalar tensor located on the same device as the inputs.
        '''
        assert flows.size(1) == 2

        # downsample flow
        factor = flows.shape[2] // embeddings.shape[2]
        flows = nn.functional.avg_pool2d(flows, factor, factor)
        assert flows.shape[2] == embeddings.shape[2]

        # flow normalization: log-compress the magnitude into [0, 1] and
        # re-apply the sign of the original flow component
        magnitude = torch.clamp(torch.log(torch.abs(flows) + 1) / math.log(50. + 1), max=1.)
        flows = torch.where(flows > 0, magnitude, -magnitude)

        # embedding normalization (out of place so the caller's tensor is not
        # modified; the clamp keeps all-zero embedding vectors from producing NaN)
        if self.embed_norm:
            norm = torch.norm(embeddings, p=2, dim=1, keepdim=True)
            embeddings = embeddings / norm.clamp(min=self.epsilon)

        # every location takes part, P = h * w
        flows_flatten = flows.reshape(flows.shape[0], 2, -1)

        # K_f: squared distance between every pair of flow vectors, N x P x P
        k_f = self.epsilon + torch.norm(torch.unsqueeze(flows_flatten, dim=-1).permute(0, 3, 2, 1) -
                                        torch.unsqueeze(flows_flatten, dim=-1).permute(0, 2, 3, 1), p=2, dim=3,
                                        keepdim=False) ** 2
        exp_k_f = torch.exp(-k_f / 2. / self.sigma)

        # mask: created on the device of the inputs instead of assuming CUDA
        eye = torch.unsqueeze(torch.eye(k_f.shape[1], device=exp_k_f.device), dim=0)
        mask = torch.ones_like(exp_k_f) - eye

        # S_f: diagonal forced to 1, then normalized along dim 1
        masked_exp_k_f = torch.mul(mask, exp_k_f) + eye
        s_f = masked_exp_k_f / torch.sum(masked_exp_k_f, dim=1, keepdim=True)

        # K_theta: scaled cosine similarity between every pair of embeddings
        embeddings_flatten = embeddings.reshape(embeddings.shape[0], embeddings.shape[1], -1)
        embeddings_flatten_norm = torch.norm(embeddings_flatten, p=2, dim=1, keepdim=True)
        k_theta = 0.25 * (torch.matmul(embeddings_flatten.permute(0, 2, 1), embeddings_flatten)) / (self.epsilon + torch.matmul(embeddings_flatten_norm.permute(0, 2, 1), embeddings_flatten_norm))
        exp_k_theta = torch.exp(k_theta)

        # S_theta
        masked_exp_k_theta = torch.mul(mask, exp_k_theta) + eye
        s_theta = masked_exp_k_theta / torch.sum(masked_exp_k_theta, dim=1, keepdim=True)

        # loss
        loss = -torch.mean(torch.mul(s_f, torch.log(s_theta)))

        return loss


def get_column(embeddings, index, full_size):
    '''
        Gather the feature vectors at `index` from each feature map and stack
        them along the channel axis.

        embeddings: iterable of N x C_i x h_i x w_i tensors
        index: 1-D tensor of positions expressed at resolution `full_size`
        full_size: reference size the positions are rescaled from

        Each feature map is flattened to N x C_i x (h_i * w_i) and indexed
        with `index / full_size * h_i` (truncated to integers).

        NOTE(review): the caller in this file passes flattened H*W positions
        together with full_size = H; confirm that this single rescale is the
        intended mapping for feature maps smaller than the flow.
    '''
    return torch.cat(
        [
            torch.index_select(
                feat.view(feat.shape[0], feat.shape[1], -1),
                2,
                (index.float() / full_size * feat.size(2)).long(),
            )
            for feat in embeddings
        ],
        dim=1,
    ) # N x coldim x sparsenum

class CrossPixelSimilarityColumnLoss(nn.Module):
    '''
        Cross-pixel similarity loss on hyper-columns: features gathered from
        several layers at randomly sampled locations are projected by an MLP
        to a 16-d embedding, whose pairwise affinities are matched against
        the affinities of the flow at the same locations.

        Modified from: https://github.com/lppllppl920/Challenge2018/blob/master/loss.py
    '''
    def __init__(self, sigma=0.0036, sampling_size=512):
        super(CrossPixelSimilarityColumnLoss, self).__init__()
        self.sigma = sigma
        self.sampling_size = sampling_size
        self.epsilon = 1.0e-15
        self.embed_norm = True # loss does not decrease no matter it is true or false.
        # the input width is the summed channel count of the gathered layers
        self.mlp = nn.Sequential(
            nn.Linear(96 + 96 + 384 + 256 + 4096, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 16))

    def forward(self, feats, flows):
        '''
            feats: iterable of feature maps handed to get_column; their
                   channels are expected to add up to the MLP input width
            flows: N x 2 x H x W tensor

            Returns a scalar tensor on the device of `flows`.
            random.sample raises ValueError when H * W < sampling_size.
        '''
        assert flows.size(1) == 2

        # flow normalization: log-compress the magnitude into [0, 1] and
        # re-apply the sign of the original flow component
        magnitude = torch.clamp(torch.log(torch.abs(flows) + 1) / math.log(50. + 1), max=1.)
        flows = torch.where(flows > 0, magnitude, -magnitude)

        # Spatially random sampling (sampling_size locations, shared by the batch);
        # the index tensor follows the device of the input instead of assuming CUDA
        flows_flatten = flows.reshape(flows.shape[0], 2, -1)
        sampled = random.sample(range(flows_flatten.shape[2]), self.sampling_size)
        random_locations = torch.tensor(sampled, dtype=torch.long, device=flows.device)
        flows_sample = torch.index_select(flows_flatten, 2, random_locations)

        # K_f: squared distance between every pair of sampled flow vectors
        k_f = self.epsilon + torch.norm(torch.unsqueeze(flows_sample, dim=-1).permute(0, 3, 2, 1) -
                                        torch.unsqueeze(flows_sample, dim=-1).permute(0, 2, 3, 1), p=2, dim=3,
                                        keepdim=False) ** 2
        exp_k_f = torch.exp(-k_f / 2. / self.sigma)

        # mask
        eye = torch.unsqueeze(torch.eye(k_f.shape[1], device=exp_k_f.device), dim=0)
        mask = torch.ones_like(exp_k_f) - eye

        # S_f
        masked_exp_k_f = torch.mul(mask, exp_k_f) + eye
        s_f = masked_exp_k_f / torch.sum(masked_exp_k_f, dim=1, keepdim=True)

        # column: N x coldim x sparsenum
        column = get_column(feats, random_locations, flows.shape[2])
        # nn.Linear acts on the last axis, so the channel axis is moved there
        # for the MLP and moved back afterwards: N x 16 x sparsenum
        embedding = self.mlp(column.transpose(1, 2)).transpose(1, 2)

        # K_theta: scaled cosine similarity between every pair of embeddings
        embedding_norm = torch.norm(embedding, p=2, dim=1, keepdim=True)
        k_theta = 0.25 * (torch.matmul(embedding.permute(0, 2, 1), embedding)) / (self.epsilon + torch.matmul(embedding_norm.permute(0, 2, 1), embedding_norm))
        exp_k_theta = torch.exp(k_theta)

        # S_theta
        masked_exp_k_theta = torch.mul(mask, exp_k_theta) + math.exp(-0.75) * eye
        s_theta = masked_exp_k_theta / torch.sum(masked_exp_k_theta, dim=1, keepdim=True)

        # loss
        loss = -torch.mean(torch.mul(s_f, torch.log(s_theta)))

        return loss


def print_info(name, var):
    '''
        Print `name` followed by the size, maximum, minimum and mean of the
        tensor `var`.
    '''
    # torch.max / min / mean without a dim argument return 0-dim tensors,
    # which cannot be indexed with [0]; .item() yields the Python scalar
    # whatever the device of `var` is.
    print(name, var.size(), torch.max(var).item(), torch.min(var).item(), torch.mean(var).item())


def MaskL1Loss(input, target, mask):
    '''
        Masked L1 distance between `input` and `target`.

        Sums |input * mask - target * mask| over all elements. When the mask
        sums to a positive value the result is divided by
        sum(mask) * input.size()[1]; otherwise the raw sum is returned.
    '''
    masked_gap = input * mask - target * mask
    res = masked_gap.abs().sum()
    active = mask.sum().item()
    if active > 0:
        return res / (active * input.size()[1])
    return res


================================================
FILE: mimicmotion/modules/cmp/models/__init__.py
================================================
from .single_stage_model import *
from .cmp import *
from . import modules
from . import backbone


================================================
FILE: mimicmotion/modules/cmp/models/backbone/__init__.py
================================================
from .resnet import *
from .alexnet import *


================================================
FILE: mimicmotion/modules/cmp/models/backbone/alexnet.py
================================================
import torch.nn as nn
import math

class AlexNetBN_FCN(nn.Module):
    """Fully convolutional AlexNet with batch normalization after every conv.

    The fully connected layers of AlexNet are expressed as convolutions
    (``fc6``, ``fc7``, ``conv8``), so the network maps an image to a spatial
    feature map with ``output_dim`` channels.

    Args:
        output_dim: number of channels produced by ``conv8``.
        stride: strides of ``conv1``, ``pool1``, ``pool2`` and ``pool5``.
        dilation: dilations of ``conv4`` and ``conv5``.
        padding: paddings of ``conv4`` and ``conv5``.
    """

    def __init__(self, output_dim=256, stride=(4, 2, 2, 2), dilation=(1, 1), padding=(1, 1)):
        super(AlexNetBN_FCN, self).__init__()
        BN = nn.BatchNorm2d

        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=11, stride=stride[0], padding=5),
            BN(96),
            nn.ReLU(inplace=True))
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=stride[1], padding=1)
        self.conv2 = nn.Sequential(
            nn.Conv2d(96, 256, kernel_size=5, padding=2),
            BN(256),
            nn.ReLU(inplace=True))
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=stride[2], padding=1)
        self.conv3 = nn.Sequential(
            nn.Conv2d(256, 384, kernel_size=3, padding=1),
            BN(384),
            nn.ReLU(inplace=True))
        self.conv4 = nn.Sequential(
            nn.Conv2d(384, 384, kernel_size=3, padding=padding[0], dilation=dilation[0]),
            BN(384),
            nn.ReLU(inplace=True))
        self.conv5 = nn.Sequential(
            nn.Conv2d(384, 256, kernel_size=3, padding=padding[1], dilation=dilation[1]),
            BN(256),
            nn.ReLU(inplace=True))
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=stride[3], padding=1)

        self.fc6 = nn.Sequential(
            nn.Conv2d(256, 4096, kernel_size=3, stride=1, padding=1),
            BN(4096),
            nn.ReLU(inplace=True))
        self.drop6 = nn.Dropout(0.5)
        self.fc7 = nn.Sequential(
            nn.Conv2d(4096, 4096, kernel_size=1, stride=1, padding=0),
            BN(4096),
            nn.ReLU(inplace=True))
        self.drop7 = nn.Dropout(0.5)
        self.conv8 = nn.Conv2d(4096, output_dim, kernel_size=1)

        # Re-initialize: uniform conv weights bounded by
        # sqrt(2 / (out_channels * kernel area)), zero conv biases,
        # identity batch-norm affine parameters.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                fan = m.out_channels * m.kernel_size[0] * m.kernel_size[1]
                scale = math.sqrt(2. / fan)
                m.weight.data.uniform_(-scale, scale)
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def forward(self, x, ret_feat=False):
        """Run the network on ``x`` and return the ``conv8`` feature map.

        Raises:
            NotImplementedError: if ``ret_feat`` is true; this backbone does
                not return intermediate features.
        """
        if ret_feat:
            # ``NotImplemented`` is a comparison sentinel, not an exception;
            # raising it produces a confusing TypeError.
            raise NotImplementedError(
                "AlexNetBN_FCN does not support ret_feat=True")
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.pool5(x)
        x = self.fc6(x)
        x = self.drop6(x)
        x = self.fc7(x)
        x = self.drop7(x)
        x = self.conv8(x)
        return x

def alexnet_fcn_32x(output_dim, pretrained=False, **kwargs):
    """Build an AlexNetBN_FCN with its default strides.

    No pretrained weights exist for this backbone, so ``pretrained`` must be
    false. Extra keyword arguments are forwarded to the constructor.
    """
    assert pretrained == False
    return AlexNetBN_FCN(output_dim=output_dim, **kwargs)

def alexnet_fcn_8x(output_dim, use_ppm=False, pretrained=False, **kwargs):
    """Build an AlexNetBN_FCN with strides [2, 2, 2, 1].

    No pretrained weights exist for this backbone, so ``pretrained`` must be
    false. ``use_ppm`` is accepted but not used. Extra keyword arguments are
    forwarded to the constructor.
    """
    assert pretrained == False
    reduced_strides = [2, 2, 2, 1]
    return AlexNetBN_FCN(output_dim=output_dim, stride=reduced_strides, **kwargs)


================================================
FILE: mimicmotion/modules/cmp/models/backbone/resnet.py
================================================
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo

# Normalization layer class used by BasicBlock, Bottleneck and ResNet below.
# It stays None until ResNet.__init__ assigns nn.BatchNorm2d to it.
BN = None


# Checkpoint locations read by the resnet* factory functions below when they
# are called with pretrained=True.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 convolution with padding 1 and the given stride."""
    layer = nn.Conv2d(in_planes, out_planes, 3, stride, 1, bias=False)
    return layer


class BasicBlock(nn.Module):
    """Residual unit built from two 3x3 convolutions.

    Each convolution is followed by the module-level ``BN`` layer. The input,
    passed through ``downsample`` when one is given, is added to the result
    before the final ReLU. ``expansion`` is the ratio between the output
    channels and ``planes``.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        # only the first convolution carries the stride
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = BN(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = BN(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # identity shortcut unless a projection was supplied
        shortcut = x
        if self.downsample is not None:
            shortcut = self.downsample(x)

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    """Residual unit with a 1x1 -> 3x3 -> 1x1 convolution stack.

    The 3x3 convolution carries the stride and the last 1x1 convolution
    widens the output to ``planes * 4`` channels. Every convolution is
    followed by the module-level ``BN`` layer. The input, passed through
    ``downsample`` when one is given, is added before the final ReLU.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        widened = planes * 4
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = BN(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = BN(planes)
        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = BN(widened)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # identity shortcut unless a projection was supplied
        shortcut = x
        if self.downsample is not None:
            shortcut = self.downsample(x)

        out += shortcut
        return self.relu(out)


class ResNet(nn.Module):
    """Dilated ResNet trunk with a 1x1 projection head.

    ``layer3`` and ``layer4`` are converted to dilated convolutions (rates 2
    and 4) with their strides removed, so the output keeps 1/8 of the input
    resolution. ``conv5`` projects the last stage to ``output_dim`` channels.

    Args:
        output_dim: number of channels produced by ``conv5``.
        block: residual block class; must expose ``expansion`` and accept
            ``(inplanes, planes, stride, downsample)``.
        layers: number of blocks in each of the four stages.
    """

    def __init__(self, output_dim, block, layers):
        # The residual blocks read the module-level ``BN`` when they are
        # constructed, so it has to be assigned before any stage is built.
        global BN

        BN = nn.BatchNorm2d

        super(ResNet, self).__init__()
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = BN(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # layer4 outputs 512 * expansion channels: 2048 for Bottleneck,
        # 512 for BasicBlock.
        self.conv5 = nn.Conv2d(512 * block.expansion, output_dim, kernel_size=1)

        ## dilation
        self._dilate_stage(self.layer3, 2)
        self._dilate_stage(self.layer4, 4)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    @staticmethod
    def _dilate_stage(stage, dilation):
        """Replace the stride of ``stage`` by dilation so resolution is kept."""
        for name, module in stage.named_modules():
            if 'conv2' in name:
                module.dilation = (dilation, dilation)
                module.padding = (dilation, dilation)
                module.stride = (1, 1)
            elif 'conv1' in name or 'downsample.0' in name:
                # BasicBlock carries the stage stride on conv1; for
                # Bottleneck conv1 already has stride 1, so this is a no-op.
                module.stride = (1, 1)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack ``blocks`` residual units; only the first one is strided."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # project the shortcut when the shape of the main path changes
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                BN(planes * block.expansion),
            )

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, img, ret_feat=False):
        """Return the ``conv5`` map, plus ``[img, conv1, layer1]`` if ``ret_feat``."""
        x = self.conv1(img) # 1/2
        x = self.bn1(x)
        conv1 = self.relu(x) # 1/2
        pool1 = self.maxpool(conv1) # 1/4

        layer1 = self.layer1(pool1) # 1/4
        layer2 = self.layer2(layer1) # 1/8
        layer3 = self.layer3(layer2) # 1/8
        layer4 = self.layer4(layer3) # 1/8
        out = self.conv5(layer4)

        if ret_feat:
            # channels: 3, 64, 64 * block.expansion
            return out, [img, conv1, layer1]
        else:
            return out

def resnet18(output_dim, pretrained=False):
    """Build the ResNet defined in this file with BasicBlock and [2, 2, 2, 2].

    When ``pretrained`` is true the checkpoint at ``model_urls['resnet18']``
    is loaded with ``strict=False``, as the deeper factories in this file do:
    the network here ends in its own ``conv5`` layer, which is presumably not
    part of the downloaded checkpoint and would make a strict load fail.
    """
    model = ResNet(output_dim, BasicBlock, [2, 2, 2, 2])
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']), strict=False)
    return model


def resnet34(output_dim, pretrained=False):
    """Build the ResNet defined in this file with BasicBlock and [3, 4, 6, 3].

    When ``pretrained`` is true the checkpoint at ``model_urls['resnet34']``
    is loaded with ``strict=False``, as the deeper factories in this file do:
    the network here ends in its own ``conv5`` layer, which is presumably not
    part of the downloaded checkpoint and would make a strict load fail.
    """
    model = ResNet(output_dim, BasicBlock, [3, 4, 6, 3])
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet34']), strict=False)
    return model


def resnet50(output_dim, pretrained=False):
    """Build the ResNet defined in this file with Bottleneck and [3, 4, 6, 3].

    When ``pretrained`` is true the checkpoint at ``model_urls['resnet50']``
    is loaded non-strictly.
    """
    net = ResNet(output_dim, Bottleneck, [3, 4, 6, 3])
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet50']), strict=False)
    return net

def resnet101(output_dim, pretrained=False):
    """Build the ResNet defined in this file with Bottleneck and [3, 4, 23, 3].

    When ``pretrained`` is true the checkpoint at ``model_urls['resnet101']``
    is loaded non-strictly.
    """
    net = ResNet(output_dim, Bottleneck, [3, 4, 23, 3])
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet101']), strict=False)
    return net


def resnet152(output_dim, pretrained=False):
    """Build the ResNet defined in this file with Bottleneck and [3, 8, 36, 3].

    When ``pretrained`` is true the checkpoint at ``model_urls['resnet152']``
    is loaded non-strictly.
    """
    net = ResNet(output_dim, Bottleneck, [3, 8, 36, 3])
    if not pretrained:
        return net
    net.load_state_dict(model_zoo.load_url(model_urls['resnet152']), strict=False)
    return net


================================================
FILE: mimicmotion/modules/cmp/models/cmp.py
================================================
import torch
import torch.nn as nn

import mimicmotion.modules.cmp.losses as losses
import mimicmotion.modules.cmp.utils as utils

from . import SingleStageModel

class CMP(SingleStageModel):
    """Flow-prediction model wrapper: picks the flow loss and runs eval/step.

    ``params['module']`` must provide ``flow_criterion`` ('L1', 'L2' or
    'DiscreteLoss') as well as ``nbins`` and ``fmax``.
    """

    def __init__(self, params, dist_model=False):
        super().__init__(params, dist_model)
        model_params = params['module']

        # define loss
        criterion_name = model_params['flow_criterion']
        if criterion_name == 'L1':
            criterion = nn.SmoothL1Loss()
        elif criterion_name == 'L2':
            criterion = nn.MSELoss()
        elif criterion_name == 'DiscreteLoss':
            criterion = losses.DiscreteLoss(
                nbins=model_params['nbins'], fmax=model_params['fmax'])
        else:
            raise Exception("No such flow loss: {}".format(model_params['flow_criterion']))
        self.flow_criterion = criterion

        self.fuser = utils.Fuser(nbins=model_params['nbins'],
                                 fmax=model_params['fmax'])
        self.model_params = model_params

    def eval(self, ret_loss=True):
        """Run the model without gradients and store the result in ``self.flow``.

        Returns the dict of output tensors and, when ``ret_loss`` is true,
        also a dict holding ``loss_flow``.
        """
        with torch.no_grad():
            prediction = self.model(self.image_input, self.sparse_input)

        # the discrete criterion predicts bins that the fuser turns into a flow
        if self.model_params['flow_criterion'] == "DiscreteLoss":
            self.flow = self.fuser.convert_flow(prediction)
        else:
            self.flow = prediction

        # bring the flow to the resolution of the input image
        if self.flow.shape[2] != self.image_input.shape[2]:
            self.flow = nn.functional.interpolate(
                self.flow, size=self.image_input.shape[2:4],
                mode="bilinear", align_corners=True)

        ret_tensors = {
            'flow_tensors': [self.flow, self.flow_target],
            'common_tensors': [],
            'rgb_tensors': []} # except for image_input

        if not ret_loss:
            return ret_tensors

        # the loss compares the raw prediction with the target at the
        # resolution of the target
        if prediction.shape[2] != self.flow_target.shape[2]:
            prediction = nn.functional.interpolate(
                prediction, size=self.flow_target.shape[2:4],
                mode="bilinear", align_corners=True)
        loss_flow = self.flow_criterion(prediction, self.flow_target) / self.world_size
        return ret_tensors, {'loss_flow': loss_flow}

    def step(self):
        """Run one optimization step and return ``{'loss_flow': loss}``."""
        prediction = self.model(self.image_input, self.sparse_input)
        loss_flow = self.flow_criterion(prediction, self.flow_target) / self.world_size

        self.optim.zero_grad()
        loss_flow.backward()
        utils.average_gradients(self.model)
        self.optim.step()

        return {'loss_flow': loss_flow}


================================================
FILE: mimicmotion/modules/cmp/models/modules/__init__.py
================================================
from .warp import *
from .others import *
from .shallownet import *
from .decoder import *
from .cmp import *


================================================
FILE: mimicmotion/modules/cmp/models/modules/cmp.py
================================================
import torch
import torch.nn as nn
import mimicmotion.modules.cmp.models as models


class CMP(nn.Module):
    """Image encoder + sparse-motion encoder feeding a flow decoder.

    The three sub-networks are looked up by name (``params['image_encoder']``,
    ``params['sparse_encoder']``, ``params['flow_decoder']``) in
    ``models.backbone`` and ``models.modules``.
    """

    def __init__(self, params):
        super().__init__()
        img_enc_dim = params['img_enc_dim']
        sparse_enc_dim = params['sparse_enc_dim']
        output_dim = params['output_dim']
        pretrained = params['pretrained_image_encoder']
        decoder_combo = params['decoder_combo']
        self.skip_layer = params['skip_layer']
        if self.skip_layer:
            # skip features are only consumed by this decoder
            assert params['flow_decoder'] == "MotionDecoderSkipLayer"

        image_encoder_cls = vars(models.backbone)[params['image_encoder']]
        self.image_encoder = image_encoder_cls(img_enc_dim, pretrained)

        sparse_encoder_cls = vars(models.modules)[params['sparse_encoder']]
        self.flow_encoder = sparse_encoder_cls(sparse_enc_dim)

        flow_decoder_cls = vars(models.modules)[params['flow_decoder']]
        self.flow_decoder = flow_decoder_cls(
            input_dim=img_enc_dim + sparse_enc_dim,
            output_dim=output_dim, combo=decoder_combo)

    def forward(self, image, sparse):
        """Decode a flow from the channel-wise concatenation of both encodings."""
        sparse_enc = self.flow_encoder(sparse)
        if not self.skip_layer:
            img_enc = self.image_encoder(image)
            return self.flow_decoder(torch.cat((img_enc, sparse_enc), dim=1))

        img_enc, skip_feat = self.image_encoder(image, ret_feat=True)
        return self.flow_decoder(torch.cat((img_enc, sparse_enc), dim=1), skip_feat)


================================================
FILE: mimicmotion/modules/cmp/models/modules/decoder.py
================================================
import torch
import torch.nn as nn
import math

class MotionDecoderPlain(nn.Module):
    """Multi-scale decoder: parallel conv branches fused by a 1x1 head.

    For every scale ``s`` in ``combo`` (allowed: 1, 2, 4, 8) a branch
    ``decoder<s>`` is created. Branches with ``s > 1`` first max-pool the
    input by ``s``; their outputs are resized back to the input resolution
    before all branches are concatenated and passed to ``head``.
    """

    def __init__(self, input_dim=512, output_dim=2, combo=[1,2,4]):
        super(MotionDecoderPlain, self).__init__()
        BN = nn.BatchNorm2d

        self.combo = combo
        for c in combo:
            assert c in [1,2,4,8], "invalid combo: {}".format(combo)

        # branches are always registered in increasing scale order
        for scale in (1, 2, 4, 8):
            if scale not in combo:
                continue
            stages = []
            if scale > 1:
                stages.append(nn.MaxPool2d(kernel_size=scale, stride=scale))
            stages.extend([
                nn.Conv2d(input_dim, 128, kernel_size=3, padding=1, stride=1),
                BN(128),
                nn.ReLU(inplace=True),
                nn.Conv2d(128, 128, kernel_size=3, padding=1),
                BN(128),
                nn.ReLU(inplace=True)])
            setattr(self, 'decoder{}'.format(scale), nn.Sequential(*stages))

        self.head = nn.Conv2d(128 * len(self.combo), output_dim, kernel_size=1, padding=0)

        # uniform conv weights bounded by sqrt(2 / (out_channels * kernel area)),
        # zero biases, identity batch-norm affine parameters
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                bound = math.sqrt(2. / (m.out_channels * m.kernel_size[0] * m.kernel_size[1]))
                m.weight.data.uniform_(-bound, bound)
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                if m.weight is not None:
                    m.weight.data.fill_(1)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        """Return the flow predicted from ``x`` at the resolution of ``x``."""
        target_size = (x.size(2), x.size(3))
        branches = []
        for scale in (1, 2, 4, 8):
            if scale not in self.combo:
                continue
            feat = getattr(self, 'decoder{}'.format(scale))(x)
            if scale != 1:
                # pooled branches are brought back to the input resolution
                feat = nn.functional.interpolate(
                    feat, size=target_size,
                    mode="bilinear", align_corners=True)
            branches.append(feat)

        return self.head(torch.cat(branches, dim=1))


class MotionDecoderSkipLayer(nn.Module):
    """Multi-scale decoder refined with two skip features of the image encoder.

    Four branches (``decoder1/2/4/8``, the last three max-pooling the input
    by 2, 4 and 8) are fused by ``fusion8``. The result is then upsampled
    twice, each time merged with a convolved skip feature, before ``head``
    predicts the flow. ``combo`` is accepted but not used.
    """

    @staticmethod
    def _conv_unit(in_ch, out_ch):
        """Layers of one 3x3 conv -> batch norm -> ReLU unit."""
        return [
            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        ]

    @classmethod
    def _branch(cls, input_dim, pool):
        """Three conv units with 128 channels, preceded by a max-pool if pool > 1."""
        layers = []
        if pool > 1:
            layers.append(nn.MaxPool2d(kernel_size=pool, stride=pool))
        for in_ch in (input_dim, 128, 128):
            layers.extend(cls._conv_unit(in_ch, 128))
        return nn.Sequential(*layers)

    def __init__(self, input_dim=512, output_dim=2, combo=[1,2,4,8]):
        super(MotionDecoderSkipLayer, self).__init__()

        self.decoder1 = self._branch(input_dim, 1)
        self.decoder2 = self._branch(input_dim, 2)
        self.decoder4 = self._branch(input_dim, 4)
        self.decoder8 = self._branch(input_dim, 8)

        # 4 branches x 128 channels
        self.fusion8 = nn.Sequential(*self._conv_unit(512, 256))

        self.skipconv4 = nn.Sequential(*self._conv_unit(256, 128))
        self.fusion4 = nn.Sequential(*self._conv_unit(256 + 128, 128))

        self.skipconv2 = nn.Sequential(*self._conv_unit(64, 32))
        self.fusion2 = nn.Sequential(*self._conv_unit(128 + 32, 64))

        self.head = nn.Conv2d(64, output_dim, kernel_size=1, padding=0)

        # uniform conv weights bounded by sqrt(2 / (out_channels * kernel area)),
        # zero biases, identity batch-norm affine parameters
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                bound = math.sqrt(2. / (m.out_channels * m.kernel_size[0] * m.kernel_size[1]))
                m.weight.data.uniform_(-bound, bound)
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                if m.weight is not None:
                    m.weight.data.fill_(1)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x, skip_feat):
        """Predict the flow at the resolution of the second skip feature.

        ``skip_feat`` holds three tensors; the first one is not used, the
        second must have 64 channels and the third 256 channels.
        """
        _, layer2, layer4 = skip_feat

        def resize(tensor, like):
            return nn.functional.interpolate(
                tensor, size=(like.size(2), like.size(3)),
                mode="bilinear", align_corners=True)

        # multi-scale branches, all brought to the resolution of decoder1
        x1 = self.decoder1(x)
        pyramid = [x1]
        for branch in (self.decoder2, self.decoder4, self.decoder8):
            pyramid.append(resize(branch(x), x1))
        f8 = self.fusion8(torch.cat(pyramid, dim=1))

        # merge with the 256-channel skip feature
        f4 = self.fusion4(torch.cat([resize(f8, layer4), self.skipconv4(layer4)], dim=1))

        # merge with the 64-channel skip feature
        f2 = self.fusion2(torch.cat([resize(f4, layer2), self.skipconv2(layer2)], dim=1))

        return self.head(f2)


class MotionDecoderFlowNet(nn.Module):
    """Multi-scale decoder followed by a FlowNet-style coarse-to-fine head.

    Four branches (``decoder1/2/4/8``, the last three max-pooling the input
    by 2, 4 and 8) are fused by ``fusion8``. A flow is then predicted at four
    resolutions; every level concatenates a skip feature, the transposed-conv
    upsampled features and the upsampled flow of the coarser level.
    ``combo`` is accepted but not used.
    """

    def __init__(self, input_dim=512, output_dim=2, combo=[1,2,4,8]):
        super(MotionDecoderFlowNet, self).__init__()
        global BN

        BN = nn.BatchNorm2d

        def conv_unit(in_ch, out_ch):
            # 3x3 conv -> batch norm -> ReLU
            return [
                nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
                BN(out_ch),
                nn.ReLU(inplace=True),
            ]

        def branch(pool):
            # three conv units, preceded by a max-pool when pool > 1
            layers = []
            if pool > 1:
                layers.append(nn.MaxPool2d(kernel_size=pool, stride=pool))
            for in_ch in (input_dim, 128, 128):
                layers.extend(conv_unit(in_ch, 128))
            return nn.Sequential(*layers)

        self.decoder1 = branch(1)
        self.decoder2 = branch(2)
        self.decoder4 = branch(4)
        self.decoder8 = branch(8)

        # 4 branches x 128 channels
        self.fusion8 = nn.Sequential(*conv_unit(512, 256))

        # flownet head
        self.predict_flow8 = predict_flow(256, output_dim)
        self.predict_flow4 = predict_flow(384 + output_dim, output_dim)
        self.predict_flow2 = predict_flow(192 + output_dim, output_dim)
        self.predict_flow1 = predict_flow(67 + output_dim, output_dim)

        self.upsampled_flow8_to_4 = nn.ConvTranspose2d(
            output_dim, output_dim, 4, 2, 1, bias=False)
        self.upsampled_flow4_to_2 = nn.ConvTranspose2d(
            output_dim, output_dim, 4, 2, 1, bias=False)
        self.upsampled_flow2_to_1 = nn.ConvTranspose2d(
            output_dim, output_dim, 4, 2, 1, bias=False)

        self.deconv8 = deconv(256, 128)
        self.deconv4 = deconv(384 + output_dim, 128)
        self.deconv2 = deconv(192 + output_dim, 64)

        # only nn.Conv2d and nn.BatchNorm2d instances are re-initialized here
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                bound = math.sqrt(2. / (m.out_channels * m.kernel_size[0] * m.kernel_size[1]))
                m.weight.data.uniform_(-bound, bound)
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                if m.weight is not None:
                    m.weight.data.fill_(1)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x, skip_feat):
        """Return ``[flow1, flow2, flow4, flow8]``, finest resolution first.

        ``skip_feat`` holds three tensors with 3, 64 and 256 channels.
        """
        layer1, layer2, layer4 = skip_feat # 3, 64, 256

        # propagation nets
        x1 = self.decoder1(x)
        base_size = (x1.size(2), x1.size(3))
        pyramid = [x1]
        for branch in (self.decoder2, self.decoder4, self.decoder8):
            pyramid.append(nn.functional.interpolate(
                branch(x), size=base_size,
                mode="bilinear", align_corners=True))
        feat8 = self.fusion8(torch.cat(pyramid, dim=1)) # 256

        # flownet head
        flow8 = self.predict_flow8(feat8)
        flow8_up = self.upsampled_flow8_to_4(flow8)
        out_deconv8 = self.deconv8(feat8) # 128

        concat4 = torch.cat((layer4, out_deconv8, flow8_up), dim=1) # 384 + out
        flow4 = self.predict_flow4(concat4)
        flow4_up = self.upsampled_flow4_to_2(flow4)
        out_deconv4 = self.deconv4(concat4) # 128

        concat2 = torch.cat((layer2, out_deconv4, flow4_up), dim=1) # 192 + out
        flow2 = self.predict_flow2(concat2)
        flow2_up = self.upsampled_flow2_to_1(flow2)
        out_deconv2 = self.deconv2(concat2) # 64

        concat1 = torch.cat((layer1, out_deconv2, flow2_up), dim=1) # 67 + out
        flow1 = self.predict_flow1(concat1)

        return [flow1, flow2, flow4, flow8]


def predict_flow(in_planes, out_planes):
    """Return a biased 3x3, stride-1, padding-1 ``nn.Conv2d`` mapping
    ``in_planes`` channels to ``out_planes`` channels."""
    conv_args = dict(kernel_size=3, stride=1, padding=1, bias=True)
    return nn.Conv2d(in_planes, out_planes, **conv_args)


def deconv(in_planes, out_planes):
    """Return a 2x upsampling block: a biased 4x4, stride-2, padding-1
    ``nn.ConvTranspose2d`` followed by an in-place ``LeakyReLU(0.1)``."""
    upsample = nn.ConvTranspose2d(
        in_planes, out_planes,
        kernel_size=4, stride=2, padding=1, bias=True)
    activation = nn.LeakyReLU(0.1, inplace=True)
    return nn.Sequential(upsample, activation)


================================================
FILE: mimicmotion/modules/cmp/models/modules/others.py
================================================
import torch.nn as nn

class FixModule(nn.Module):
    """Pass-through wrapper that stores ``m`` as ``self.module``.

    Calling the wrapper forwards all positional and keyword arguments to the
    wrapped object unchanged.
    NOTE(review): presumably exists so parameter names carry the same
    ``module.`` prefix as the distributed wrapper - confirm against callers.
    """

    def __init__(self, m):
        super().__init__()
        self.module = m

    def forward(self, *args, **kwargs):
        wrapped = self.module
        return wrapped(*args, **kwargs)


================================================
FILE: mimicmotion/modules/cmp/models/modules/shallownet.py
================================================
import torch.nn as nn
import math

class ShallowNet(nn.Module):
    """Two-convolution feature extractor with configurable downsampling.

    Layout: 5x5 conv (stride ``stride[0]``) -> BN -> ReLU -> max-pool
    (``stride[1]``) -> 3x3 conv -> BN -> ReLU -> average-pool (``stride[2]``).

    Args:
        input_dim: number of input channels.
        output_dim: number of output channels.
        stride: three downsampling factors, indexed as described above.
    """

    def __init__(self, input_dim=4, output_dim=16, stride=[2, 2, 2]):
        super(ShallowNet, self).__init__()
        # The module-level ``BN`` alias is assigned here exactly as before so
        # that anything reading it keeps working.
        global BN

        BN = nn.BatchNorm2d

        conv_stride, max_pool, avg_pool = stride[0], stride[1], stride[2]
        layers = [
            nn.Conv2d(input_dim, 16, kernel_size=5, stride=conv_stride, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=max_pool, stride=max_pool),
            nn.Conv2d(16, output_dim, kernel_size=3, padding=1),
            nn.BatchNorm2d(output_dim),
            nn.ReLU(inplace=True),
            nn.AvgPool2d(kernel_size=avg_pool, stride=avg_pool),
        ]
        self.features = nn.Sequential(*layers)

        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                # Uniform init whose bound is derived from
                # out_channels * kernel area (named ``fan_in`` originally).
                kernel_h, kernel_w = layer.kernel_size[0], layer.kernel_size[1]
                bound = math.sqrt(2. / (layer.out_channels * kernel_h * kernel_w))
                layer.weight.data.uniform_(-bound, bound)
                if layer.bias is not None:
                    layer.bias.data.zero_()
            elif isinstance(layer, nn.BatchNorm2d):
                if layer.weight is not None:
                    layer.weight.data.fill_(1)
                if layer.bias is not None:
                    layer.bias.data.zero_()

    def forward(self, x):
        return self.features(x)


def shallownet8x(output_dim):
    """Build a ``ShallowNet`` with strides ``[2, 2, 2]`` (8x total
    downsampling) and ``output_dim`` output channels."""
    return ShallowNet(output_dim=output_dim, stride=[2, 2, 2])

def shallownet32x(output_dim, **kwargs):
    """Build a ``ShallowNet`` with strides ``[2, 2, 8]`` (32x total
    downsampling) and ``output_dim`` output channels.

    Extra keyword arguments are accepted but ignored.
    """
    return ShallowNet(output_dim=output_dim, stride=[2, 2, 8])


================================================
FILE: mimicmotion/modules/cmp/models/modules/warp.py
================================================
import torch
import torch.nn as nn

class WarpingLayerBWFlow(nn.Module):
    """Backward-warp an image with a dense flow given in pixel units.

    For each output pixel the image is sampled (bilinear, zero padding) at
    ``pixel + flow``.
    """

    def __init__(self):
        super(WarpingLayerBWFlow, self).__init__()

    def forward(self, image, flow):
        """Warp ``image`` by ``flow``.

        Args:
            image: tensor indexed as (N, C, H, W).
            flow: tensor indexed as (N, 2, H, W); channel 0 is the horizontal
                and channel 1 the vertical displacement, in pixels.

        Returns:
            The warped image, on the same device as the inputs.
        """
        n, h, w = image.size(0), image.size(2), image.size(3)

        # Rescale pixel displacements to grid_sample's normalized coordinates,
        # where [-1, 1] spans the corner pixel centres (size - 1 pixels).
        norm_x = flow[:, 0:1, :, :] / ((flow.size(3) - 1.0) / 2.0)
        norm_y = flow[:, 1:2, :, :] / ((flow.size(2) - 1.0) / 2.0)
        normalized_flow = torch.cat([norm_x, norm_y], 1)

        # Build the identity grid directly on the flow's device/dtype instead
        # of forcing CUDA, so CPU tensors work as well.
        grid_x = torch.linspace(
            -1.0, 1.0, w, device=flow.device, dtype=flow.dtype).view(
            1, 1, 1, w).expand(n, 1, h, w)
        grid_y = torch.linspace(
            -1.0, 1.0, h, device=flow.device, dtype=flow.dtype).view(
            1, 1, h, 1).expand(n, 1, h, w)
        grid = torch.cat([grid_x, grid_y], 1)

        grid = (grid + normalized_flow).permute(0, 2, 3, 1)
        # align_corners=True matches the linspace(-1, 1) grid and the
        # (size - 1) / 2 normalization above; the library default (False)
        # would shift every sample by a sub-pixel amount.
        return torch.nn.functional.grid_sample(image, grid, align_corners=True)


class WarpingLayerFWFlow(nn.Module):
    """Forward-warp (splat) a 3-channel image along an integer-truncated flow.

    Every source pixel is copied to ``(y + int(flow_y), x + int(flow_x))``,
    clamped to the image bounds. Pixels are written in ascending order of
    squared flow magnitude.
    NOTE(review): the ordering looks intended to let larger motions overwrite
    smaller ones when targets collide; PyTorch does not formally guarantee the
    write order for duplicate indices - confirm if this matters.
    """

    def __init__(self):
        super(WarpingLayerFWFlow, self).__init__()
        # Retained for backward compatibility; buffers are no longer cached.
        self.initialized = False

    def forward(self, image, flow, ret_mask = False):
        """Splat ``image`` along ``flow``.

        Args:
            image: tensor indexed as (N, C, H, W); the first 3 channels are used.
            flow: tensor indexed as (N, 2, H, W); channel 0 horizontal,
                channel 1 vertical, truncated towards zero via ``.long()``.
            ret_mask: when true, also return a hole mask.

        Returns:
            A float32 (N, 3, H, W) warped image, or ``(warped, hole_mask)``
            where ``hole_mask`` is (N, 1, H, W) with 1 at pixels that received
            no source pixel and 0 elsewhere.
        """
        n, h, w = image.size(0), image.size(2), image.size(3)
        device = image.device

        # Fresh coordinate meshes and outputs on every call: a cache keyed on
        # ``h * w`` went stale for e.g. 4x6 -> 6x4 inputs, the mask buffer was
        # missing if the first call had ret_mask=False, and callers received
        # an internal buffer that the next call overwrote.
        meshx = torch.arange(w, device=device).view(1, 1, w).expand(
            n, h, w).reshape(n, -1)
        meshy = torch.arange(h, device=device).view(1, h, 1).expand(
            n, h, w).reshape(n, -1)
        warped_image = torch.zeros((n, 3, h, w), dtype=torch.float32, device=device)
        hole_mask = None
        if ret_mask:
            hole_mask = torch.ones((n, 1, h, w), dtype=torch.float32, device=device)

        magnitude = (flow[:, 0, :, :] ** 2 + flow[:, 1, :, :] ** 2).reshape(n, -1)
        _, sortidx = torch.sort(magnitude, dim=1)

        warped_meshx = meshx + flow[:, 0, :, :].long().reshape(n, -1)
        warped_meshy = meshy + flow[:, 1, :, :].long().reshape(n, -1)

        warped_meshx = torch.clamp(warped_meshx, 0, w - 1)
        warped_meshy = torch.clamp(warped_meshy, 0, h - 1)

        for i in range(n):
            order = sortidx[i]
            dst_y = warped_meshy[i][order]
            dst_x = warped_meshx[i][order]
            src_y = meshy[i][order]
            src_x = meshx[i][order]
            for c in range(3):
                warped_image[i, c, dst_y, dst_x] = image[i, c, src_y, src_x]
            if ret_mask:
                hole_mask[i, 0, warped_meshy[i], warped_meshx[i]] = 0.
        if ret_mask:
            return warped_image, hole_mask
        return warped_image


================================================
FILE: mimicmotion/modules/cmp/models/single_stage_model.py
================================================
import os
import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist

import mimicmotion.modules.cmp.models as models
import mimicmotion.modules.cmp.utils as utils


class SingleStageModel(object):
    """Bundle a network, its optimizer and checkpoint helpers.

    ``eval`` and ``step`` are empty placeholders in this class.
    """

    def __init__(self, params, dist_model=False):
        """Build the network named by ``params['module']['arch']``, move it to
        CUDA, wrap it, and create the optimizer selected by ``params['optim']``
        (``'SGD'`` or ``'Adam'``)."""
        model_params = params['module']
        builder = models.modules.__dict__[params['module']['arch']]
        self.model = builder(model_params)
        utils.init_weights(self.model, init_type='xavier')
        self.model.cuda()

        if not dist_model:
            self.model = models.modules.FixModule(self.model)
            self.world_size = 1
        else:
            self.model = utils.DistModule(self.model)
            self.world_size = dist.get_world_size()

        optim_name = params['optim']
        if optim_name == 'SGD':
            self.optim = torch.optim.SGD(
                self.model.parameters(), lr=params['lr'],
                momentum=0.9, weight_decay=0.0001)
        elif optim_name == 'Adam':
            self.optim = torch.optim.Adam(
                self.model.parameters(), lr=params['lr'],
                betas=(params['beta1'], 0.999))
        else:
            raise Exception("No such optimizer: {}".format(optim_name))

        cudnn.benchmark = True

    def set_input(self, image_input, sparse_input, flow_target=None, rgb_target=None):
        """Store the current batch on the instance."""
        self.image_input, self.sparse_input = image_input, sparse_input
        self.flow_target, self.rgb_target = flow_target, rgb_target

    def eval(self, ret_loss=True):
        """Placeholder; does nothing in this class."""
        pass

    def step(self):
        """Placeholder; does nothing in this class."""
        pass

    def load_state(self, path, Iter, resume=False):
        """Load ``<path>/ckpt_iter_<Iter>.pth.tar``; the optimizer state is
        restored as well only when ``resume`` is true."""
        ckpt_file = os.path.join(path, "ckpt_iter_{}.pth.tar".format(Iter))
        extra_args = (self.optim,) if resume else ()
        utils.load_state(ckpt_file, self.model, *extra_args)

    def load_pretrain(self, load_path):
        """Load model weights (no optimizer state) from ``load_path``."""
        utils.load_state(load_path, self.model)

    def save_state(self, path, Iter):
        """Write step, model state and optimizer state to
        ``<path>/ckpt_iter_<Iter>.pth.tar``."""
        ckpt_file = os.path.join(path, "ckpt_iter_{}.pth.tar".format(Iter))
        snapshot = {
            'step': Iter,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optim.state_dict()}
        torch.save(snapshot, ckpt_file)

    def switch_to(self, phase):
        """Put the model in training mode for ``'train'``, eval mode otherwise."""
        if phase != 'train':
            self.model.eval()
            return
        self.model.train()


================================================
FILE: mimicmotion/modules/cmp/utils/__init__.py
================================================
from .common_utils import *
from .data_utils import *
from .distributed_utils import *
from .visualize_utils import *
from .scheduler import *
from . import flowlib


================================================
FILE: mimicmotion/modules/cmp/utils/common_utils.py
================================================
import os
import logging
import numpy as np

import torch
from torch.nn import init

def init_weights(net, init_type='normal', init_gain=0.02):
    """Initialize network weights.
    Parameters:
        net (network)   -- network to be initialized
        init_type (str) -- the name of an initialization method: normal | xavier | kaiming | orthogonal
        init_gain (float)    -- scaling factor for normal, xavier and orthogonal.
    We use 'normal' in the original pix2pix and CycleGAN paper. But xavier and kaiming might
    work better for some applications. Feel free to try yourself.

    Modules whose class name contains 'Conv' or 'Linear' get the selected
    scheme on their weight and a zero bias. 'BatchNorm2d' modules get
    N(1, init_gain) weights and zero bias; parameters that are ``None``
    (e.g. ``affine=False``) are skipped instead of raising.

    Raises:
        NotImplementedError: if ``init_type`` is unknown and the network
            contains at least one Conv/Linear module.
    """
    def init_func(m):  # define the initialization function
        classname = m.__class__.__name__
        weight = getattr(m, 'weight', None)
        bias = getattr(m, 'bias', None)
        if weight is not None and ('Conv' in classname or 'Linear' in classname):
            if init_type == 'normal':
                init.normal_(weight.data, 0.0, init_gain)
            elif init_type == 'xavier':
                init.xavier_normal_(weight.data, gain=init_gain)
            elif init_type == 'kaiming':
                init.kaiming_normal_(weight.data, a=0, mode='fan_in')
            elif init_type == 'orthogonal':
                init.orthogonal_(weight.data, gain=init_gain)
            else:
                raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
            if bias is not None:
                init.constant_(bias.data, 0.0)
        elif 'BatchNorm2d' in classname:  # BatchNorm Layer's weight is not a matrix; only normal distribution applies.
            # Non-affine batch norm has no learnable weight/bias to initialize.
            if weight is not None:
                init.normal_(weight.data, 1.0, init_gain)
            if bias is not None:
                init.constant_(bias.data, 0.0)

    net.apply(init_func)  # apply the initialization function <init_func>

def create_logger(name, log_file, level=logging.INFO):
    """Return the logger ``name`` after attaching a file handler for
    ``log_file`` and a stream handler, both formatted as
    ``[timestamp] message``, and setting its level to ``level``.

    Handlers are added on every call, so calling this twice with the same
    name attaches a second pair.
    """
    logger = logging.getLogger(name)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')
    handlers = [logging.FileHandler(log_file), logging.StreamHandler()]
    for handler in handlers:
        handler.setFormatter(formatter)
    logger.setLevel(level)
    for handler in handlers:
        logger.addHandler(handler)
    return logger

class AverageMeter(object):
    """Track the latest value and an average of the values seen.

    With ``length > 0`` the average is taken over the last ``length`` values
    (kept in ``history``); otherwise it is the running mean of everything
    seen since the last reset (``sum`` / ``count``).
    """

    def __init__(self, length=0):
        self.length = length
        self.reset()

    def reset(self):
        """Clear accumulated statistics."""
        self.val = 0.0
        self.avg = 0.0
        if self.length > 0:
            self.history = []
            return
        self.count = 0
        self.sum = 0.0

    def update(self, val):
        """Record ``val`` and refresh ``val`` / ``avg``."""
        if not self.length > 0:
            # Unbounded mode: plain running mean.
            self.val = val
            self.sum += val
            self.count += 1
            self.avg = self.sum / self.count
            return

        # Windowed mode: keep only the most recent ``length`` values.
        window = self.history
        window.append(val)
        if len(window) > self.length:
            window.pop(0)
        self.val = window[-1]
        self.avg = np.mean(window)
            
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k

    Args:
        output: score tensor indexed as (batch, classes).
        target: tensor of ground-truth class indices, one per batch item.
        topk: iterable of k values.

    Returns:
        A list with one single-element tensor per k, holding the percentage
        of samples whose target is among the k highest-scoring classes.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.reshape(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # ``correct`` derives from a transposed tensor, so its first k rows
        # are not guaranteed contiguous; reshape copies when a view is
        # impossible, whereas view() raises.
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

def load_state(path, model, optimizer=None):
    """Load a checkpoint into ``model`` (non-strict) and optionally ``optimizer``.

    The checkpoint is expected to hold ``'state_dict'`` and ``'step'``, plus
    ``'optimizer'`` when an optimizer is passed.

    Args:
        path: checkpoint file path.
        model: module receiving ``checkpoint['state_dict']`` with ``strict=False``.
        optimizer: optional optimizer receiving ``checkpoint['optimizer']``.

    Returns:
        ``checkpoint['step']``, or ``None`` (after printing a notice) when
        ``path`` is not an existing file.
    """
    def map_func(storage, location):
        # Keep loading onto the GPU as before, but fall back to the storage
        # as-is on CPU-only hosts instead of raising.
        if torch.cuda.is_available():
            return storage.cuda()
        return storage

    if not os.path.isfile(path):
        print("=> no checkpoint found at '{}'".format(path))
        return None

    print("=> loading checkpoint '{}'".format(path))
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(path, map_location=map_func)
    model.load_state_dict(checkpoint['state_dict'], strict=False)

    last_iter = checkpoint['step']
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> also loaded optimizer from checkpoint '{}' (iter {})"
              .format(path, last_iter))
    return last_iter


================================================
FILE: mimicmotion/modules/cmp/utils/data_utils.py
================================================
from PIL import Image, ImageOps
import scipy.ndimage as ndimage
import cv2
import random
import numpy as np
from scipy.ndimage.filters import maximum_filter
from scipy import signal
cv2.ocl.setUseOpenCL(False)

def get_edge(data, blur=False):
    """Return the Sobel gradient magnitude of ``data`` summed over channels.

    Args:
        data: array indexed as (H, W, C).
        blur: when true, apply a 3x3 Gaussian blur (sigma 1) first.

    Returns:
        An (H, W) array: the per-channel ``sqrt(gx^2 + gy^2)`` maps added up.
    """
    if blur:
        data = cv2.GaussianBlur(data, (3, 3), 1.)
    sobel_x = np.array([[1, 0, -1], [2, 0, -2], [1, 0, -1]]).astype(np.float32)
    sobel_y = sobel_x.T

    def _gradient_magnitude(channel):
        gx = signal.convolve2d(channel, sobel_x, boundary='symm', mode='same')
        gy = signal.convolve2d(channel, sobel_y, boundary='symm', mode='same')
        return np.sqrt(gx**2 + gy**2)

    return sum(_gradient_magnitude(data[:, :, k]) for k in range(data.shape[2]))

def get_max(score, bbox):
    """Return the maximum of ``score`` inside ``bbox``.

    ``bbox`` is read as ``(row_start, row_end, col_start, col_end)`` and is
    clipped to the array bounds before slicing.
    """
    rows = slice(max(0, bbox[0]), min(score.shape[0], bbox[1]))
    cols = slice(max(0, bbox[2]), min(score.shape[1], bbox[3]))
    return score[rows, cols].max()

def nms(score, ks):
    """Non-maximum suppression with a ``ks`` x ``ks`` window.

    Returns a copy of ``score`` in which every entry strictly smaller than
    the maximum of its neighbourhood is set to zero. ``ks`` must be odd.
    """
    assert ks % 2 == 1
    neighbourhood_max = maximum_filter(score, footprint=np.ones((ks, ks)))
    suppressed = score.copy()
    suppressed[score < neighbourhood_max] = 0.
    return suppressed

def image_flow_crop(img1, img2, flow, crop_size, phase):
    """Crop an image pair and its flow to ``crop_size`` = (height, width).

    Inputs smaller than the crop are first zero-padded (centred). The crop
    offset is random when ``phase == 'train'`` and centred otherwise.

    Returns:
        ``(img1, img2, flow, (hoff, woff))`` where the offset is the crop's
        top-left corner in the (possibly padded) image.
    """
    assert len(crop_size) == 2
    crop_h, crop_w = crop_size[0], crop_size[1]

    pad_h = max(crop_h - img1.height, 0)
    pad_w = max(crop_w - img1.width, 0)
    top = int(pad_h / 2)
    left = int(pad_w / 2)
    if pad_h > 0 or pad_w > 0:
        padded_flow = np.zeros(
            (img1.height + pad_h, img1.width + pad_w, 2), dtype=np.float32)
        padded_flow[top:top + img1.height, left:left + img1.width, :] = flow
        flow = padded_flow
        # PIL border order: left, top, right, bottom.
        border = (left, top, pad_w - left, pad_h - top)
        img1 = ImageOps.expand(img1, border=border, fill=(0, 0, 0))
        img2 = ImageOps.expand(img2, border=border, fill=(0, 0, 0))

    slack_h = img1.height - crop_h
    slack_w = img1.width - crop_w
    if phase == 'train':
        hoff = int(np.random.rand() * slack_h)
        woff = int(np.random.rand() * slack_w)
    else:
        hoff = slack_h // 2
        woff = slack_w // 2

    box = (woff, hoff, woff + crop_w, hoff + crop_h)
    img1 = img1.crop(box)
    img2 = img2.crop(box)
    flow = flow[hoff:hoff + crop_h, woff:woff + crop_w, :]
    return img1, img2, flow, (hoff, woff)

def image_crop(img, crop_size):
    """Centre-crop ``img`` to ``crop_size`` = (height, width), zero-padding
    first when the image is smaller than the crop.

    Returns:
        ``(cropped_image, (pad_left, pad_top))``.
    """
    crop_h, crop_w = crop_size[0], crop_size[1]
    pad_h = max(crop_h - img.height, 0)
    pad_w = max(crop_w - img.width, 0)
    pad_top = int(pad_h / 2)
    pad_left = int(pad_w / 2)
    if pad_h > 0 or pad_w > 0:
        # PIL border order: left, top, right, bottom.
        border = (pad_left, pad_top, pad_w - pad_left, pad_h - pad_top)
        img = ImageOps.expand(img, border=border, fill=(0, 0, 0))
    hoff = (img.height - crop_h) // 2
    woff = (img.width - crop_w) // 2
    cropped = img.crop((woff, hoff, woff + crop_w, hoff + crop_h))
    return cropped, (pad_left, pad_top)

def image_flow_resize(img1, img2, flow, short_size=None, long_size=None):
    """Resize an image pair and its flow, preserving aspect ratio.

    Exactly one of ``short_size`` / ``long_size`` must be given; it fixes the
    length of the shorter / longer image side respectively.

    Returns:
        ``(img1, img2, flow, ratio)``. ``ratio`` is ``new_height / height``
        and is the factor applied to both flow components.
    """
    assert (short_size is None) ^ (long_size is None)
    w, h = img1.width, img1.height
    portrait = w < h
    if short_size is not None:
        if portrait:
            neww, newh = short_size, int(short_size / float(w) * h)
        else:
            neww, newh = int(short_size / float(h) * w), short_size
    elif portrait:
        neww, newh = int(long_size / float(h) * w), long_size
    else:
        neww, newh = long_size, int(long_size / float(w) * h)

    target = (neww, newh)
    img1 = img1.resize(target, Image.BICUBIC)
    img2 = img2.resize(target, Image.BICUBIC)
    ratio = float(newh) / h
    # Flow vectors are in pixels, so they are rescaled along with the image.
    flow = cv2.resize(flow.copy(), target, interpolation=cv2.INTER_LINEAR) * ratio
    return img1, img2, flow, ratio

def image_resize(img, short_size=None, long_size=None):
    """Resize ``img`` with bicubic interpolation, preserving aspect ratio.

    Exactly one of ``short_size`` / ``long_size`` must be given; it fixes the
    length of the shorter / longer image side respectively.

    Returns:
        ``(resized_image, [original_width, original_height])``.
    """
    assert (short_size is None) ^ (long_size is None)
    w, h = img.width, img.height
    portrait = w < h
    if short_size is not None:
        if portrait:
            neww, newh = short_size, int(short_size / float(w) * h)
        else:
            neww, newh = int(short_size / float(h) * w), short_size
    elif portrait:
        neww, newh = int(long_size / float(h) * w), long_size
    else:
        neww, newh = long_size, int(long_size / float(w) * h)
    resized = img.resize((neww, newh), Image.BICUBIC)
    return resized, [w, h]


def image_pose_crop(img, posemap, crop_size, scale):
    """Centre-crop ``img`` to ``crop_size`` = (height, width) and crop
    ``posemap`` to the matching region at ``1/scale`` resolution.

    The crop must not exceed the image size (asserted).

    Returns:
        ``(cropped_image, cropped_posemap)``.
    """
    assert len(crop_size) == 2
    crop_h, crop_w = crop_size[0], crop_size[1]
    assert crop_h <= img.height
    assert crop_w <= img.width
    hoff = (img.height - crop_h) // 2
    woff = (img.width - crop_w) // 2
    img = img.crop((woff, hoff, woff + crop_w, hoff + crop_h))
    # Same window expressed in posemap coordinates.
    row0, col0 = hoff // scale, woff // scale
    posemap = posemap[row0:row0 + crop_h // scale, col0:col0 + crop_w // scale, :]
    return img, posemap

def neighbor_elim(ph, pw, d):
    """Thin out points that lie closer than ``d`` to each other on both axes.

    For every pair of still-valid points whose row distance and column
    distance are both below ``d``, one of the two (chosen at random) is
    dropped.

    Args:
        ph: 1-D array of row coordinates.
        pw: 1-D array of column coordinates (same length as ``ph``).
        d: distance threshold.

    Returns:
        ``(ph_kept, pw_kept)``.
    """
    # ``np.int`` was removed from NumPy; the builtin int is the same dtype.
    valid = np.ones(len(ph), dtype=int)
    # Pairwise absolute differences via broadcasting.
    h_dist = np.fabs(ph[:, np.newaxis] - ph[np.newaxis, :])
    w_dist = np.fabs(pw[:, np.newaxis] - pw[np.newaxis, :])
    idx1, idx2 = np.where((h_dist < d) & (w_dist < d))
    for i, j in zip(idx1, idx2):
        if valid[i] and valid[j] and i != j:
            if np.random.rand() > 0.5:
                valid[i] = 0
            else:
                valid[j] = 0
    valid_idx = np.where(valid == 1)
    return ph[valid_idx], pw[valid_idx]

def remove_border(mask):
    """Zero the first/last row and first/last column of ``mask`` in place."""
    height, width = mask.shape[0], mask.shape[1]
    for row in (0, height - 1):
        mask[row, :] = 0
    for col in (0, width - 1):
        mask[:, col] = 0

def flow_sampler(flow, strategy=['grid'], bg_ratio=1./6400, nms_ks=15, max_num_guide=-1, guidepoint=None):
    """Sample sparse guidance points from a dense flow field.

    Args:
        flow: array indexed as (H, W, 2).
        strategy: list of sampling strategies to combine, each one of
            'grid', 'uniform', 'gradnms', 'watershed', 'single', 'full',
            'specified'. 'full' returns the whole flow immediately.
        bg_ratio: sampling density used by 'grid' and 'uniform'.
        nms_ks: NMS window size used by 'gradnms' and 'watershed'.
        max_num_guide: cap on the number of points kept (-1 for no cap).
        guidepoint: points for 'specified'; column 0 is used as the width
            index and column 1 as the height index.

    Returns:
        ``(sparse, mask)``: ``sparse`` has the flow values at the selected
        pixels and zeros elsewhere; ``mask`` is an integer array of the same
        shape with 1 at the selected pixels. If no strategy yields any point
        both are all zeros.
    """
    assert bg_ratio >= 0 and bg_ratio <= 1, "sampling ratio must be in (0, 1]"
    for s in strategy:
        assert s in ['grid', 'uniform', 'gradnms', 'watershed', 'single', 'full', 'specified'], "No such strategy: {}".format(s)
    h = flow.shape[0]
    w = flow.shape[1]
    ds = max(1, max(h, w) // 400)  # reduce computation

    if 'full' in strategy:
        sparse = flow.copy()
        # ``np.int`` was removed from NumPy; builtin int is the same dtype.
        mask = np.ones(flow.shape, dtype=int)
        return sparse, mask

    pts_h = []
    pts_w = []
    if 'grid' in strategy:
        stride = int(np.sqrt(1./bg_ratio))
        # Centre the lattice inside the image.
        mesh_start_h = int((h - h // stride * stride) / 2)
        mesh_start_w = int((w - w // stride * stride) / 2)
        mesh = np.meshgrid(np.arange(mesh_start_h, h, stride), np.arange(mesh_start_w, w, stride))
        pts_h.append(mesh[0].ravel())
        pts_w.append(mesh[1].ravel())
    if 'uniform' in strategy:
        pts_h.append(np.random.randint(0, h, int(bg_ratio * h * w)))
        pts_w.append(np.random.randint(0, w, int(bg_ratio * h * w)))
    if "gradnms" in strategy:
        ks = w // ds // 20
        edge = get_edge(flow[::ds,::ds,:])
        kernel = np.ones((ks, ks), dtype=np.float32) / (ks * ks)
        # Parenthesised so the box filter is normalised by its own area; the
        # score below divides by subscore.max(), so the scale cancels anyway.
        subkernel = np.ones((ks//2, ks//2), dtype=np.float32) / ((ks//2) * (ks//2))
        score = signal.convolve2d(edge, kernel, boundary='symm', mode='same')
        subscore = signal.convolve2d(edge, subkernel, boundary='symm', mode='same')
        score = score / score.max() - subscore / subscore.max()
        nms_res = nms(score, nms_ks)
        pth, ptw = np.where(nms_res > 0.1)
        pts_h.append(pth * ds)
        pts_w.append(ptw * ds)
    if "watershed" in strategy:
        edge = get_edge(flow[::ds,::ds,:])
        edge /= max(edge.max(), 0.01)
        edge = (edge > 0.1).astype(np.float32)
        watershed = ndimage.distance_transform_edt(1-edge)
        nms_res = nms(watershed, nms_ks)
        remove_border(nms_res)
        pth, ptw = np.where(nms_res > 0)
        pth, ptw = neighbor_elim(pth, ptw, (nms_ks-1)/2)
        pts_h.append(pth * ds)
        pts_w.append(ptw * ds)
    if "single" in strategy:
        pth, ptw = np.where((flow[:,:,0] != 0) | (flow[:,:,1] != 0))
        # An all-zero flow has no candidate pixel; contribute nothing rather
        # than letting randint(0) raise.
        if len(pth) > 0:
            randidx = np.random.randint(len(pth))
            pts_h.append(pth[randidx:randidx+1])
            pts_w.append(ptw[randidx:randidx+1])
    if 'specified' in strategy:
        assert guidepoint is not None, "if using \"specified\", switch \"with_info\" on."
        pts_h.append(guidepoint[:,1])
        pts_w.append(guidepoint[:,0])

    sparse = np.zeros(flow.shape, dtype=flow.dtype)
    mask = np.zeros(flow.shape, dtype=int)
    if not pts_h:
        # No strategy produced points: empty guidance.
        return sparse, mask

    pts_h = np.concatenate(pts_h)
    pts_w = np.concatenate(pts_w)

    if max_num_guide == -1:
        max_num_guide = np.inf

    # Keep a random subset of at most ``max_num_guide`` points.
    randsel = np.random.permutation(len(pts_h))
    num_keep = int(max(0, min(max_num_guide, len(randsel))))
    selidx = randsel[:num_keep]
    pts_h = pts_h[selidx]
    pts_w = pts_w[selidx]

    sparse[:, :, 0][(pts_h, pts_w)] = flow[:, :, 0][(pts_h, pts_w)]
    sparse[:, :, 1][(pts_h, pts_w)] = flow[:, :, 1][(pts_h, pts_w)]

    mask[:,:,0][(pts_h, pts_w)] = 1
    mask[:,:,1][(pts_h, pts_w)] = 1
    return sparse, mask

def image_flow_aug(img1, img2, flow, flip_horizon=True):
    """Randomly (probability 0.5) mirror an image pair and its flow
    left-to-right when ``flip_horizon`` is set.

    On a flip the flow is mirrored along its width axis and its horizontal
    component is negated. Returns ``(img1, img2, flow)``.
    """
    if not (flip_horizon and random.random() < 0.5):
        return img1, img2, flow

    img1 = img1.transpose(Image.FLIP_LEFT_RIGHT)
    img2 = img2.transpose(Image.FLIP_LEFT_RIGHT)
    mirrored = flow[:, ::-1, :].copy()
    mirrored[:, :, 0] = -mirrored[:, :, 0]
    return img1, img2, mirrored

def flow_aug(flow, reverse=True, scale=True, rotate=True):
    """Randomly augment a flow field indexed as (H, W, 2).

    Args:
        flow: flow array; it is never modified in place.
        reverse: negate the flow with probability 0.5.
        scale: multiply the flow by a factor drawn from U(0.5, 2.0).
        rotate: with probability 0.5, rotate every vector by one common
            angle drawn from U(0, 2*pi).

    Returns:
        The augmented flow.
    """
    if reverse:
        if random.random() < 0.5:
            flow = -flow
    if scale:
        rand_scale = random.uniform(0.5, 2.0)
        flow = flow * rand_scale
    if rotate and random.random() < 0.5:
        u, v = flow[:, :, 0], flow[:, :, 1]
        length = np.sqrt(np.square(u) + np.square(v))
        # arctan2 keeps the quadrant and is defined for zero vectors;
        # arctan(v / u) folded the left half-plane onto the right one and
        # produced NaN wherever u == v == 0.
        alpha = np.arctan2(v, u)
        theta = random.uniform(0, np.pi*2)
        # Write into a copy so the caller's array is left untouched.
        rotated = flow.copy()
        rotated[:, :, 0] = length * np.cos(alpha + theta)
        rotated[:, :, 1] = length * np.sin(alpha + theta)
        flow = rotated
    return flow

def draw_gaussian(img, pt, sigma, type='Gaussian'):
    """Stamp a 2-D bump centred at ``pt`` into ``img`` (in place).

    Args:
        img: 2-D array, modified in place and also returned.
        pt: centre as ``(x, y)``.
        sigma: spread; the stamp covers ``6 * sigma + 1`` pixels per side.
        type: ``'Gaussian'`` (peak value 1) or ``'Cauchy'``.

    Returns:
        ``img``. It is returned unchanged when the stamp lies entirely
        outside the image.

    Raises:
        ValueError: if ``type`` is not one of the supported kernels.
    """
    # Check that any part of the gaussian is in-bounds
    ul = [int(pt[0] - 3 * sigma), int(pt[1] - 3 * sigma)]
    br = [int(pt[0] + 3 * sigma + 1), int(pt[1] + 3 * sigma + 1)]
    if (ul[0] >= img.shape[1] or ul[1] >= img.shape[0] or
            br[0] < 0 or br[1] < 0):
        # If not, just return the image as is
        return img

    # Generate gaussian
    size = 6 * sigma + 1
    x = np.arange(0, size, 1, float)
    y = x[:, np.newaxis]
    x0 = y0 = size // 2
    # The gaussian is not normalized, we want the center value to equal 1
    if type == 'Gaussian':
        g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2))
    elif type == 'Cauchy':
        g = sigma / (((x - x0) ** 2 + (y - y0) ** 2 + sigma ** 2) ** 1.5)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("No such kernel type: {}".format(type))

    # Usable gaussian range
    g_x = max(0, -ul[0]), min(br[0], img.shape[1]) - ul[0]
    g_y = max(0, -ul[1]), min(br[1], img.shape[0]) - ul[1]
    # Image range
    img_x = max(0, ul[0]), min(br[0], img.shape[1])
    img_y = max(0, ul[1]), min(br[1], img.shape[0])

    img[img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[0]:g_y[1], g_x[0]:g_x[1]]
    return img


================================================
FILE: mimicmotion/modules/cmp/utils/distributed_utils.py
================================================
import os
import subprocess
import numpy as np
import multiprocessing as mp
import math

import torch
import torch.distributed as dist
from torch.utils.data.sampler import Sampler
from torch.nn import Module

class DistModule(Module):
    """Wrapper that broadcasts the wrapped module's state from rank 0 on
    construction and otherwise forwards calls to it."""

    def __init__(self, module):
        super(DistModule, self).__init__()
        self.module = module
        # Make every process start from rank 0's parameters and buffers.
        broadcast_params(self.module)

    def forward(self, *inputs, **kwargs):
        return self.module(*inputs, **kwargs)

    def train(self, mode=True):
        """Set training mode on the wrapper and the wrapped module.

        Returns ``self`` like ``nn.Module.train`` does, so that chained calls
        and ``eval()`` (which returns ``self.train(False)``) work.
        """
        super(DistModule, self).train(mode)
        self.module.train(mode)
        return self

def average_gradients(model):
    """All-reduce the gradient of every trainable parameter of ``model``.

    Only ``dist.all_reduce`` is applied; no division by the number of
    processes happens here.
    NOTE(review): with the default reduce op this looks like a sum rather
    than a mean - confirm that callers pre-scale the loss.
    """
    trainable = (p for p in model.parameters() if p.requires_grad)
    for p in trainable:
        dist.all_reduce(p.grad.data)

def broadcast_params(model):
    """Broadcast every tensor in ``model.state_dict()`` from rank 0."""
    source_rank = 0
    state = model.state_dict()
    for tensor in state.values():
        dist.broadcast(tensor, source_rank)

def dist_init(launcher, backend='nccl', **kwargs):
    """Initialise distributed training for the given ``launcher``.

    Sets the multiprocessing start method to ``'spawn'`` if none is set yet,
    then dispatches to the launcher-specific initialiser.

    Args:
        launcher: ``'pytorch'``, ``'mpi'`` or ``'slurm'``.
        backend: process-group backend name.
        **kwargs: forwarded to the launcher-specific initialiser.

    Raises:
        ValueError: for an unknown ``launcher``.
    """
    start_method_unset = mp.get_start_method(allow_none=True) is None
    if start_method_unset:
        mp.set_start_method('spawn')

    if launcher not in ('pytorch', 'mpi', 'slurm'):
        raise ValueError('Invalid launcher type: {}'.format(launcher))

    if launcher == 'pytorch':
        _init_dist_pytorch(backend, **kwargs)
    elif launcher == 'mpi':
        _init_dist_mpi(backend, **kwargs)
    else:
        _init_dist_slurm(backend, **kwargs)

def _init_dist_pytorch(backend, **kwargs):
    """Initialise the process group for the PyTorch launcher.

    Reads the process rank from the ``RANK`` environment variable, selects
    CUDA device ``rank % device_count`` and calls
    ``dist.init_process_group`` with ``backend`` and ``kwargs``.
    """
    global_rank = int(os.environ['RANK'])
    device_index = global_rank % torch.cuda.device_count()
    torch.cuda.set_device(device_index)
    dist.init_process_group(backend=backend, **kwargs)

def _init_dist_mpi(backend, **kwargs):
    """Placeholder for the MPI launcher; always raises ``NotImplementedError``."""
    raise NotImplementedError

def _init_dist_slurm(backend, port=10086, **kwargs):
    """Initialise the process group from SLURM environment variables.

    Reads ``SLURM_PROCID``, ``SLURM_NTASKS`` and ``SLURM_NODELIST``, selects
    CUDA device ``proc_id % device_count``, resolves the first host name of
    the node list via ``scontrol`` and exports ``MASTER_PORT``,
    ``MASTER_ADDR``, ``WORLD_SIZE`` and ``RANK`` before calling
    ``dist.init_process_group``. ``kwargs`` is accepted but not used.
    """
    env = os.environ
    proc_id = int(env['SLURM_PROCID'])
    ntasks = int(env['SLURM_NTASKS'])
    node_list = env['SLURM_NODELIST']

    torch.cuda.set_device(proc_id % torch.cuda.device_count())

    # The first host in the allocation acts as the rendezvous master.
    addr = subprocess.getoutput(
        'scontrol show hostname {} | head -n1'.format(node_list))

    exported = (
        ('MASTER_PORT', str(port)),
        ('MASTER_ADDR', addr),
        ('WORLD_SIZE', str(ntasks)),
        ('RANK', str(proc_id)),
    )
    for key, value in exported:
        env[key] = value
    dist.init_process_group(backend=backend)

def gather_tensors(input_array):
    """Gather one array from every process, allowing different sizes per rank.

    Shapes are exchanged first; each rank then pads its flattened data to the
    largest element count so that ``all_gather`` sees equally sized tensors,
    and the padding is stripped again afterwards.

    Args:
        input_array: array exposing ``.shape``, ``.size`` and ``.reshape``
            (assumes a NumPy array - TODO confirm against callers).

    Returns:
        A list with one NumPy array per rank, each reshaped to the shape that
        rank reported.

    NOTE(review): the shape placeholders are built from this rank's own
    shape, so all ranks presumably need the same number of dimensions -
    confirm. Data makes a round trip through ``torch.Tensor`` on CUDA.
    """
    world_size = dist.get_world_size()
    ## gather shapes first
    myshape = input_array.shape
    mycount = input_array.size
    shape_tensor = torch.Tensor(np.array(myshape)).cuda()
    # Receive buffers for every rank's shape (contents are overwritten).
    all_shape = [torch.Tensor(np.array(myshape)).cuda() for i in range(world_size)]
    dist.all_gather(all_shape, shape_tensor)
    ## compute largest shapes
    all_shape = [x.cpu().numpy() for x in all_shape]
    all_count = [int(x.prod()) for x in all_shape]
    all_shape = [list(map(int, x)) for x in all_shape]
    max_count = max(all_count)
    ## padding tensors and gather them
    # One receive buffer of max_count elements per rank.
    output_tensors = [torch.Tensor(max_count).cuda() for i in range(world_size)]
    padded_input_array = np.zeros(max_count)
    padded_input_array[:mycount] = input_array.reshape(-1)
    input_tensor = torch.Tensor(padded_input_array).cuda()
    dist.all_gather(output_tensors, input_tensor)
    ## unpadding gathered tensors
    padded_output = [x.cpu().numpy() for x in output_tensors]
    # Drop each rank's padding and restore its reported shape.
    output = [x[:all_count[i]].reshape(all_shape[i]) for i,x in enumerate(padded_output)]
    return output

def gather_tensors_batch(input_array, part_size=10):
    """Gather ``input_array`` from all ranks in chunks of ``part_size`` rows.

    Each chunk is gathered with ``gather_tensors``; the chunks are then
    re-joined per rank and the per-rank arrays are concatenated along axis 0.

    Returns:
        One array holding rank 0's rows, then rank 1's rows, and so on.
    """
    # gather
    rank = dist.get_rank()
    num_rows = input_array.shape[0]
    # Ceil division: a trailing partial chunk counts as one more part.
    part_num = num_rows // part_size + (1 if num_rows % part_size != 0 else 0)

    all_features = []
    for i in range(part_num):
        start = i * part_size
        stop = min(start + part_size, num_rows)
        part_feat = input_array[start:stop, ...]
        assert part_feat.shape[0] > 0, "rank: {}, length of part features should > 0".format(rank)
        print("rank: {}, gather part: {}/{}, length: {}".format(rank, i, part_num, len(part_feat)))
        all_features.append(gather_tensors(part_feat))
    print("rank: {}, gather done.".format(rank))

    num_ranks = len(all_features[0])
    per_rank = [
        np.concatenate([part[j] for part in all_features], axis=0)
        for j in range(num_ranks)
    ]
    return np.concatenate(per_rank, axis=0)

def reduce_tensors(tensor):
    """Return an all-reduced copy of ``tensor``; the input is left untouched."""
    result = tensor.clone()
    dist.all_reduce(result)
    return result

class DistributedSequentialSampler(Sampler):
    """Give each rank one contiguous, equally sized slice of the dataset.

    The index list is padded by wrapping around to the start so that every
    rank receives ``ceil(len(dataset) / world_size)`` indices.

    Args:
        dataset: sized dataset; must have at least ``world_size`` items.
        world_size: number of processes (defaults to ``dist.get_world_size()``).
        rank: this process's rank (defaults to ``dist.get_rank()``).
    """

    def __init__(self, dataset, world_size=None, rank=None):
        if world_size is None:
            world_size = dist.get_world_size()
        if rank is None:
            rank = dist.get_rank()
        self.dataset = dataset
        self.world_size = world_size
        self.rank = rank
        assert len(self.dataset) >= self.world_size, '{} vs {}'.format(len(self.dataset), self.world_size)
        sub_num = int(math.ceil(len(self.dataset) * 1.0 / self.world_size))
        self.beg = sub_num * self.rank
        # The end is deliberately not clipped to len(dataset): the padded
        # index list below covers the overshoot of the last rank.
        self.end = self.beg + sub_num
        self.padded_ind = list(range(len(self.dataset))) + list(range(sub_num * self.world_size - len(self.dataset)))

    def __iter__(self):
        return iter(self.padded_ind[self.beg:self.end])

    def __len__(self):
        return self.end - self.beg

class GivenIterationSampler(Sampler):
    """Single-pass sampler yielding ``total_iter * batch_size`` indices.

    The dataset indices are tiled to the required length and shuffled once
    with a fixed seed. When resuming, the first ``(last_iter + 1)`` batches
    are skipped. The sampler can be iterated only once.
    """

    def __init__(self, dataset, total_iter, batch_size, last_iter=-1):
        self.dataset = dataset
        self.total_iter = total_iter
        self.batch_size = batch_size
        self.last_iter = last_iter

        self.total_size = self.total_iter * self.batch_size
        self.indices = self.gen_new_list()
        self.call = 0

    def __iter__(self):
        if self.call != 0:
            raise RuntimeError("this sampler is not designed to be called more than once!!")
        self.call = 1
        consumed = (self.last_iter + 1) * self.batch_size
        return iter(self.indices[consumed:])

    def gen_new_list(self):
        """Build the shuffled index array of length ``total_size``."""
        # Fixed seed (this reseeds NumPy's global RNG) so the order is
        # reproducible across runs.
        np.random.seed(0)

        all_size = self.total_size
        base = np.arange(len(self.dataset))[:all_size]
        # Repeat the dataset often enough to cover all_size, then trim.
        num_repeat = (all_size - 1) // base.shape[0] + 1
        indices = np.tile(base, num_repeat)[:all_size]

        np.random.shuffle(indices)

        assert len(indices) == self.total_size

        return indices

    def __len__(self):
        return self.total_size


class DistributedGivenIterationSampler(Sampler):
    """Single-pass sampler giving each rank ``total_iter * batch_size`` indices.

    Every process builds the same seeded shuffle of
    ``total_size * world_size`` indices and keeps the slice belonging to its
    rank. When resuming, the first ``(last_iter + 1)`` batches are skipped.
    The sampler can be iterated only once.
    """

    def __init__(self, dataset, total_iter, batch_size, world_size=None, rank=None, last_iter=-1):
        if world_size is None:
            world_size = dist.get_world_size()
        if rank is None:
            rank = dist.get_rank()
        assert rank < world_size
        self.dataset = dataset
        self.total_iter = total_iter
        self.batch_size = batch_size
        self.world_size = world_size
        self.rank = rank
        self.last_iter = last_iter

        self.total_size = self.total_iter * self.batch_size

        self.indices = self.gen_new_list()
        self.call = 0

    def __iter__(self):
        if self.call != 0:
            raise RuntimeError("this sampler is not designed to be called more than once!!")
        self.call = 1
        consumed = (self.last_iter + 1) * self.batch_size
        return iter(self.indices[consumed:])

    def gen_new_list(self):
        """Build this rank's shuffled index array of length ``total_size``."""
        # each process shuffle all list with same seed, and pick one piece according to rank
        np.random.seed(0)

        all_size = self.total_size * self.world_size
        base = np.arange(len(self.dataset))[:all_size]
        # Repeat the dataset often enough to cover all_size, then trim.
        num_repeat = (all_size - 1) // base.shape[0] + 1
        indices = np.tile(base, num_repeat)[:all_size]

        np.random.shuffle(indices)
        first = self.total_size * self.rank
        indices = indices[first:first + self.total_size]

        assert len(indices) == self.total_size

        return indices

    def __len__(self):
        # note here we do not take last iter into consideration, since __len__
        # should only be used for displaying, the correct remaining size is
        # handled by dataloader
        return self.total_size


================================================
FILE: mimicmotion/modules/cmp/utils/flowlib.py
================================================
#!/usr/bin/python
"""
# ==============================
# flowlib.py
# library for optical flow processing
# Author: Ruoteng Li
# Date: 6th Aug 2016
# ==============================
"""
#import png
import numpy as np
from PIL import Image
import io

UNKNOWN_FLOW_THRESH = 1e7
SMALLFLOW = 0.0
LARGEFLOW = 1e8

"""
=============
Flow Section
=============
"""

def write_flow(flow, filename):
    """
    Write optical flow in Middlebury .flo format.

    The file layout is: float32 magic number (202021.25), int32 width,
    int32 height, then the flow values as float32.

    :param flow: optical flow map; the first two axes are (height, width)
    :param filename: optical flow file path to be saved
    :return: None
    """
    # The .flo payload is float32; cast so that e.g. float64 input does not
    # silently produce a file that read_flo_file cannot decode.
    flow = np.asarray(flow, dtype=np.float32)
    (height, width) = flow.shape[0:2]
    magic = np.array([202021.25], dtype=np.float32)
    w = np.array([width], dtype=np.int32)
    h = np.array([height], dtype=np.int32)
    # `with` guarantees the handle is closed even if a write fails.
    with open(filename, 'wb') as f:
        magic.tofile(f)
        w.tofile(f)
        h.tofile(f)
        flow.tofile(f)


def save_flow_image(flow, image_file):
    """
    Render a flow map with ``flow_to_image`` and save the picture to disk.

    :param flow: optical flow data
    :param image_file: path of the image file to write
    :return: None
    """
    rendered = flow_to_image(flow)
    Image.fromarray(rendered).save(image_file)

def segment_flow(flow):
    """
    Label every pixel of a flow map with one of eight direction classes.

    Classes 1-8 are decided from the signs of ``u``/``v`` and from
    ``v / u`` compared against -1, 0 and 1. Class 0 is used for pixels with
    exactly zero flow and for pixels whose ``|u|`` or ``|v|`` exceeds
    ``LARGEFLOW``.

    :param flow: flow map indexed as ``flow[row, col, channel]`` with
                 channel 0 = u and channel 1 = v
    :return: (h, w) float array of class labels in 0..8
    """
    h = flow.shape[0]
    w = flow.shape[1]
    # Work on a copy of u: it is patched below to avoid dividing by zero, and
    # writing through a view would silently modify the caller's flow.
    u = flow[:, :, 0].copy()
    if not np.issubdtype(u.dtype, np.floating):
        # An integer array would truncate the 1e-5 substitute back to 0.
        u = u.astype(np.float64)
    v = flow[:, :, 1]

    idx = ((abs(u) > LARGEFLOW) | (abs(v) > LARGEFLOW))
    idx2 = (abs(u) == SMALLFLOW)
    # Must be computed before u is patched.
    class0 = (v == 0) & (u == 0)
    u[idx2] = 0.00001
    tan_value = v / u

    class1 = (tan_value < 1) & (tan_value >= 0) & (u > 0) & (v >= 0)
    class2 = (tan_value >= 1) & (u >= 0) & (v >= 0)
    class3 = (tan_value < -1) & (u <= 0) & (v >= 0)
    class4 = (tan_value < 0) & (tan_value >= -1) & (u < 0) & (v >= 0)
    class8 = (tan_value >= -1) & (tan_value < 0) & (u > 0) & (v <= 0)
    class7 = (tan_value < -1) & (u >= 0) & (v <= 0)
    class6 = (tan_value >= 1) & (u <= 0) & (v <= 0)
    class5 = (tan_value >= 0) & (tan_value < 1) & (u < 0) & (v <= 0)

    seg = np.zeros((h, w))

    # Later assignments win where masks overlap, so the order matters; the
    # zero-flow and too-large masks are applied last.
    seg[class1] = 1
    seg[class2] = 2
    seg[class3] = 3
    seg[class4] = 4
    seg[class5] = 5
    seg[class6] = 6
    seg[class7] = 7
    seg[class8] = 8
    seg[class0] = 0
    seg[idx] = 0

    return seg

def flow_to_image(flow):
    """
    Convert flow into middlebury color code image.

    Pixels whose ``|u|`` or ``|v|`` exceeds ``UNKNOWN_FLOW_THRESH`` are
    treated as unknown and rendered black. The input array is not modified.

    :param flow: optical flow map indexed as ``flow[row, col, channel]``
                 with channel 0 = u and channel 1 = v
    :return: optical flow image in middlebury color (uint8, h x w x 3)
    """
    # Copy the channels: they are edited below, and plain slices would be
    # views that write the zeros back into the caller's array.
    u = flow[:, :, 0].copy()
    v = flow[:, :, 1].copy()

    idxUnknow = (abs(u) > UNKNOWN_FLOW_THRESH) | (abs(v) > UNKNOWN_FLOW_THRESH)
    u[idxUnknow] = 0
    v[idxUnknow] = 0

    rad = np.sqrt(u ** 2 + v ** 2)
    # Normalise by the largest magnitude, but never by less than 5, so that
    # small flows are not stretched to full saturation.
    maxrad = max(5, np.max(rad))

    eps = np.finfo(float).eps
    u = u / (maxrad + eps)
    v = v / (maxrad + eps)

    img = compute_color(u, v)

    # Black out the unknown pixels in all three colour channels.
    idx = np.repeat(idxUnknow[:, :, np.newaxis], 3, axis=2)
    img[idx] = 0

    return np.uint8(img)

def disp_to_flowfile(disp, filename):
    """
    Write a disparity map as a Middlebury .flo file.

    The disparity becomes the first flow channel; the second channel is
    filled with zeros.

    :param disp: disparity matrix of shape (height, width)
    :param filename: the flow file name to save
    :return: None
    """
    (height, width) = disp.shape[0:2]
    magic = np.array([202021.25], dtype=np.float32)
    w = np.array([width], dtype=np.int32)
    h = np.array([height], dtype=np.int32)
    empty_map = np.zeros((height, width), dtype=np.float32)
    # Force float32: dstack would otherwise promote to disp's dtype (e.g.
    # float64) and the payload would no longer be a valid .flo body.
    data = np.dstack((disp, empty_map)).astype(np.float32, copy=False)
    # `with` guarantees the handle is closed even if a write fails.
    with open(filename, 'wb') as f:
        magic.tofile(f)
        w.tofile(f)
        h.tofile(f)
        data.tofile(f)

def compute_color(u, v):
    """
    Map a (u, v) flow field to colours using the Middlebury colour wheel.

    The flow angle picks a position on the wheel (linearly interpolated
    between the two neighbouring entries). Colours with magnitude <= 1 are
    blended towards white as the magnitude shrinks; larger magnitudes are
    dimmed by 0.75. NaN entries of ``u``/``v`` are overwritten with 0 in place
    and rendered black.

    :param u: optical flow horizontal map (2-D array, modified in place)
    :param v: optical flow vertical map (2-D array, modified in place)
    :return: float array of shape (h, w, 3) holding values in 0..255
    """
    height, width = u.shape
    img = np.zeros([height, width, 3])

    nanIdx = np.isnan(u) | np.isnan(v)
    u[nanIdx] = 0
    v[nanIdx] = 0

    colorwheel = make_color_wheel()
    ncols = np.size(colorwheel, 0)

    rad = np.sqrt(u**2+v**2)

    # Angle scaled to [-1, 1], then to a 1-based fractional wheel position.
    angle = np.arctan2(-v, -u) / np.pi
    wheel_pos = (angle+1) / 2 * (ncols - 1) + 1

    lower = np.floor(wheel_pos).astype(int)
    upper = lower + 1
    upper[upper == ncols+1] = 1  # wrap past the last wheel entry
    frac = wheel_pos - lower

    # These masks do not depend on the channel, so build them once.
    within_unit = rad <= 1
    beyond_unit = np.logical_not(within_unit)

    n_channels = np.size(colorwheel, 1)
    for channel in range(0, n_channels):
        wheel_column = colorwheel[:, channel]
        low_col = wheel_column[lower-1] / 255
        high_col = wheel_column[upper-1] / 255
        col = (1-frac) * low_col + frac * high_col

        col[within_unit] = 1-rad[within_unit]*(1-col[within_unit])
        col[beyond_unit] *= 0.75

        img[:, :, channel] = np.uint8(np.floor(255 * col*(1-nanIdx)))

    return img


def make_color_wheel():
    """
    Build the Middlebury colour wheel.

    The wheel is six consecutive segments (red->yellow, yellow->green,
    green->cyan, cyan->blue, blue->magenta, magenta->red). In each segment one
    RGB channel is held at 255 while another ramps up or down.

    :return: float array of shape (55, 3) with values in 0..255
    """
    # (segment length, channel held at 255, ramping channel, ramp descends?)
    segments = (
        (15, 0, 1, False),  # RY
        (6, 1, 0, True),    # YG
        (4, 1, 2, False),   # GC
        (11, 2, 1, True),   # CB
        (13, 2, 0, False),  # BM
        (6, 0, 2, True),    # MR
    )

    ncols = sum(length for length, _, _, _ in segments)
    colorwheel = np.zeros([ncols, 3])

    start = 0
    for length, held, ramped, descending in segments:
        stop = start + length
        ramp = np.floor(255 * np.arange(0, length) / length)
        colorwheel[start:stop, held] = 255
        colorwheel[start:stop, ramped] = 255 - ramp if descending else ramp
        start = stop

    return colorwheel


def read_flo_file(filename, memcached=False):
    """
    Read from Middlebury .flo file.

    :param filename: path of the flow file, or (when ``memcached`` is true)
                     the raw bytes of a .flo file already held in memory
    :param memcached: treat ``filename`` as in-memory file content
    :return: optical flow data as an (h, w, 2) float32 array, or None when
             the magic number does not match
    """
    # Previously the in-memory buffer was wrapped in BytesIO and then handed to
    # open(), which always raised. Both sources are now read through the same
    # file-like interface.
    stream = io.BytesIO(filename) if memcached else open(filename, 'rb')
    with stream as f:
        magic = np.frombuffer(f.read(4), np.float32)[0]
        if 202021.25 != magic:
            print('Magic number incorrect. Invalid .flo file')
            return None
        w, h = (int(n) for n in np.frombuffer(f.read(8), np.int32))
        payload = f.read(2 * w * h * 4)

    # Drop a trailing partial value, mirroring what np.fromfile would return
    # for a truncated file.
    payload = payload[:len(payload) // 4 * 4]
    data2d = np.frombuffer(payload, np.float32)
    # reshape data into 3D array (rows, columns, channels); np.resize also
    # returns a fresh, writable array.
    return np.resize(data2d, (h, w, 2))


# fast resample layer
def resample(img, sz):
    """
    Bilinearly resample a two-channel flow map and rescale its vectors.

    img: flow map to be resampled, indexed as img[row, col, channel]
    sz: new flow map size. Must be [height, width]

    Returns an (out_height, out_width, 2) float array. Channel 0 is
    multiplied by out_width / in_width and channel 1 by
    out_height / in_height so the vectors match the new resolution.
    """
    in_height = img.shape[0]
    in_width = img.shape[1]
    out_height = sz[0]
    out_width = sz[1]
    out_flow = np.zeros((out_height, out_width, 2))
    # find scale
    height_scale = float(in_height) / float(out_height)
    width_scale = float(in_width) / float(out_width)

    [x, y] = np.meshgrid(range(out_width), range(out_height))
    xx = x * width_scale
    yy = y * height_scale

    x_floor = np.floor(xx)
    y_floor = np.floor(yy)
    # Interpolation weights come from the fractional offsets *before* the
    # neighbour indices are clipped. Deriving them from the clipped indices
    # made all four weights cancel on the last row/column, which zeroed the
    # border of the output.
    fx = xx - x_floor
    fy = yy - y_floor

    x0 = np.clip(x_floor.astype(np.int32), 0, in_width - 1)
    x1 = np.clip(x_floor.astype(np.int32) + 1, 0, in_width - 1)
    y0 = np.clip(y_floor.astype(np.int32), 0, in_height - 1)
    y1 = np.clip(y_floor.astype(np.int32) + 1, 0, in_height - 1)

    Ia = img[y0, x0, :]
    Ib = img[y1, x0, :]
    Ic = img[y0, x1, :]
    Id = img[y1, x1, :]

    wa = (1 - fy) * (1 - fx)
    wb = fy * (1 - fx)
    wc = (1 - fy) * fx
    wd = fy * fx
    out_flow[:, :, 0] = (Ia[:, :, 0]*wa + Ib[:, :, 0]*wb + Ic[:, :, 0]*wc + Id[:, :, 0]*wd) * out_width / in_width
    out_flow[:, :, 1] = (Ia[:, :, 1]*wa + Ib[:, :, 1]*wb + Ic[:, :, 1]*wc + Id[:, :, 1]*wd) * out_height / in_height

    return out_flow


================================================
FILE: mimicmotion/modules/cmp/utils/scheduler.py
================================================
import torch
from bisect import bisect_right

class _LRScheduler(object):
    def __init__(self, optimizer, last_iter=-1):
        if not isinstance(optimizer, torch.optim.Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer
        if last_iter == -1:
            for group in optimizer.param_groups:
                group.setdefault('initial_lr', group['lr'])
        else:
            for i, group in enumerate(optimizer.param_groups):
                if 'initial_lr' not in group:
                    raise KeyError("param 'initial_lr' is not specified "
                                   "in param_groups[{}] when resuming an optimizer".format(i))
        self.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
        self.last_iter = last_iter

    def _get_new_lr(self):
        raise NotImplementedError

    def get_lr(self):
        return list(map(lambda group: group['lr'], self.optimizer.param_groups))

    def step(self, this_iter=None):
        if this_iter is None:
            this_iter = self.last_iter + 1
        self.last_iter = this_iter
        for param_group, lr in zip(self.optimizer.param_groups, self._get_new_lr()):
            param_group['lr'] = lr

class _WarmUpLRSchedulerOld(_LRScheduler):
    """Scheduler base with a single linear warm-up phase.

    During the first ``warmup_steps`` iterations the learning rate is
    interpolated linearly from ``base_lr`` to ``warmup_lr``.
    """

    def __init__(self, optimizer, base_lr, warmup_lr, warmup_steps, last_iter=-1):
        self.base_lr = base_lr
        self.warmup_steps = warmup_steps
        # Without a warm-up phase the warm-up target is just the base lr.
        self.warmup_lr = base_lr if warmup_steps == 0 else warmup_lr
        super().__init__(optimizer, last_iter)

    def _get_warmup_lr(self):
        """Return the warm-up lrs per param group, or None once warm-up is over."""
        in_warmup = self.warmup_steps > 0 and self.last_iter < self.warmup_steps
        if not in_warmup:
            return None

        # Compute the target lr relative to self.base_lr, then apply that
        # ratio to every group's own base lr.
        progress = self.last_iter/self.warmup_steps
        scale = (progress*(self.warmup_lr - self.base_lr) + self.base_lr)/self.base_lr
        return [scale * group_lr for group_lr in self.base_lrs]

class _WarmUpLRScheduler(_LRScheduler):
    """Scheduler base with a piecewise-linear, multi-stage warm-up.

    ``warmup_steps[i]`` is the iteration at which the learning rate reaches
    ``warmup_lr[i]``; between consecutive stages the rate is interpolated
    linearly, starting from ``base_lr`` at iteration 0.
    """

    def __init__(self, optimizer, base_lr, warmup_lr, warmup_steps, last_iter=-1):
        self.base_lr = base_lr
        self.warmup_lr = warmup_lr
        self.warmup_steps = warmup_steps
        assert isinstance(warmup_lr, list)
        assert isinstance(warmup_steps, list)
        assert len(warmup_lr) == len(warmup_steps)
        super().__init__(optimizer, last_iter)

    def _get_warmup_lr(self):
        """Return the warm-up lrs per param group, or None after the last stage."""
        steps, targets = self.warmup_steps, self.warmup_lr

        stage = bisect_right(steps, self.last_iter)
        if stage >= len(steps):
            return None

        if stage == 0:
            # First stage: ramp from base_lr at iteration 0.
            curr_lr = self.base_lr + self.last_iter * (targets[stage] - self.base_lr) / steps[stage]
        else:
            prev_step, prev_lr = steps[stage - 1], targets[stage - 1]
            curr_lr = prev_lr + (self.last_iter - prev_step) * (targets[stage] - prev_lr) / (steps[stage] - prev_step)

        scale = curr_lr / self.base_lr
        return [scale * group_lr for group_lr in self.base_lrs]

class StepLRScheduler(_WarmUpLRScheduler):
    """Warm-up followed by step-wise learning-rate changes at fixed milestones.

    After warm-up, the learning rate is the last warm-up value (or the base
    lr when no warm-up stages are given) times the cumulative product of the
    ``lr_mults`` whose milestones have been passed.
    """

    def __init__(self, optimizer, milestones, lr_mults, base_lr, warmup_lr, warmup_steps, last_iter=-1):
        """
        Args:
            optimizer: a ``torch.optim.Optimizer``.
            milestones: increasing integer iterations at which the lr changes.
            lr_mults: one multiplier per milestone, applied cumulatively.
            base_lr: reference learning rate used to derive the lr scale.
            warmup_lr: list of warm-up target learning rates.
            warmup_steps: list of iterations at which those targets are hit.
            last_iter: index of the last finished iteration (-1 = fresh).

        Raises:
            ValueError: ``milestones`` is not sorted in increasing order.
        """
        super(StepLRScheduler, self).__init__(optimizer, base_lr, warmup_lr, warmup_steps, last_iter)

        assert len(milestones) == len(lr_mults), "{} vs {}".format(milestones, lr_mults)
        for x in milestones:
            assert isinstance(x, int)
        if not list(milestones) == sorted(milestones):
            # The message used to be passed as ('...{}', milestones) and was
            # therefore never formatted.
            raise ValueError('Milestones should be a list of'
                             ' increasing integers. Got {}'.format(milestones))
        self.milestones = milestones
        # lr_mults[k] is the cumulative multiplier after k milestones.
        self.lr_mults = [1.0]
        for x in lr_mults:
            self.lr_mults.append(self.lr_mults[-1]*x)

    def _get_new_lr(self):
        """Return the learning rate for every param group at ``last_iter``."""
        warmup_lrs = self._get_warmup_lr()
        if warmup_lrs is not None:
            return warmup_lrs

        pos = bisect_right(self.milestones, self.last_iter)
        if len(self.warmup_lr) == 0:
            scale = self.lr_mults[pos]
        else:
            # Continue from the lr reached at the end of warm-up.
            scale = self.warmup_lr[-1] * self.lr_mults[pos] / self.base_lr
        return [base_lr * scale for base_lr in self.base_lrs]


================================================
FILE: mimicmotion/modules/cmp/utils/visualize_utils.py
================================================
import numpy as np

import torch
from . import flowlib

class Fuser(object):
    """Turn per-bin flow logits into a flow field by taking the expected value.

    The range ``[-fmax, fmax]`` is split into ``nbins`` equal bins; ``mesh``
    holds the centre of every bin.
    """

    def __init__(self, nbins, fmax):
        """
        Args:
            nbins: number of bins per flow component.
            fmax: largest absolute flow value covered by the bins.
        """
        self.nbins = nbins
        self.fmax = fmax
        self.step = 2 * fmax / float(nbins)
        # Prefer CUDA as before, but fall back to the CPU instead of crashing
        # on machines without a GPU.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.mesh = torch.arange(nbins).view(1, -1, 1, 1).float().to(device) * self.step - fmax + self.step / 2

    def convert_flow(self, flow_prob):
        """Convert logits of shape (N, 2 * nbins, H, W) into flow (N, 2, H, W).

        The first ``nbins`` channels are the x-component bins, the remaining
        channels the y-component bins. Each group is soft-maxed and reduced to
        the probability-weighted mean of the bin centres.
        """
        # Keep the bin centres on the same device as the input.
        mesh = self.mesh.to(flow_prob.device)
        flow_probx = torch.nn.functional.softmax(flow_prob[:, :self.nbins, :, :], dim=1)
        flow_proby = torch.nn.functional.softmax(flow_prob[:, self.nbins:, :, :], dim=1)
        flow_probx = flow_probx * mesh
        flow_proby = flow_proby * mesh
        flow = torch.cat([flow_probx.sum(dim=1, keepdim=True), flow_proby.sum(dim=1, keepdim=True)], dim=1)
        return flow

def visualize_tensor_old(image, mask, flow_pred, flow_target, warped, rgb_gen, image_target, image_mean, image_div):
    """Build a side-by-side visualisation strip (concatenated along dim 3).

    Panels, in order: the un-normalised input image with crosses drawn at the
    mask positions, the predicted flow, the target flow, then ``warped``,
    ``rgb_gen`` and ``image_target`` for whichever of those is not None
    (each un-normalised and clamped to [0, 255]).

    NOTE: ``unormalize`` works in place, so tensors that already live on the
    CPU are modified by this call.
    """
    def _to_display(t):
        return torch.clamp(unormalize(t, mean=image_mean, div=image_div), 0, 255)

    cross_radius = int(image.size(3) / 50.)
    panels = [
        draw_cross(unormalize(image.cpu(), mean=image_mean, div=image_div), mask.cpu(), radius=cross_radius),
        flow_to_image(flow_pred.detach().cpu()),
        flow_to_image(flow_target.detach().cpu())]

    # (tensor, detach before moving to the CPU?)
    optional_panels = ((warped, True), (rgb_gen, True), (image_target, False))
    for candidate, needs_detach in optional_panels:
        if candidate is None:
            continue
        source = candidate.detach() if needs_detach else candidate
        panels.append(_to_display(source.cpu()))

    return torch.cat(panels, dim=3)

def visualize_tensor(image, mask, flow_tensors, common_tensors, rgb_tensors, image_mean, image_div):
    """Build a side-by-side visualisation strip (concatenated along dim 3).

    Panels, in order: the un-normalised input image with crosses drawn at the
    mask positions; every tensor of ``flow_tensors`` rendered as a flow
    image; every tensor of ``common_tensors`` clamped to [0, 255]; every
    tensor of ``rgb_tensors`` un-normalised and clamped to [0, 255].

    NOTE: ``unormalize`` works in place, so tensors that already live on the
    CPU are modified by this call.
    """
    cross_radius = int(image.size(3) / 50.)
    panels = [
        draw_cross(unormalize(image.cpu(), mean=image_mean, div=image_div), mask.cpu(), radius=cross_radius)]

    panels.extend(flow_to_image(ft.cpu()) for ft in flow_tensors)
    panels.extend(torch.clamp(ct.cpu(), 0, 255) for ct in common_tensors)
    panels.extend(
        torch.clamp(unormalize(rt.cpu(), mean=image_mean, div=image_div), 0, 255)
        for rt in rgb_tensors)

    return torch.cat(panels, dim=3)


def unormalize(tensor, mean, div):
    """Undo a per-channel ``(x - mean) / div`` normalisation, in place.

    For every channel ``c`` covered by ``mean``/``div`` the slice
    ``tensor[:, c]`` is multiplied by ``div[c]`` and then shifted by
    ``mean[c]``. The input tensor itself is modified and returned.
    """
    for channel, (offset, scale) in enumerate(zip(mean, div)):
        plane = tensor[:, channel, :, :]  # a view, so edits hit `tensor`
        plane.mul_(scale)
        plane.add_(offset)
    return tensor


def flow_to_image(flow):
    """Render a batch of flow tensors as colour images.

    Each item of the batch is moved to channel-last layout, coloured with
    ``flowlib.flow_to_image`` and moved back to channel-first layout.

    Returns a float32 tensor holding the stacked images.
    """
    frames = []
    for fl in flow.numpy():
        rendered = flowlib.flow_to_image(fl.transpose((1, 2, 0)))
        frames.append(rendered.transpose((2, 0, 1)))
    flow_img = np.array(frames).astype(np.float32)
    return torch.from_numpy(flow_img)

def shift_tensor(input, offh, offw):
    """Shift a 4-D tensor by (offh, offw) along dims 2 and 3, zero-filling.

    Positive offsets move the content towards larger indices. The result is
    a new tensor created with ``torch.zeros`` (default dtype and device).
    """
    h = input.size(2)
    w = input.size(3)
    shifted = torch.zeros(input.size())

    dst_rows = slice(max(0, offh), min(h, h + offh))
    dst_cols = slice(max(0, offw), min(w, w + offw))
    src_rows = slice(max(0, -offh), min(h, h - offh))
    src_cols = slice(max(0, -offw), min(w, w - offw))

    shifted[:, :, dst_rows, dst_cols] = input[:, :, src_rows, src_cols]
    return shifted

def draw_block(mask, radius=5):
    '''
    Grow every non-zero entry of the mask's first channel into a square block.

    input:  mask (NxCxHxW), only channel 0 is used
    output: block_mask (Nx1xHxW) with value 1 wherever the summed shifted
            copies of the mask (offsets in [-radius, radius]) are positive
    '''
    first_channel = mask[:,0:1,:,:]
    offsets = range(-radius, radius+1)
    shifted_copies = [
        shift_tensor(first_channel, offh, offw)
        for offh in offsets
        for offw in offsets
    ]
    block_mask = sum(shifted_copies)
    block_mask[block_mask > 0] = 1
    return block_mask

def expand_block(sparse, radius=5):
    '''
    Spread every entry of a sparse tensor over a square neighbourhood.

    input:  sparse (NxCxHxW)
    output: block_sparse (NxCxHxW), the sum of all copies of the input
            shifted by every (offh, offw) in [-radius, radius]^2
    '''
    offsets = range(-radius, radius+1)
    shifted_copies = [
        shift_tensor(sparse, offh, offw)
        for offh in offsets
        for offw in offsets
    ]
    return sum(shifted_copies)

def draw_cross(tensor, mask, radius=5, thickness=2):
    '''
    Paint a cross at every non-zero position of the mask's first channel.

    input:  tensor (NxCxHxW)
            mask (NxXxHxW), only channel 0 is used
    output: new_tensor (NxCxHxW), a copy of the input whose first three
            channels are set to (255, 0, 0) on the cross pixels
    '''
    first_channel = mask[:,0:1,:,:]
    # Each (off, t) pair contributes one pixel to each of the two arms.
    arms = [
        shifted
        for off in range(-radius, radius+1)
        for t in range(-thickness, thickness+1)
        for shifted in (shift_tensor(first_channel, off, t),
                        shift_tensor(first_channel, t, off))
    ]
    cross_mask = sum(arms)

    new_tensor = tensor.clone()
    for channel, value in enumerate((255.0, 0.0, 0.0)):
        new_tensor[:,channel:channel+1,:,:][cross_mask > 0] = value
    return new_tensor


================================================
FILE: mimicmotion/modules/cmp_model.py
================================================
from typing import Any, Dict, List, Optional, Tuple, Union
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F

from diffusers.configuration_utils import register_to_config
from diffusers.utils import BaseOutput

import mimicmotion.modules.cmp.models as cmp_models
import mimicmotion.modules.cmp.utils as cmp_utils

import yaml
import os
import torchvision.transforms as transforms


class ArgObj:
    """Empty attribute container; fields are attached with ``setattr``."""


class CMP(nn.Module):
    """Wrapper that loads a CMP model from an experiment config and runs it.

    The architecture named by ``config['model']['arch']`` is looked up in
    ``mimicmotion.modules.cmp.models`` and its weights are loaded from the
    ``checkpoints`` directory next to the config file.
    """

    def __init__(self, configfn, load_iter):
        """
        Args:
            configfn: path of the experiment's YAML config file.
            load_iter: checkpoint iteration passed to ``load_state``.
        """
        super().__init__()
        args = ArgObj()
        with open(configfn) as f:
            # NOTE(review): yaml.full_load can construct more than plain data;
            # only point this at trusted config files (yaml.safe_load would be
            # the safer choice if the configs allow it).
            config = yaml.full_load(f)
        for k, v in config.items():
            setattr(args, k, v)
        setattr(args, 'load_iter', load_iter)
        setattr(args, 'exp_path', os.path.dirname(configfn))

        self.model = cmp_models.__dict__[args.model['arch']](args.model, dist_model=False)
        self.model.load_state("{}/checkpoints".format(args.exp_path), args.load_iter, False)
        self.model.switch_to('eval')

        self.data_mean = args.data['data_mean']
        self.data_div = args.data['data_div']

        self.img_transform = transforms.Compose([
            transforms.Normalize(self.data_mean, self.data_div)])

        self.args = args
        self.fuser = cmp_utils.Fuser(args.model['module']['nbins'], args.model['module']['fmax'])
        torch.cuda.synchronize()

    def run(self, image, sparse, mask):
        """Predict a flow field from an image plus sparse guidance.

        Args:
            image: image tensor; it is rescaled with ``image * 2 - 1`` before
                being fed to the model.
            sparse: sparse guidance tensor, concatenated with ``mask`` on dim 1.
            mask: mask tensor for the sparse guidance.

        Returns:
            Flow tensor ``[b, 2, h, w]`` in the dtype of ``image``, resized to
            the spatial size of the model's image input when necessary.
        """
        dtype = image.dtype
        image = image * 2 - 1
        self.model.set_input(image.float(), torch.cat([sparse, mask], dim=1).float(), None)
        try:
            cmp_output = self.model.model(self.model.image_input.to(torch.float16), self.model.sparse_input.to(torch.float16))
        except Exception:
            # Best-effort fallback when the half-precision forward pass fails
            # (e.g. the weights are float32). `Exception` rather than a bare
            # `except` so KeyboardInterrupt/SystemExit still propagate.
            cmp_output = self.model.model(self.model.image_input.to(torch.float32), self.model.sparse_input.to(torch.float32))
        flow = self.fuser.convert_flow(cmp_output)
        # Compare height *and* width; checking the height alone would skip the
        # resize when only the width differs.
        target_size = self.model.image_input.shape[2:4]
        if flow.shape[2:4] != target_size:
            flow = nn.functional.interpolate(
                flow, size=target_size,
                mode="bilinear", align_corners=True)

        return flow.to(dtype)  # [b, 2, h, w]

================================================
FILE: mimicmotion/modules/controlnet.py
================================================
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from torch import nn, einsum
from torch.nn import functional as F

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.loaders import FromOriginalControlNetMixin
from diffusers.utils import BaseOutput, logging
from diffusers.models.attention_processor import (
    ADDED_KV_ATTENTION_PROCESSORS,
    CROSS_ATTENTION_PROCESSORS,
    AttentionProcessor,
    AttnAddedKVProcessor,
    AttnProcessor,
)
from diffusers.models.embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
from diffusers.models.modeling_utils import ModelMixin
from diffusers.models.unets.unet_3d_blocks  import (
    get_down_block, get_up_block,UNetMidBlockSpatioTemporal,
)
from diffusers.models import UNetSpatioTemporalConditionModel
from .point_adapter import PointAdapter
from einops import rearrange, repeat
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
import pdb  
from inspect import isfunction

def exists(val):
    """Return True for every value except ``None``."""
    if val is None:
        return False
    return True

def default(val, d):
    """Return ``val`` unless it is None; otherwise fall back to ``d``.

    When ``d`` is a plain Python function it is called and its result is
    returned, so expensive defaults are only built when needed.
    """
    if not exists(val):
        return d() if isfunction(d) else d
    return val

@dataclass
class ControlNetOutput(BaseOutput):
    """
    The output of [`ControlNetModel`].

    Args:
        down_block_res_samples (`tuple[torch.Tensor]`):
            A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
            be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be
            used to condition the original UNet's downsampling activations.
        mid_block_res_sample (`torch.Tensor`):
            The activation of the middle block (the lowest sample resolution). Each tensor should be of shape
            `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
            Output can be used to condition the original UNet's middle block activation.
    """

    down_block_res_samples: Tuple[torch.Tensor]
    mid_block_res_sample: torch.Tensor


class ControlNetConditioningEmbeddingSVD(nn.Module):
    """
    Encode flow and trajectory conditioning maps into one feature map.

    Two structurally identical convolution stacks (one for the flow input,
    one for the trajectory input) each apply an input convolution, then for
    every pair of consecutive entries in `block_out_channels` a same-width
    convolution followed by a stride-2 convolution, and finally an output
    convolution wrapped in `zero_module` (defined elsewhere in this file).
    SiLU follows every convolution except the output one. The two branch
    outputs are added and passed through `conv_final`.

    `conditioning_channels`, `dift_channels` and `feature_out_channels` are
    accepted but not used by this module.
    """

    def __init__(
        self,
        conditioning_embedding_channels: int,
        conditioning_channels: int = 3,
        flow_channels: int = 2,
        dift_channels: int = 640,
        block_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
        feature_out_channels: Tuple[int, ...] = (160, 160, 256, 256),
    ):
        super().__init__()

        first_width = block_out_channels[0]
        last_width = block_out_channels[-1]

        # Flow branch. Layers are created in the same order as before so that
        # seeded initialisation and parameter ordering stay the same.
        self.conv_in_flow = nn.Conv2d(flow_channels, first_width, kernel_size=3, padding=1)
        self.blocks_flow = self._make_blocks(block_out_channels)
        self.conv_out_flow = zero_module(
            nn.Conv2d(last_width, conditioning_embedding_channels, kernel_size=3, padding=1)
        )

        # Trajectory branch (same layout as the flow branch).
        self.conv_in_traj = nn.Conv2d(flow_channels, first_width, kernel_size=3, padding=1)
        self.blocks_traj = self._make_blocks(block_out_channels)
        self.conv_out_traj = zero_module(
            nn.Conv2d(last_width, conditioning_embedding_channels, kernel_size=3, padding=1)
        )

        self.conv_final = nn.Conv2d(
            conditioning_embedding_channels, conditioning_embedding_channels, kernel_size=3, padding=1
        )

    @staticmethod
    def _make_blocks(block_out_channels):
        """Build the (same-width conv, stride-2 conv) pairs of one branch."""
        blocks = nn.ModuleList([])
        for channel_in, channel_out in zip(block_out_channels[:-1], block_out_channels[1:]):
            blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
            blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
        return blocks

    @staticmethod
    def _encode(conditioning, conv_in, blocks, conv_out):
        """Run one branch on a `(b, f, c, h, w)` conditioning tensor."""
        # Prepend an all-zero frame along the frame axis, then fold the
        # frames into the batch axis for the 2-D convolutions.
        zero_frame = torch.zeros_like(conditioning[:, 0, :, :, :]).to(conditioning.device, conditioning.dtype)
        conditioning = torch.cat([zero_frame.unsqueeze(1), conditioning], dim=1)
        conditioning = rearrange(conditioning, "b f c h w -> (b f) c h w")

        embedding = F.silu(conv_in(conditioning))
        for block in blocks:
            embedding = F.silu(block(embedding))
        return conv_out(embedding)

    def forward(self, flow_conditioning, traj_conditioning):
        """Return `conv_final(flow_embedding + traj_embedding)`."""
        embedding_flow = self._encode(
            flow_conditioning, self.conv_in_flow, self.blocks_flow, self.conv_out_flow
        )
        embedding_traj = self._encode(
            traj_conditioning, self.conv_in_traj, self.blocks_traj, self.conv_out_traj
        )
        return self.conv_final(embedding_flow + embedding_traj)


class ControlNetSVDModel(ModelMixin, ConfigMixin, FromOriginalControlNetMixin):
    r"""
    A ControlNet-style companion to the spatio-temporal UNet: it takes noisy video frames, conditioning inputs and a
    timestep, and returns per-down-block and mid-block residual features (see `forward`).

    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample.
        in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4):
            Number of channels in the output. Accepted for configuration compatibility; `__init__` does not read it.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`):
            The tuple of downsample blocks to use.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`):
            The tuple of upsample blocks. Only its length is checked against `down_block_types`; no up blocks are
            built by this class.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        addition_time_embed_dim: (`int`, defaults to 256):
            Dimension used to encode the additional time ids.
        projection_class_embeddings_input_dim (`int`, defaults to 768):
            The dimension of the projection of encoded `added_time_ids`.
        layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 2): The number of layers per block.
        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1024):
            The dimension of the cross attention features.
        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
            [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
            [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
        num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
            The number of attention heads.
        num_frames (`int`, defaults to 25):
            Number of video frames. Accepted for configuration; `__init__` does not read it.
        conditioning_channels (`int`, defaults to 3):
            Channel count of the conditioning input handed to the conditioning embedding.
        conditioning_embedding_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(16, 32, 96, 256)`):
            Channel widths handed to the conditioning embedding as its `block_out_channels`.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        sample_size: Optional[int] = None,
        in_channels: int = 8,
        out_channels: int = 4,
        down_block_types: Tuple[str] = (
            "CrossAttnDownBlockSpatioTemporal",
            "CrossAttnDownBlockSpatioTemporal",
            "CrossAttnDownBlockSpatioTemporal",
            "DownBlockSpatioTemporal",
        ),
        up_block_types: Tuple[str] = (
            "UpBlockSpatioTemporal",
            "CrossAttnUpBlockSpatioTemporal",
            "CrossAttnUpBlockSpatioTemporal",
            "CrossAttnUpBlockSpatioTemporal",
        ),
        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
        addition_time_embed_dim: int = 256,
        projection_class_embeddings_input_dim: int = 768,
        layers_per_block: Union[int, Tuple[int]] = 2,
        cross_attention_dim: Union[int, Tuple[int]] = 1024,
        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
        num_attention_heads: Union[int, Tuple[int]] = (5, 10, 10, 20),
        num_frames: int = 25,
        conditioning_channels: int = 3,
        conditioning_embedding_out_channels : Optional[Tuple[int, ...]] = (16, 32, 96, 256),
    ):
        """Build the encoder half (input conv, time embeddings, down blocks, mid block)
        together with one zero-initialised 1x1 convolution per residual that `forward` emits.

        `out_channels` and `num_frames` are accepted but not read here, and `up_block_types`
        is only length-checked; no up blocks are created.

        Raises:
            ValueError: if the per-block arguments do not have one entry per down block.
        """
        super().__init__()
        self.sample_size = sample_size

        # Check inputs
        if len(down_block_types) != len(up_block_types):
            raise ValueError(
                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
            )

        if len(block_out_channels) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
            )

        # Tuples are validated as well as lists: the annotation advertises tuples, and a
        # tuple of the wrong length would otherwise be indexed per block without any check.
        if isinstance(cross_attention_dim, (list, tuple)) and len(cross_attention_dim) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
            )

        def _zero_conv(channels):
            # 1x1 projection whose parameters are all zeroed right after creation.
            return zero_module(nn.Conv2d(channels, channels, kernel_size=1))

        # input
        self.conv_in = nn.Conv2d(
            in_channels,
            block_out_channels[0],
            kernel_size=3,
            padding=1,
        )

        # time
        time_embed_dim = block_out_channels[0] * 4

        self.time_proj = Timesteps(block_out_channels[0], True, downscale_freq_shift=0)
        timestep_input_dim = block_out_channels[0]

        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)

        self.add_time_proj = Timesteps(addition_time_embed_dim, True, downscale_freq_shift=0)
        self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)

        self.down_blocks = nn.ModuleList([])
        self.controlnet_down_blocks = nn.ModuleList([])

        # Broadcast scalar settings to one entry per down block.
        if isinstance(num_attention_heads, int):
            num_attention_heads = (num_attention_heads,) * len(down_block_types)

        if isinstance(cross_attention_dim, int):
            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)

        if isinstance(layers_per_block, int):
            layers_per_block = [layers_per_block] * len(down_block_types)

        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)

        blocks_time_embed_dim = time_embed_dim
        self.point_adapter = PointAdapter()

        self.controlnet_cond_embedding = ControlNetConditioningEmbeddingSVD(
            conditioning_embedding_channels=block_out_channels[0],
            block_out_channels=conditioning_embedding_out_channels,
            conditioning_channels=conditioning_channels,
        )

        # down
        # The first zero conv pairs with the `conv_in` output, which `forward` puts at the
        # head of the residual tuple.
        output_channel = block_out_channels[0]
        self.controlnet_down_blocks.append(_zero_conv(output_channel))

        for i, down_block_type in enumerate(down_block_types):
            input_channel = output_channel
            output_channel = block_out_channels[i]
            is_final_block = i == len(block_out_channels) - 1

            down_block = get_down_block(
                down_block_type,
                num_layers=layers_per_block[i],
                transformer_layers_per_block=transformer_layers_per_block[i],
                in_channels=input_channel,
                out_channels=output_channel,
                temb_channels=blocks_time_embed_dim,
                add_downsample=not is_final_block,
                resnet_eps=1e-5,
                cross_attention_dim=cross_attention_dim[i],
                num_attention_heads=num_attention_heads[i],
                resnet_act_fn="silu",
            )
            self.down_blocks.append(down_block)

            # One zero conv per layer of the block ...
            for _ in range(layers_per_block[i]):
                self.controlnet_down_blocks.append(_zero_conv(output_channel))

            # ... plus one more for every block that is built with a downsampler.
            if not is_final_block:
                self.controlnet_down_blocks.append(_zero_conv(output_channel))

        # mid
        mid_block_channel = block_out_channels[-1]
        self.controlnet_mid_block = _zero_conv(mid_block_channel)

        self.mid_block = UNetMidBlockSpatioTemporal(
            block_out_channels[-1],
            temb_channels=blocks_time_embed_dim,
            transformer_layers_per_block=transformer_layers_per_block[-1],
            cross_attention_dim=cross_attention_dim[-1],
            num_attention_heads=num_attention_heads[-1],
        )

    @property
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: every attention processor used in the model, keyed by
            `"<module path>.processor"`.

        NOTE: a second `attn_processors` property is defined further down in this class body; being later, it is
        the one the class ends up with.
        """
        collected = {}

        def _gather(prefix: str, node: torch.nn.Module):
            # Record this module's processor first, then descend into its children in order.
            if hasattr(node, "get_processor"):
                collected[f"{prefix}.processor"] = node.get_processor(return_deprecated_lora=True)

            for child_name, child in node.named_children():
                _gather(f"{prefix}.{child_name}", child)

        for top_name, top_module in self.named_children():
            _gather(top_name, top_module)

        return collected

    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Install attention processor(s) on every module that exposes `set_processor`.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                Either a single processor instance applied to **all** attention layers, or a dict keyed by
                `"<module path>.processor"` holding one processor per layer. Entries are popped from the dict as
                they are consumed.

        Raises:
            ValueError: if a dict is given whose size differs from the number of attention layers.

        NOTE: a second `set_attn_processor` is defined further down in this class body; being later, it is the one
        the class ends up with.
        """
        count = len(self.attn_processors.keys())
        per_layer = isinstance(processor, dict)

        if per_layer and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def _apply(path: str, node: torch.nn.Module):
            if hasattr(node, "set_processor"):
                if per_layer:
                    node.set_processor(processor.pop(f"{path}.processor"))
                else:
                    node.set_processor(processor)

            for child_name, child in node.named_children():
                _apply(f"{path}.{child_name}", child)

        for top_name, top_module in self.named_children():
            _apply(top_name, top_module)

    def set_default_attn_processor(self):
        """
        Replace every attention processor with a plain `AttnProcessor`.

        Only allowed when all current processors belong to `CROSS_ATTENTION_PROCESSORS`; otherwise a `ValueError`
        is raised.

        NOTE: a second `set_default_attn_processor` is defined further down in this class body; being later, it is
        the one the class ends up with.
        """
        cross_only = all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values())
        if not cross_only:
            raise ValueError(
                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
            )

        default_processor = AttnProcessor()
        self.set_attn_processor(default_processor)

    def _set_gradient_checkpointing(self, module, value=False):
        """Set `module.gradient_checkpointing` to `value`; modules without that attribute are left untouched."""
        if not hasattr(module, "gradient_checkpointing"):
            return
        module.gradient_checkpointing = value

    # Copied from diffusers.models.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
        """
        Turn on [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers) for every submodule that
        exposes `set_chunk_feed_forward`.

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. A falsy value (including `None`) is replaced by 1.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).

        Raises:
            ValueError: if `dim` is neither 0 nor 1.
        """
        if dim not in (0, 1):
            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")

        # By default chunk size is 1
        if not chunk_size:
            chunk_size = 1

        def _configure(node: torch.nn.Module):
            # Configure the node itself before walking down into its children.
            if hasattr(node, "set_chunk_feed_forward"):
                node.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)

            for sub_module in node.children():
                _configure(sub_module)

        for top_module in self.children():
            _configure(top_module)
    
    def forward(
        self,
        sample: torch.FloatTensor,
        timestep: Union[torch.Tensor, float, int],
        encoder_hidden_states: torch.Tensor,
        added_time_ids: torch.Tensor,
        controlnet_cond: torch.FloatTensor = None,
        controlnet_flow: torch.FloatTensor = None,
        traj_flow: torch.FloatTensor = None,
        pose_image: torch.FloatTensor = None,
        pose_latents: torch.FloatTensor = None,
        dift_feat: torch.FloatTensor = None,
        # ref_point = None,
        point_list = None,
        ref_point_emb = None,
        image_only_indicator: Optional[torch.Tensor] = None,
        return_dict: bool = True,
        guess_mode: bool = False,
        conditioning_scale: float = 1.0,


    ) -> Union[ControlNetOutput, Tuple]:
        r"""
        Run the encoder half of the network and return the scaled ControlNet residuals.

        Args:
            sample (`torch.FloatTensor`):
                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.FloatTensor`):
                The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
            added_time_ids: (`torch.FloatTensor`):
                The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
                embeddings and added to the time embeddings.
            controlnet_cond (`torch.FloatTensor`, *optional*):
                Only its size is read, unpacked as four dimensions whose last two are taken as height and width,
                and only when `point_list` is given. Required in that case.
            controlnet_flow, traj_flow (`torch.FloatTensor`, *optional*):
                When both are given they are embedded by `controlnet_cond_embedding` and added to the `conv_in`
                output.
            point_list, ref_point_emb, pose_latents (*optional*):
                Forwarded to `point_adapter` when `point_list` is given; the adapter's per-block states are added
                after successive down blocks.
            pose_image, dift_feat, guess_mode:
                Accepted but not used by this method.
            image_only_indicator (`torch.Tensor`, *optional*):
                Ignored: it is always replaced by a zero tensor of shape `(batch, num_frames)`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a `ControlNetOutput` instead of a plain tuple.
            conditioning_scale (`float`, defaults to 1.0):
                Multiplier applied to every returned residual.

        Returns:
            `ControlNetOutput` or `tuple`:
                If `return_dict` is True, a `ControlNetOutput` holding `down_block_res_samples` and
                `mid_block_res_sample`. Otherwise the tuple
                `(down_block_res_samples, mid_block_res_sample, loss_mask)`. Note that `loss_mask` is only
                available through the tuple form.

        Raises:
            ValueError: if `point_list` is given without `controlnet_cond`.
        """
        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
            if isinstance(timestep, float):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        batch_size, num_frames = sample.shape[:2]
        timesteps = timesteps.expand(batch_size)

        t_emb = self.time_proj(timesteps)

        # `Timesteps` does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=sample.dtype)

        emb = self.time_embedding(t_emb)

        time_embeds = self.add_time_proj(added_time_ids.flatten())
        time_embeds = time_embeds.reshape((batch_size, -1))
        time_embeds = time_embeds.to(emb.dtype)
        aug_emb = self.add_embedding(time_embeds)
        emb = emb + aug_emb

        # Flatten the batch and frames dimensions
        # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width]
        sample = sample.flatten(0, 1)
        # Repeat the embeddings num_video_frames times
        # emb: [batch, channels] -> [batch * frames, channels]
        emb = emb.repeat_interleave(num_frames, dim=0)
        # encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels]
        encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0)

        # 2. pre-process
        sample = self.conv_in(sample)

        # flow / trajectory conditioning
        if controlnet_flow is not None and traj_flow is not None:
            cond_flow = self.controlnet_cond_embedding(controlnet_flow, traj_flow)
            sample = sample + cond_flow

        # point adapter features
        adapter_state = []
        loss_mask = None
        if point_list is not None:
            # `controlnet_cond` defaults to None and is only needed here, for its spatial size.
            if controlnet_cond is None:
                raise ValueError(
                    "`controlnet_cond` must be provided when `point_list` is given: "
                    "its height and width are passed to the point adapter."
                )
            _, _, h, w = controlnet_cond.size()
            adapter_state, loss_mask = self.point_adapter(point_list, (w,h), ref_point_emb, pose_latents, loss_type="local")
            if not self.training:
                # Outside training every adapter state is duplicated along dim 0
                # (presumably to match a batch doubled for classifier-free guidance; confirm with the caller).
                for k, v in enumerate(adapter_state):
                    adapter_state[k] = torch.cat([v] * 2, dim=0)

        # The caller-supplied indicator is deliberately overwritten with zeros.
        image_only_indicator = torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device)

        # 3. down
        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    image_only_indicator=image_only_indicator,
                )
            else:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    image_only_indicator=image_only_indicator,
                )
            # Adapter states are consumed front to back, one per down block, until exhausted.
            if len(adapter_state) > 0:
                additional_residuals = adapter_state.pop(0)
                sample = sample + additional_residuals.flatten(0, 1)

            down_block_res_samples += res_samples

        # 4. mid
        sample = self.mid_block(
            hidden_states=sample,
            temb=emb,
            encoder_hidden_states=encoder_hidden_states,
            image_only_indicator=image_only_indicator,
        )

        # 5. project every residual through its zero-initialised 1x1 conv
        controlnet_down_block_res_samples = ()

        for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
            down_block_res_sample = controlnet_block(down_block_res_sample)
            controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)

        down_block_res_samples = controlnet_down_block_res_samples

        mid_block_res_sample = self.controlnet_mid_block(sample)

        # 6. scaling
        down_block_res_samples = [residual * conditioning_scale for residual in down_block_res_samples]
        mid_block_res_sample = mid_block_res_sample * conditioning_scale

        if not return_dict:
            return (down_block_res_samples, mid_block_res_sample, loss_mask)

        return ControlNetOutput(
            down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
        )
    

    @classmethod
    def from_unet(
        cls,
        unet: UNetSpatioTemporalConditionModel,
        controlnet_conditioning_channel_order: str = "rgb",
        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
        load_weights_from_unet: bool = True,
        conditioning_channels: int = 3,
    ):
        r"""
        Instantiate a [`ControlNetSVDModel`] from a [`UNetSpatioTemporalConditionModel`].

        Parameters:
            unet (`UNetSpatioTemporalConditionModel`):
                The UNet whose configuration is reused and, optionally, whose weights are copied.
            controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
                Accepted for interface compatibility; this method does not use it.
            conditioning_embedding_out_channels (`Tuple[int, ...]`, *optional*):
                Passed through to the constructor.
            load_weights_from_unet (`bool`, defaults to `True`):
                When True, copy the state of `conv_in`, `time_proj`, `time_embedding`, `down_blocks` and
                `mid_block` from `unet`.
            conditioning_channels (`int`, defaults to 3):
                Passed through to the constructor.

        Returns:
            The newly built controlnet.
        """
        # Fall back to a single transformer layer per block when the UNet config lacks the entry.
        transformer_layers_per_block = (
            unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1
        )

        controlnet = cls(
            in_channels=unet.config.in_channels,
            down_block_types=unet.config.down_block_types,
            block_out_channels=unet.config.block_out_channels,
            addition_time_embed_dim=unet.config.addition_time_embed_dim,
            transformer_layers_per_block=transformer_layers_per_block,
            cross_attention_dim=unet.config.cross_attention_dim,
            num_attention_heads=unet.config.num_attention_heads,
            num_frames=unet.config.num_frames,
            sample_size=unet.config.sample_size,  # Added based on the dict
            layers_per_block=unet.config.layers_per_block,
            projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
            conditioning_channels=conditioning_channels,
            conditioning_embedding_out_channels=conditioning_embedding_out_channels,
        )
        # The controlnet RGB channel order is ignored; by default it makes no difference.

        if load_weights_from_unet:
            controlnet.conv_in.load_state_dict(unet.conv_in.state_dict())
            controlnet.time_proj.load_state_dict(unet.time_proj.state_dict())
            controlnet.time_embedding.load_state_dict(unet.time_embedding.state_dict())

            # NOTE(review): `add_embedding` is not copied and keeps its fresh initialisation.
            # Confirm whether that is intended.

            controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict())
            controlnet.mid_block.load_state_dict(unet.mid_block.state_dict())

        return controlnet

    @property
    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: every attention processor used in the model, keyed by
            `"<module path>.processor"`, in depth-first (parent before children) order.
        """
        found = {}

        # Explicit stack instead of recursion; children are pushed reversed so that they pop in
        # their registration order.
        pending = list(self.named_children())
        pending.reverse()
        while pending:
            path, node = pending.pop()
            if hasattr(node, "get_processor"):
                found[f"{path}.processor"] = node.get_processor(return_deprecated_lora=True)

            nested = [(f"{path}.{sub_name}", child) for sub_name, child in node.named_children()]
            pending.extend(reversed(nested))

        return found

    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
    def set_attn_processor(
        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
    ):
        r"""
        Install attention processor(s) on every module that exposes `set_processor`.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                Either a single processor instance applied to **all** attention layers, or a dict keyed by
                `"<module path>.processor"` holding one processor per layer. Entries are popped from the dict as
                they are consumed.
            _remove_lora (`bool`, defaults to `False`):
                Forwarded unchanged to each module's `set_processor`.

        Raises:
            ValueError: if a dict is given whose size differs from the number of attention layers.
        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        # Depth-first walk with an explicit stack: a module is configured before its children are listed.
        pending = list(self.named_children())
        pending.reverse()
        while pending:
            path, node = pending.pop()
            if hasattr(node, "set_processor"):
                if isinstance(processor, dict):
                    chosen = processor.pop(f"{path}.processor")
                else:
                    chosen = processor
                node.set_processor(chosen, _remove_lora=_remove_lora)

            nested = [(f"{path}.{sub_name}", child) for sub_name, child in node.named_children()]
            pending.extend(reversed(nested))

    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
    def set_default_attn_processor(self):
        """
        Replace every attention processor with the default one for its family.

        `AttnAddedKVProcessor` is used when all current processors belong to `ADDED_KV_ATTENTION_PROCESSORS`,
        `AttnProcessor` when they all belong to `CROSS_ATTENTION_PROCESSORS`; any other mix raises `ValueError`.
        The replacement is installed with `_remove_lora=True`.
        """
        added_kv_only = all(
            proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()
        )
        if added_kv_only:
            default_processor = AttnAddedKVProcessor()
        else:
            cross_only = all(
                proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()
            )
            if not cross_only:
                raise ValueError(
                    f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
                )
            default_processor = AttnProcessor()

        self.set_attn_processor(default_processor, _remove_lora=True)

    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice
    def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None:
        r"""
        Enable sliced attention computation.

        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
        several steps. This is useful for saving some memory in exchange for a small decrease in speed.

        Args:
            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
                must be a multiple of `slice_size`.

        Raises:
            ValueError: if a list of the wrong length is given, or a slice size exceeds its layer's head dim.
        """

        def _iter_sliceable():
            # Depth-first, parent-before-children walk over all descendants, yielding the ones that
            # expose `set_attention_slice`. A node's children are listed only after it was yielded.
            pending = list(self.children())
            pending.reverse()
            while pending:
                node = pending.pop()
                if hasattr(node, "set_attention_slice"):
                    yield node
                pending.extend(reversed(list(node.children())))

        # retrieve the head dim of every sliceable attention layer
        sliceable_head_dims = [node.sliceable_head_dim for node in _iter_sliceable()]
        num_sliceable_layers = len(sliceable_head_dims)

        if slice_size == "auto":
            # half the attention head size is usually a good trade-off between
            # speed and memory
            slice_size = [dim // 2 for dim in sliceable_head_dims]
        elif slice_size == "max":
            # make smallest slice possible
            slice_size = num_sliceable_layers * [1]

        # A scalar applies to every layer.
        if not isinstance(slice_size, list):
            slice_size = num_sliceable_layers * [slice_size]

        if len(slice_size) != len(sliceable_head_dims):
            raise ValueError(
                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
            )

        for size, dim in zip(slice_size, sliceable_head_dims):
            if size is not None and size > dim:
                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")

        # Walk the modules again in the same order and hand each sliceable one its size.
        # The sizes are reversed so that `pop()` serves them front to back.
        remaining = list(reversed(slice_size))
        for node in _iter_sliceable():
            node.set_attention_slice(remaining.pop())

 #   def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
 #       if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)):
  #          module.gradient_checkpointing = value

    
def zero_module(module):
    """Zero every parameter of ``module`` in place and return the same module object."""
    params = module.parameters()
    for weight in params:
        nn.init.zeros_(weight)
    return module


================================================
FILE: mimicmotion/modules/point_adapter.py
================================================
import random
from typing import List
from einops import rearrange, repeat

import torch
import torch.nn as nn
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.models.modeling_utils import ModelMixin
import pdb
import time

class MLP(nn.Module):
    """Two-layer perceptron: ``Linear -> SiLU -> Linear``.

    Args:
        in_dim: size of the last dimension of the input.
        out_dim: size of the last dimension of the output.
        mid_dim: width of the hidden layer.
    """

    def __init__(self, in_dim, out_dim, mid_dim=128):
        super().__init__()
        # Keep the layers in this exact order: the Sequential indices (0, 1, 2)
        # are part of the state-dict key names.
        stages = [
            nn.Linear(in_dim, mid_dim, bias=True),
            nn.SiLU(inplace=False),
            nn.Linear(mid_dim, out_dim, bias=True),
        ]
        self.mlp = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the stack to ``x`` (acts on the last dimension)."""
        projected = self.mlp(x)
        return projected

def vectorized_bilinear_interpolation(level_adapter_state, coords, frame_idx, interpolated_values):
    """Splat one value vector per point onto its four surrounding grid cells.

    For every row ``(x, y)`` of ``coords`` the matching row of
    ``interpolated_values`` is added, scaled by bilinear weights, to the four
    neighbouring cells of ``level_adapter_state[frame_idx]``. Cell indices are
    clamped to the grid, so contributions of out-of-range points land on the
    border cells.

    Args:
        level_adapter_state: 4-D tensor indexed as ``[frame, channel, y, x]``;
            modified in place.
        coords: 2-D tensor whose column 0 is x and column 1 is y, in grid units.
        frame_idx: index into the first dimension of ``level_adapter_state``.
        interpolated_values: one value (vector) per row of ``coords``.

    Returns:
        The same ``level_adapter_state`` object that was passed in.
    """
    max_col = level_adapter_state.shape[3] - 1
    max_row = level_adapter_state.shape[2] - 1

    xs, ys = coords[:, 0], coords[:, 1]

    # Neighbour indices are derived from the unclamped floor, then clamped.
    col_floor = xs.floor().long()
    row_floor = ys.floor().long()
    left = torch.clamp(col_floor, 0, max_col)
    top = torch.clamp(row_floor, 0, max_row)
    right = torch.clamp(col_floor + 1, 0, max_col)
    bottom = torch.clamp(row_floor + 1, 0, max_row)

    # Fractions are measured from the clamped lower corner.
    frac_x = xs - left.float()
    frac_y = ys - top.float()

    # (row index, column index, weight) for each corner, in accumulation order.
    corners = (
        (top, left, (1 - frac_x) * (1 - frac_y)),
        (top, right, frac_x * (1 - frac_y)),
        (bottom, left, (1 - frac_x) * frac_y),
        (bottom, right, frac_x * frac_y),
    )

    for point, value in zip(range(xs.shape[0]), interpolated_values):
        for rows, cols, weights in corners:
            level_adapter_state[frame_idx, :, rows[point], cols[point]] += value * weights[point]

    return level_adapter_state

def bilinear_interpolation(level_adapter_state, x, y, frame_idx, interpolated_value):
    """Splat a single value onto the four grid cells surrounding ``(x, y)``.

    ``interpolated_value`` is added, scaled by bilinear weights, to the four
    neighbouring cells of ``level_adapter_state[frame_idx]``. Cell indices are
    clamped to the grid, so a point outside the grid contributes to the
    nearest border cell(s).

    Args:
        level_adapter_state: 4-D tensor indexed as ``[frame, channel, y, x]``;
            modified in place.
        x: horizontal position in grid units (a scalar).
        y: vertical position in grid units (a scalar).
        frame_idx: index into the first dimension of ``level_adapter_state``.
        interpolated_value: value added to every channel slot of the four cells.

    Returns:
        The same ``level_adapter_state`` object that was passed in.
    """
    import math

    max_col = level_adapter_state.shape[3] - 1
    max_row = level_adapter_state.shape[2] - 1

    # Use floor rather than int(): int() truncates toward zero, which for a
    # coordinate in (-1, 0) yields a negative fraction and therefore a weight
    # above 1 on one cell and a negative weight on its neighbour.
    x1 = int(math.floor(x))
    y1 = int(math.floor(y))
    x2 = x1 + 1
    y2 = y1 + 1
    x_frac = x - x1
    y_frac = y - y1

    # Clamp after the fractions are taken so the weights always stay in [0, 1].
    x1, x2 = max(min(x1, max_col), 0), max(min(x2, max_col), 0)
    y1, y2 = max(min(y1, max_row), 0), max(min(y2, max_row), 0)

    w11 = (1 - x_frac) * (1 - y_frac)
    w21 = x_frac * (1 - y_frac)
    w12 = (1 - x_frac) * y_frac
    w22 = x_frac * y_frac

    level_adapter_state[frame_idx, :, y1, x1] += interpolated_value * w11
    level_adapter_state[frame_idx, :, y1, x2] += interpolated_value * w21
    level_adapter_state[frame_idx, :, y2, x1] += interpolated_value * w12
    level_adapter_state[frame_idx, :, y2, x2] += interpolated_value * w22

    return level_adapter_state

class PointAdapter(nn.Module):
    """Scatter per-keypoint embeddings onto a grid and project them to several feature levels.

    Keypoint embeddings are accumulated (and averaged where several points
    share a cell) on a grid whose size is the input size divided by
    ``downsample_rate[0]``. One ``MLP`` per entry of ``channels`` then maps
    that grid to the level's channel count, and the result is resized to the
    level's own resolution.
    """

    def __init__(
        self,
        embedding_channels=1280,
        channels=(320, 640, 1280, 1280),
        downsample_rate=(16, 32, 64, 64),
        mid_dim=128
    ):
        """
        Args:
            embedding_channels: size of each incoming point embedding.
            channels: output channel count of each level's MLP.
            downsample_rate: per-level divisor applied to the input width and
                height; entry 0 also defines the grid the points are scattered on.
            mid_dim: hidden width of every MLP.
        """
        super().__init__()

        # Keep private copies so an instance never aliases the caller's
        # sequence or a default shared between instances.
        self.channels = list(channels)
        self.downsample_rate = list(downsample_rate)
        self.embedding_channels = embedding_channels
        # Half-width, in grid cells, of the window marked around each keypoint
        # by generate_loss_mask.
        self.radius = 4

        self.model_list = nn.ModuleList(
            [MLP(embedding_channels, ch, mid_dim) for ch in self.channels]
        )

    def _frame_points(self, point_tracker, frame_idx, batch_idx, batch_size):
        """Return ``(keypoints, subsets)`` for one frame of one batch item.

        In training mode the tracker entries carry a batch dimension; in eval
        mode they do not, and only ``batch_size == 1`` is accepted.
        """
        frame = point_tracker[frame_idx]
        if self.training:
            return frame["candidate"][batch_idx], frame["subset"][batch_idx][0]
        keypoints, subsets = frame["candidate"], frame["subset"][0]
        assert batch_size == 1
        return keypoints, subsets

    def generate_loss_mask(self, batch_size, point_tracker, num_frames, h, w, loss_type):
        """Build a mask of shape ``(batch_size, num_frames, 4, h // r, w // r)``, ``r = downsample_rate[0]``.

        With ``loss_type == 'global'`` the mask is all ones. Otherwise it is
        zero except for a window of ``radius`` cells around every keypoint whose
        subset entry is not ``-1``. The window is the half-open slice
        ``[c - radius, c + radius)`` after clamping to the grid.

        The tensor is created without an explicit device or dtype.
        # NOTE(review): the hard-coded 4 is presumably the latent channel count - confirm.
        """
        downsample_rate = self.downsample_rate[0]
        level_w, level_h = w // downsample_rate, h // downsample_rate
        if loss_type == 'global':
            return torch.ones((batch_size, num_frames, 4, level_h, level_w))

        loss_mask = torch.zeros((batch_size, num_frames, 4, level_h, level_w))
        for batch_idx in range(batch_size):
            for frame_idx in range(num_frames):
                keypoints, subsets = self._frame_points(point_tracker, frame_idx, batch_idx, batch_size)
                for keypoint, subset in zip(keypoints, subsets):
                    if subset != -1:
                        # Keypoints are scaled by the grid size, i.e. they are
                        # presumably normalised to [0, 1] - confirm with the caller.
                        px, py = keypoint[0] * level_w, keypoint[1] * level_h

                        x1 = int(px) - self.radius
                        y1 = int(py) - self.radius
                        x2 = int(px) + self.radius
                        y2 = int(py) + self.radius

                        x1, x2 = max(min(x1, level_w - 1), 0), max(min(x2, level_w - 1), 0)
                        y1, y2 = max(min(y1, level_h - 1), 0), max(min(y2, level_h - 1), 0)
                        loss_mask[batch_idx][frame_idx][:, y1:y2, x1:x2] = 1.0

        return loss_mask

    def forward(self, point_tracker, size, point_embedding, pose_latents, index_list=None, drop_rate=0.0, loss_type='global') -> tuple:
        """Turn tracked keypoints and their embeddings into per-level feature maps.

        Args:
            point_tracker: sequence with one entry per frame; each entry is
                indexed with the keys ``"candidate"`` and ``"subset"``.
            size: ``(w, h)`` of the input.
            point_embedding: 3-D tensor indexed as ``[batch, point, channel]``.
            pose_latents: accepted for interface compatibility; not used here.
            index_list: accepted for interface compatibility; not used here.
            drop_rate: a point whose embedding mean is exactly zero is kept only
                when ``random.random() > drop_rate``.
            loss_type: forwarded to ``generate_loss_mask``.

        Returns:
            ``(adapter_state, loss_mask)`` where ``adapter_state`` is a list with
            one ``(batch, frames, channels[i], h // rate[i], w // rate[i])``
            tensor per level and ``loss_mask`` comes from ``generate_loss_mask``.
        """
        w, h = size
        num_frames = len(point_tracker)
        batch_size, _, _ = point_embedding.shape
        device, dtype = point_embedding.device, point_embedding.dtype

        loss_mask = self.generate_loss_mask(batch_size, point_tracker, num_frames, h, w, loss_type)

        downsample_rate = self.downsample_rate[0]
        level_w, level_h = w // downsample_rate, h // downsample_rate
        grid = (batch_size, num_frames, level_h, level_w)
        level_adapter_state = torch.zeros(
            (batch_size, num_frames, self.embedding_channels, level_h, level_w), device=device, dtype=dtype
        )
        # level_mask marks occupied cells; level_count is the number of
        # embeddings summed into a cell (1 for empty cells so the division is safe).
        level_mask = torch.zeros(grid, device=device, dtype=dtype)
        level_count = torch.ones(grid, device=device, dtype=dtype)
        for batch_idx in range(batch_size):
            for frame_idx in range(num_frames):
                keypoints, subsets = self._frame_points(point_tracker, frame_idx, batch_idx, batch_size)
                for point_idx, (keypoint, subset) in enumerate(zip(keypoints, subsets)):
                    if keypoint.min() < 0:
                        continue
                    px, py = keypoint[0] * level_w, keypoint[1] * level_h
                    px, py = max(min(int(px), level_w - 1), 0), max(min(int(py), level_h - 1), 0)
                    if subset != -1:
                        # random.random() is only consulted for all-zero-mean embeddings.
                        if point_embedding[batch_idx, point_idx].mean() != 0 or random.random() > drop_rate:
                            if level_mask[batch_idx, frame_idx, py, px] != 0:
                                level_count[batch_idx, frame_idx, py, px] += 1
                            level_adapter_state[batch_idx, frame_idx, :, py, px] += point_embedding[batch_idx, point_idx]
                            level_mask[batch_idx, frame_idx, py, px] = 1.0

        adapter_state = []
        # Average the embeddings that share a cell, then move channels last for the MLPs.
        level_adapter_state = level_adapter_state / level_count.unsqueeze(2)
        level_adapter_state = rearrange(level_adapter_state, "b f c h w-> b f h w c")
        for level_idx, module in enumerate(self.model_list):
            downsample_rate = self.downsample_rate[level_idx]
            level_w, level_h = w // downsample_rate, h // downsample_rate

            # Zero empty cells: the MLP biases would otherwise make them non-zero.
            point_feat = module(level_adapter_state)
            point_feat = point_feat * level_mask.unsqueeze(-1)

            point_feat = rearrange(point_feat, "b f h w c-> (b f) c h w")
            point_feat = nn.Upsample(size=(level_h, level_w), mode='bilinear')(point_feat)

            # Mask again after resizing, using a nearest-neighbour resized mask.
            temp_mask = rearrange(level_mask, "b f h w-> (b f) h w")
            temp_mask = nn.Upsample(size=(level_h, level_w), mode='nearest')(temp_mask.unsqueeze(1))
            point_feat = point_feat * temp_mask

            point_feat = rearrange(point_feat, "(b f) c h w-> b f c h w", b=batch_size)
            adapter_state.append(point_feat)

        return adapter_state, loss_mask


================================================
FILE: mimicmotion/modules/pose_net.py
================================================
from pathlib import Path

import einops
import numpy as np
import torch
import torch.nn as nn
import torch.nn.init as init


class PoseNet(nn.Module):
    """A tiny conv network that turns a pose image sequence into a conditioning feature map.

    The stack downsamples the 3-channel input by a factor of 8 (three stride-2
    convolutions) and projects it to ``noise_latent_channels`` channels. The
    final projection is zero-initialised, so a freshly constructed network
    outputs all zeros.
    """

    def __init__(self, noise_latent_channels=320, *args, **kwargs):
        """
        Args:
            noise_latent_channels: number of output channels.
            *args, **kwargs: forwarded to ``nn.Module.__init__``.
        """
        super().__init__(*args, **kwargs)
        # multiple convolution layers
        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=4, stride=2, padding=1),
            nn.SiLU(),

            nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=2, padding=1),
            nn.SiLU(),

            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2, padding=1),
            nn.SiLU(),

            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1),
            nn.SiLU(),
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.SiLU()
        )

        # Final projection layer
        self.final_proj = nn.Conv2d(in_channels=128, out_channels=noise_latent_channels, kernel_size=1)

        # Initialize layers
        self._initialize_weights()

        # Learnable global gain applied to the output, initialised to 2.
        self.scale = nn.Parameter(torch.ones(1) * 2)

    def _initialize_weights(self):
        """Apply He initialisation to the conv stack and zero the final projection.

        Conv weights are drawn from ``N(0, 2 / fan_in)`` and all biases are set
        to zero; ``final_proj`` is zeroed entirely.
        """
        for m in self.conv_layers:
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
                init.normal_(m.weight, mean=0.0, std=np.sqrt(2. / n))
                if m.bias is not None:
                    init.zeros_(m.bias)
        init.zeros_(self.final_proj.weight)
        if self.final_proj.bias is not None:
            init.zeros_(self.final_proj.bias)

    def forward(self, x):
        """Encode ``x``; a 5-D ``(b, f, c, h, w)`` input is flattened to ``(b * f, c, h, w)`` first.

        Returns:
            Tensor of shape ``(N, noise_latent_channels, h / 8, w / 8)`` scaled by ``self.scale``.
        """
        if x.ndim == 5:
            x = einops.rearrange(x, "b f c h w -> (b f) c h w")
        x = self.conv_layers(x)
        x = self.final_proj(x)

        return x * self.scale

    @classmethod
    def from_pretrained(cls, pretrained_model_path):
        """Build a 320-channel network and load its weights from ``pretrained_model_path``.

        Args:
            pretrained_model_path: path of a checkpoint holding the state dict.

        Returns:
            An instance of ``cls`` with the weights loaded (strict key matching).

        Raises:
            FileNotFoundError: if ``pretrained_model_path`` does not exist.
        """
        if not Path(pretrained_model_path).exists():
            # Fail here with a clear message instead of continuing into torch.load.
            raise FileNotFoundError(f"There is no model file in {pretrained_model_path}")

        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from trusted sources.
        state_dict = torch.load(pretrained_model_path, map_location="cpu")
        # Use cls so subclasses get an instance of their own type.
        model = cls(noise_latent_channels=320)

        model.load_state_dict(state_dict, strict=True)
        # Report success only once the weights are actually in place.
        print(f"loaded PoseNet's pretrained weights from {pretrained_model_path}.")

        return model


================================================
FILE: mimicmotion/modules/unet.py
================================================
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union

import torch
import torch.nn as nn
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.loaders import UNet2DConditionLoadersMixin
from diffusers.models.attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor
from diffusers.models.embeddings import TimestepEmbedding, Timesteps
from diffusers.models.modeling_utils import ModelMixin
from diffusers.utils import BaseOutput, logging

from diffusers.models.unets.unet_3d_blocks import get_down_block, get_up_block, UNetMidBlockSpatioTemporal

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@dataclass
class UNetSpatioTemporalConditionOutput(BaseOutput):
    """
    The output of [`UNetSpatioTemporalConditionModel`].

    Args:
        sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            The output of the model's last layer, reshaped so that batch and frames are separate leading
            dimensions again.
    """

    # Defaults to None so the dataclass can be created without arguments.
    sample: torch.FloatTensor = None


class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
    r"""
    A conditional spatio-temporal UNet that takes noisy video frames, a conditional state and a timestep,
    and returns a sample-shaped output. Optionally accepts pose latents (added after the input convolution)
    and additional residuals for the down and mid blocks.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample.
        in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal",
            "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`):
            The tuple of downsample blocks to use.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal",
            "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`):
            The tuple of upsample blocks to use.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        addition_time_embed_dim: (`int`, defaults to 256):
            Dimension used to encode the additional time ids.
        projection_class_embeddings_input_dim (`int`, defaults to 768):
            The dimension of the projection of encoded `added_time_ids`.
        layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 2): The number of layers per block.
        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1024):
            The dimension of the cross attention features.
        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
            [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
            [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
            [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
        num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
            The number of attention heads.
        num_frames (`int`, defaults to 25):
            Accepted by the constructor but not referenced in its body.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
            self,
            sample_size: Optional[int] = None,
            in_channels: int = 8,
            out_channels: int = 4,
            down_block_types: Tuple[str] = (
                    "CrossAttnDownBlockSpatioTemporal",
                    "CrossAttnDownBlockSpatioTemporal",
                    "CrossAttnDownBlockSpatioTemporal",
                    "DownBlockSpatioTemporal",
            ),
            up_block_types: Tuple[str] = (
                    "UpBlockSpatioTemporal",
                    "CrossAttnUpBlockSpatioTemporal",
                    "CrossAttnUpBlockSpatioTemporal",
                    "CrossAttnUpBlockSpatioTemporal",
            ),
            block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
            addition_time_embed_dim: int = 256,
            projection_class_embeddings_input_dim: int = 768,
            layers_per_block: Union[int, Tuple[int]] = 2,
            cross_attention_dim: Union[int, Tuple[int]] = 1024,
            transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
            num_attention_heads: Union[int, Tuple[int]] = (5, 10, 10, 20),
            num_frames: int = 25,
    ):
        super().__init__()

        self.sample_size = sample_size

        # Check inputs: every per-block sequence must have one entry per down block.
        if len(down_block_types) != len(up_block_types):
            raise ValueError(
                f"Must provide the same number of `down_block_types` as `up_block_types`. " \
                f"`down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
            )

        if len(block_out_channels) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `block_out_channels` as `down_block_types`. " \
                f"`block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `num_attention_heads` as `down_block_types`. " \
                f"`num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
            )

        # NOTE(review): only a `list` is length-checked here; a tuple of the wrong length passes this check.
        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. " \
                f"`cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `layers_per_block` as `down_block_types`. " \
                f"`layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
            )

        # input
        self.conv_in = nn.Conv2d(
            in_channels,
            block_out_channels[0],
            kernel_size=3,
            padding=1,
        )

        # time
        time_embed_dim = block_out_channels[0] * 4

        self.time_proj = Timesteps(block_out_channels[0], True, downscale_freq_shift=0)
        timestep_input_dim = block_out_channels[0]

        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)

        # Embedding path for the additional time ids (`added_time_ids` in forward).
        self.add_time_proj = Timesteps(addition_time_embed_dim, True, downscale_freq_shift=0)
        self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)

        self.down_blocks = nn.ModuleList([])
        self.up_blocks = nn.ModuleList([])

        # Broadcast scalar settings to one value per block.
        if isinstance(num_attention_heads, int):
            num_attention_heads = (num_attention_heads,) * len(down_block_types)

        if isinstance(cross_attention_dim, int):
            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)

        if isinstance(layers_per_block, int):
            layers_per_block = [layers_per_block] * len(down_block_types)

        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)

        blocks_time_embed_dim = time_embed_dim

        # down
        output_channel = block_out_channels[0]
        for i, down_block_type in enumerate(down_block_types):
            input_channel = output_channel
            output_channel = block_out_channels[i]
            is_final_block = i == len(block_out_channels) - 1

            # Every down block except the last one ends with a downsampler.
            down_block = get_down_block(
                down_block_type,
                num_layers=layers_per_block[i],
                transformer_layers_per_block=transformer_layers_per_block[i],
                in_channels=input_channel,
                out_channels=output_channel,
                temb_channels=blocks_time_embed_dim,
                add_downsample=not is_final_block,
                resnet_eps=1e-5,
                cross_attention_dim=cross_attention_dim[i],
                num_attention_heads=num_attention_heads[i],
                resnet_act_fn="silu",
            )
            self.down_blocks.append(down_block)

        # mid
        self.mid_block = UNetMidBlockSpatioTemporal(
            block_out_channels[-1],
            temb_channels=blocks_time_embed_dim,
            transformer_layers_per_block=transformer_layers_per_block[-1],
            cross_attention_dim=cross_attention_dim[-1],
            num_attention_heads=num_attention_heads[-1],
        )

        # count how many layers upsample the images
        self.num_upsamplers = 0

        # up: the per-block settings are walked in reverse order of the down path
        reversed_block_out_channels = list(reversed(block_out_channels))
        reversed_num_attention_heads = list(reversed(num_attention_heads))
        reversed_layers_per_block = list(reversed(layers_per_block))
        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
        reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))

        output_channel = reversed_block_out_channels[0]
        for i, up_block_type in enumerate(up_block_types):
            is_final_block = i == len(block_out_channels) - 1

            prev_output_channel = output_channel
            output_channel = reversed_block_out_channels[i]
            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]

            # add upsample block for all BUT final layer
            if not is_final_block:
                add_upsample = True
                self.num_upsamplers += 1
            else:
                add_upsample = False

            # Up blocks get one more layer than the matching down block.
            up_block = get_up_block(
                up_block_type,
                num_layers=reversed_layers_per_block[i] + 1,
                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
                in_channels=input_channel,
                out_channels=output_channel,
                prev_output_channel=prev_output_channel,
                temb_channels=blocks_time_embed_dim,
                add_upsample=add_upsample,
                resnet_eps=1e-5,
                resolution_idx=i,
                cross_attention_dim=reversed_cross_attention_dim[i],
                num_attention_heads=reversed_num_attention_heads[i],
                resnet_act_fn="silu",
            )
            self.up_blocks.append(up_block)
            prev_output_channel = output_channel

        # out
        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-5)
        self.conv_act = nn.SiLU()

        self.conv_out = nn.Conv2d(
            block_out_channels[0],
            out_channels,
            kernel_size=3,
            padding=1,
        )

    @property
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model,
            keyed by the dotted module path followed by `.processor`.
        """
        # set recursively
        processors = {}

        def fn_recursive_add_processors(
                name: str,
                module: torch.nn.Module,
                processors: Dict[str, AttentionProcessor],
        ):
            # Any module exposing `get_processor` contributes one entry.
            # NOTE(review): `return_deprecated_lora` must be supported by the installed diffusers version - confirm.
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        Raises:
            ValueError: if `processor` is a dict whose size differs from the number of attention layers.

        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    # Entries are popped, so the caller's dict is consumed as processors are assigned.
                    module.set_processor(processor.pop(f"{name}.processor"))

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    def set_default_attn_processor(self):
        """
        Disables custom attention processors and sets the default attention implementation.

        Raises:
            ValueError: if any current processor is not one of `CROSS_ATTENTION_PROCESSORS`.
        """
        if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
            processor = AttnProcessor()
        else:
            raise ValueError(
                f"Cannot call `set_default_attn_processor` " \
                f"when attention processors are of type {next(iter(self.attn_processors.values()))}"
            )

        self.set_attn_processor(processor)

    def _set_gradient_checkpointing(self, module, value=False):
        # Toggle the flag on any module that declares a `gradient_checkpointing` attribute.
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value

    # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
        """
        Sets the attention processor to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).

        Raises:
            ValueError: if `dim` is neither 0 nor 1.
        """
        if dim not in [0, 1]:
            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")

        # By default chunk size is 1
        chunk_size = chunk_size or 1

        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
            if hasattr(module, "set_chunk_feed_forward"):
                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)

            for child in module.children():
                fn_recursive_feed_forward(child, chunk_size, dim)

        for module in self.children():
            fn_recursive_feed_forward(module, chunk_size, dim)

    def forward(
            self,
            sample: torch.FloatTensor,
            timestep: Union[torch.Tensor, float, int],
            encoder_hidden_states: torch.Tensor,
            added_time_ids: torch.Tensor,
            pose_latents: torch.Tensor = None,
            down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
            mid_block_additional_residual: Optional[torch.Tensor] = None,
            image_only_indicator: bool = False,
            return_dict: bool = True,
    ) -> Union[UNetSpatioTemporalConditionOutput, Tuple]:
        r"""
        The [`UNetSpatioTemporalConditionModel`] forward method.

        Args:
            sample (`torch.FloatTensor`):
                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.FloatTensor`):
                The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
            added_time_ids: (`torch.FloatTensor`):
                The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
                embeddings and added to the time embeddings.
            pose_latents: (`torch.FloatTensor`, *optional*):
                The additional latents for pose sequences. When given, they are added to the output of the input
                convolution, so they must be broadcastable to that tensor.
            down_block_additional_residuals (`Tuple[torch.Tensor]`, *optional*):
                Residuals added to the skip-connection samples collected from the down blocks.
            mid_block_additional_residual (`torch.Tensor`, *optional*):
                Residual added to the output of the mid block.
            image_only_indicator (`bool`, *optional*, defaults to `False`):
                Whether or not training with all images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unet_spatio_temporal.UNetSpatioTemporalConditionOutput`]
                instead of a plain tuple.
        Returns:
            [`~models.unet_spatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`:
                If `return_dict` is True,
                an [`~models.unet_spatio_temporal.UNetSpatioTemporalConditionOutput`] is returned,
                otherwise a `tuple` is returned where the first element is the sample tensor.
        """
        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
            if isinstance(timestep, float):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        batch_size, num_frames = sample.shape[:2]
        timesteps = timesteps.expand(batch_size)

        t_emb = self.time_proj(timesteps)

        # `Timesteps` does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=sample.dtype)

        emb = self.time_embedding(t_emb)

        # Embed each additional time id, concatenate per batch item, and add to the timestep embedding.
        time_embeds = self.add_time_proj(added_time_ids.flatten())
        time_embeds = time_embeds.reshape((batch_size, -1))
        time_embeds = time_embeds.to(emb.dtype)
        aug_emb = self.add_embedding(time_embeds)
        emb = emb + aug_emb

        # Flatten the batch and frames dimensions
        # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width]
        sample = sample.flatten(0, 1)
        # Repeat the embeddings num_video_frames times
        # emb: [batch, channels] -> [batch * frames, channels]
        emb = emb.repeat_interleave(num_frames, dim=0)
        # encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels]
        encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0)

        # 2. pre-process
        sample = self.conv_in(sample)
        if pose_latents is not None:
            sample = sample + pose_latents

        # Turn the boolean flag into a (batch, frames) tensor of ones or zeros for the blocks.
        image_only_indicator = torch.ones(batch_size, num_frames, dtype=sample.dtype, device=sample.device) \
            if image_only_indicator else torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device)

        # 3. down
        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    image_only_indicator=image_only_indicator,
                )
            else:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    image_only_indicator=image_only_indicator,
                )

            down_block_res_samples += res_samples

            # NOTE(review): this merge runs inside the down-block loop. `zip` pairs the samples collected so
            # far with the leading residuals, so samples from earlier blocks have their residual added again on
            # every later iteration. If a single addition after the loop is intended, this needs dedenting -
            # confirm against the training code first, since trained weights may depend on the current behaviour.
            new_down_block_res_samples = ()
            if down_block_additional_residuals is not None:
                for down_block_res_sample, down_block_additional_residual in zip(
                    down_block_res_samples, down_block_additional_residuals
                ):
                    down_block_res_sample = down_block_res_sample + down_block_additional_residual
                    new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)

                down_block_res_samples = new_down_block_res_samples

        # 4. mid
        sample = self.mid_block(
            hidden_states=sample,
            temb=emb,
            encoder_hidden_states=encoder_hidden_states,
            image_only_indicator=image_only_indicator,
        )
        if mid_block_additional_residual is not None:
            sample = sample + mid_block_additional_residual

        # 5. up
        for i, upsample_block in enumerate(self.up_blocks):
            # Each up block consumes one skip connection per resnet, taken from the end of the stack.
            res_samples = down_block_res_samples[-len(upsample_block.resnets):]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    encoder_hidden_states=encoder_hidden_states,
                    image_only_indicator=image_only_indicator,
                )
            else:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    image_only_indicator=image_only_indicator,
                )

        # 6. post-process
        sample = self.conv_norm_out(sample)
        sample = self.conv_act(sample)
        sample = self.conv_out(sample)

        # 7. Reshape back to original shape
        sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])

        if not return_dict:
            return (sample,)

        return UNetSpatioTemporalConditionOutput(sample=sample)


================================================
FILE: mimicmotion/pipelines/pipeline_ctrl.py
================================================
import inspect
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Union

import PIL.Image
import einops
import numpy as np
import torch
import torch.nn as nn


from diffusers.image_processor import VaeImageProcessor, PipelineImageInput
from diffusers.models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps
from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion \
    import _resize_with_antialiasing, _append_dims
from diffusers.schedulers import EulerDiscreteScheduler
from diffusers.utils import BaseOutput, logging
from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from mimicmotion.modules.controlnet import ControlNetSVDModel


from ..modules.pose_net import PoseNet
import pdb
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


def _append_dims(x, target_dims):
    """Pad `x` with trailing singleton axes until it has `target_dims` dimensions.

    Args:
        x: Tensor-like object exposing `.ndim` and supporting `None`/Ellipsis indexing.
        target_dims: Desired number of dimensions of the result.

    Returns:
        `x` indexed so that `target_dims - x.ndim` size-1 axes are appended at the end.

    Raises:
        ValueError: If `x` already has more than `target_dims` dimensions.
    """
    missing = target_dims - x.ndim
    if missing < 0:
        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
    # Ellipsis keeps every existing axis; each `None` inserts one new trailing axis.
    index = (Ellipsis, *([None] * missing))
    return x[index]


# Adapted from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
    """Post-process a batch of videos frame by frame with `processor`.

    Args:
        video: 5-D tensor laid out as `(batch, channels, frames, height, width)`.
        processor: Object exposing `postprocess(frames, output_type)`; it is called once per
            batch element with that element permuted to `(frames, channels, height, width)`.
        output_type: One of `"np"`, `"pt"` or `"pil"`.

    Returns:
        For `"np"` the per-video results stacked with `np.stack`, for `"pt"` stacked with
        `torch.stack`, and for `"pil"` a plain list holding one processor result per video.

    Raises:
        ValueError: If `output_type` is not supported or `video` is not 5-dimensional.
    """
    # Validate before doing any work so an unsupported type does not first run the
    # (potentially expensive) post-processing of every video.
    if output_type not in ("np", "pt", "pil"):
        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
    if video.ndim != 5:
        raise ValueError(
            f"`video` must have 5 dims (batch, channels, frames, height, width) but has {video.ndim}"
        )

    # Swap channels and frames so the processor sees a batch of images per video.
    outputs = [processor.postprocess(batch_vid.permute(1, 0, 2, 3), output_type) for batch_vid in video]

    if output_type == "np":
        return np.stack(outputs)
    if output_type == "pt":
        return torch.stack(outputs)
    return outputs


@dataclass
class MimicMotionPipelineOutput(BaseOutput):
    r"""
    Output container returned by the pipeline when `return_dict=True`.

    Args:
        frames (`List[List[PIL.Image.Image]]`, `np.ndarray` or `torch.Tensor`):
            The generated video frames, or the raw latents when the pipeline is called with
            `output_type="latent"`. The concrete type follows the requested `output_type`: a list
            (one entry per video) of lists of PIL images, a numpy array, or a torch tensor.
    """

    # Generated frames (or latents); see the class docstring for the possible types.
    frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor]


class Ctrl_Pipeline(DiffusionPipeline):
    r"""
    Pipeline to generate video from an input image using Stable Video Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKLTemporalDecoder`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
            Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K]
            (https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
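        unet ([`UNetSpatioTemporalConditionModel`]):
            A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
        controlnet ([`ControlNetSVDModel`]):
            A `ControlNetSVDModel` whose down-block and mid-block outputs are passed to `unet` as additional
            residuals during denoising.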
        scheduler ([`EulerDiscreteScheduler`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
        feature_extractor ([`~transformers.CLIPImageProcessor`]):
            A `CLIPImageProcessor` to extract features from generated images.
        pose_net ([`PoseNet`]):
            A `PoseNet` that encodes the pose images into latents which are injected into the unet.
    """

    model_cpu_offload_seq = "image_encoder->unet->vae"
    _callback_tensor_inputs = ["latents"]

    def __init__(
        self,
        vae: AutoencoderKLTemporalDecoder,
        image_encoder: CLIPVisionModelWithProjection,
        unet: UNetSpatioTemporalConditionModel,
        controlnet: ControlNetSVDModel,
        scheduler: EulerDiscreteScheduler,
        feature_extractor: CLIPImageProcessor,
        pose_net: PoseNet,
    ):
        """Register all sub-models and build the VAE image processor.

        Every argument is registered under its own name via `register_modules`, after which
        `vae_scale_factor` is derived from the number of VAE block levels.
        """
        super().__init__()

        components = {
            "vae": vae,
            "image_encoder": image_encoder,
            "unet": unet,
            "controlnet": controlnet,
            "scheduler": scheduler,
            "feature_extractor": feature_extractor,
            "pose_net": pose_net,
        }
        self.register_modules(**components)

        # One factor of two per VAE level after the first.
        num_vae_levels = len(self.vae.config.block_out_channels)
        self.vae_scale_factor = 2 ** (num_vae_levels - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

    def _encode_image(
        self,
        image: PipelineImageInput,
        device: Union[str, torch.device],
        num_videos_per_prompt: int,
        do_classifier_free_guidance: bool,
    ):
        """Embed the conditioning image with the CLIP image encoder.

        Non-tensor inputs are converted to a tensor, resized to 224x224 and normalized by the
        feature extractor; tensor inputs are passed to the encoder as they are.

        Returns:
            Image embeddings with a sequence axis of length 1, repeated `num_videos_per_prompt`
            times. With classifier-free guidance an all-zero copy is concatenated in front.
        """
        encoder_dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            pixels = self.image_processor.numpy_to_pt(self.image_processor.pil_to_numpy(image))

            # The image is normalized before the resize and un-normalized afterwards
            # to match the original implementation.
            pixels = _resize_with_antialiasing(pixels * 2.0 - 1.0, (224, 224))
            pixels = (pixels + 1.0) / 2.0

            # Only the CLIP normalization is left to the feature extractor.
            image = self.feature_extractor(
                images=pixels,
                do_normalize=True,
                do_center_crop=False,
                do_resize=False,
                do_rescale=False,
                return_tensors="pt",
            ).pixel_values

        clip_input = image.to(device=device, dtype=encoder_dtype)
        image_embeddings = self.image_encoder(clip_input).image_embeds.unsqueeze(1)

        # Duplicate the embeddings for each generation per prompt (mps friendly method).
        num_images, seq_len, _ = image_embeddings.shape
        image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1).view(
            num_images * num_videos_per_prompt, seq_len, -1
        )

        if not do_classifier_free_guidance:
            return image_embeddings

        # Unconditional (zero) and conditional embeddings share one batch so that
        # a single forward pass covers both guidance branches.
        return torch.cat([torch.zeros_like(image_embeddings), image_embeddings])

    def _encode_pose_image(
        self,
        pose_image: torch.Tensor,
        do_classifier_free_guidance: bool,
    ):
        """Run `pose_net` on the pose images and return the resulting pose latents.

        With classifier-free guidance an all-zero tensor of the same shape is concatenated in
        front of the pose latents so both guidance branches fit in one batch.
        """
        pose_latents = self.pose_net(pose_image)
        if not do_classifier_free_guidance:
            return pose_latents

        # Zero latents form the unconditional half of the batch.
        return torch.cat([torch.zeros_like(pose_latents), pose_latents])
    
    def _encode_vae_image(
        self,
        image: torch.Tensor,
        device: Union[str, torch.device],
        num_videos_per_prompt: int,
        do_classifier_free_guidance: bool,
    ):
        """Encode `image` with the VAE and return the mode of its latent distribution.

        With classifier-free guidance an all-zero copy is concatenated in front of the latents.
        The result is finally repeated `num_videos_per_prompt` times along the first axis.
        """
        vae_input = image.to(device=device, dtype=self.vae.dtype)
        image_latents = self.vae.encode(vae_input).latent_dist.mode()

        if do_classifier_free_guidance:
            # Unconditional (zero) latents first, conditional latents second.
            image_latents = torch.cat([torch.zeros_like(image_latents), image_latents])

        # Duplicate for each generation per prompt (mps friendly method).
        return image_latents.repeat(num_videos_per_prompt, 1, 1, 1)

    def _get_add_time_ids(
        self,
        fps: int,
        motion_bucket_id: int,
        noise_aug_strength: float,
        dtype: torch.dtype,
        batch_size: int,
        num_videos_per_prompt: int,
        do_classifier_free_guidance: bool,
    ):
        """Build the `(fps, motion_bucket_id, noise_aug_strength)` conditioning tensor.

        Returns:
            A tensor with one row of the three values per generated video; the rows are
            duplicated once more when classifier-free guidance is enabled.

        Raises:
            ValueError: If the unet's added-embedding input size does not match three
                embeddings of `addition_time_embed_dim` each.
        """
        conditioning = [fps, motion_bucket_id, noise_aug_strength]

        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
        passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(conditioning)
        if passed_add_embed_dim != expected_add_embed_dim:
            raise ValueError(
                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, "
                f"but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. "
                f"Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
            )

        time_ids = torch.tensor([conditioning], dtype=dtype).repeat(batch_size * num_videos_per_prompt, 1)
        # Both guidance branches receive the same conditioning values.
        return torch.cat([time_ids, time_ids]) if do_classifier_free_guidance else time_ids

    def decode_latents(
        self,
        latents: torch.Tensor,
        num_frames: int,
        decode_chunk_size: int = 8,
    ):
        """Decode latents to frames with the VAE, `decode_chunk_size` frames at a time.

        Args:
            latents: Tensor laid out as `[batch, frames, channels, height, width]`.
            num_frames: Number of frames per video, used to un-flatten the decoded frames.
            decode_chunk_size: Number of frames handed to the VAE per decode call.

        Returns:
            A float32 tensor laid out as `[batch, channels, frames, height, width]`; each decoded
            chunk is moved to the CPU before concatenation.
        """
        # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
        flat_latents = latents.flatten(0, 1)
        flat_latents = 1 / self.vae.config.scaling_factor * flat_latents

        # A compiled VAE hides the real forward signature behind `_orig_mod`.
        vae_forward = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
        accepts_num_frames = "num_frames" in inspect.signature(vae_forward).parameters

        # Decode in chunks to avoid OOM.
        decoded_chunks = []
        for start in range(0, flat_latents.shape[0], decode_chunk_size):
            chunk = flat_latents[start: start + decode_chunk_size]
            # `num_frames` is only passed if the VAE forward expects it.
            decode_kwargs = {"num_frames": chunk.shape[0]} if accepts_num_frames else {}
            decoded_chunks.append(self.vae.decode(chunk, **decode_kwargs).sample.cpu())
        frames = torch.cat(decoded_chunks, dim=0)

        # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
        frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)

        # Always cast to float32: negligible overhead and compatible with bfloat16.
        return frames.float()

    def check_inputs(self, image, height, width):
        """Validate the conditioning image type and the requested resolution.

        Raises:
            ValueError: If `image` is neither a tensor, a PIL image nor a list, or if `height`
                or `width` is not divisible by 8.
        """
        accepted_types = (torch.Tensor, PIL.Image.Image, list)
        if not isinstance(image, accepted_types):
            raise ValueError(
                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                f" {type(image)}"
            )

        if height % 8 or width % 8:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    def prepare_latents(
        self,
        batch_size: int,
        num_frames: int,
        num_channels_latents: int,
        height: int,
        width: int,
        dtype: torch.dtype,
        device: Union[str, torch.device],
        generator: torch.Generator,
        latents: Optional[torch.Tensor] = None,
    ):
        """Create (or reuse) the initial noise latents, scaled for the scheduler.

        Freshly sampled latents have the shape
        `(batch_size, num_frames, num_channels_latents // 2, height // vae_scale_factor,
        width // vae_scale_factor)`. Caller-supplied `latents` are only moved to `device`.

        Raises:
            ValueError: If `generator` is a list whose length differs from `batch_size`.
        """
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is not None:
            latents = latents.to(device)
        else:
            latent_shape = (
                batch_size,
                num_frames,
                num_channels_latents // 2,
                height // self.vae_scale_factor,
                width // self.vae_scale_factor,
            )
            latents = randn_tensor(latent_shape, generator=generator, device=device, dtype=dtype)

        # Scale the initial noise by the standard deviation required by the scheduler.
        return latents * self.scheduler.init_noise_sigma

    @property
    def guidance_scale(self):
        """Guidance scale stored by `__call__`: first `max_guidance_scale`, later a per-frame tensor."""
        return self._guidance_scale

    # `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    @property
    def do_classifier_free_guidance(self):
        """Whether guidance is active: the scalar scale, or the largest tensor entry, exceeds 1."""
        scale = self.guidance_scale
        if not isinstance(scale, (int, float)):
            # Tensor-valued (per-frame) scale: guidance is on if any entry is above 1.
            return scale.max() > 1
        return scale > 1

    @property
    def num_timesteps(self):
        """Number of scheduler timesteps recorded by the most recent `__call__`."""
        return self._num_timesteps

    def prepare_extra_step_kwargs(self, generator, eta):
        """Collect the optional keyword arguments accepted by `self.scheduler.step`.

        Not all schedulers share the same `step` signature, so `eta` and `generator` are only
        forwarded when the scheduler declares a parameter of that name. `eta` corresponds to η
        in the DDIM paper (https://arxiv.org/abs/2010.02502) and should be within [0, 1].

        Returns:
            A dict containing `"eta"` and/or `"generator"` when the scheduler accepts them.
        """
        step_params = inspect.signature(self.scheduler.step).parameters

        extra_step_kwargs = {}
        if "eta" in step_params:
            extra_step_kwargs["eta"] = eta
        if "generator" in step_params:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    @torch.no_grad()
    def __call__(
        self,
        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
        image_pose: Union[torch.FloatTensor],
        controlnet_flow: Union[torch.FloatTensor],
        controlnet_image: Union[torch.FloatTensor],
        traj_flow: Union[torch.FloatTensor],
        point_list,
        dift_feats,
        height: int = 576,
        width: int = 1024,
        num_frames: Optional[int] = None,
        tile_size: Optional[int] = 16,
        tile_overlap: Optional[int] = 4,
        num_inference_steps: int = 25,
        min_guidance_scale: float = 1.0,
        max_guidance_scale: float = 3.0,
        fps: int = 7,
        controlnet_cond_scale: float = 1.0,
        motion_bucket_id: int = 127,
        noise_aug_strength: float = 0.02,
        image_only_indicator: bool = False,
        decode_chunk_size: Optional[int] = None,
        num_videos_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        return_dict: bool = True,
        device: Union[str, torch.device] = None,
    ):
        r"""
        Generate a pose-, flow- and keypoint-controlled video from a reference image.

        The video is denoised in overlapping temporal tiles of `tile_size` frames. Every tile contains
        frame 0 plus a window of later frames; per-tile noise predictions are blended with a triangular
        weight. For each tile the controlnet is run first and its residuals are passed to the unet.

        Args:
            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
                Reference image(s). Encoded with the CLIP image encoder and, after noise augmentation,
                with the VAE. If you provide a tensor, it needs to be compatible with
                [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/
                feature_extractor/preprocessor_config.json).
            image_pose (`torch.FloatTensor`):
                Pose images passed to `pose_net`. The resulting latents are regrouped with
                `(b f) c h w -> b f c h w` and `f=num_frames`, so presumably one pose image per generated
                frame is expected (TODO confirm against the caller).
            controlnet_flow (`torch.FloatTensor`):
                Flow conditioning for the controlnet. Indexed on dim 1 with `frame_index - 1` for every
                non-reference frame of a tile, so presumably it holds `num_frames - 1` entries on that axis.
            controlnet_image (`torch.FloatTensor`):
                4-D tensor passed to the controlnet as `controlnet_cond`. Its first dimension must be 1.
            traj_flow (`torch.FloatTensor`):
                Trajectory flow conditioning, indexed exactly like `controlnet_flow`.
            point_list:
                Per-frame keypoint records, indexed by frame index. `point_list[0]` must provide
                `"candidate"` (keypoint coordinates, which are multiplied by the image size and therefore
                presumably normalized) and `"subset"` (entries equal to `-1` are skipped).
            dift_feats:
                Indexable feature container: `dift_feats[0]` is squeezed on dim 2, upsampled and sampled at
                the reference keypoints; `dift_feats[1]` is passed to the controlnet as `dift_feat`.
            height (`int`, *optional*, defaults to 576):
                The height in pixels of the generated video. Falls back to
                `self.unet.config.sample_size * self.vae_scale_factor` when falsy.
            width (`int`, *optional*, defaults to 1024):
                The width in pixels of the generated video. Same fallback as `height`.
            num_frames (`int`, *optional*):
                The number of video frames to generate. Defaults to `self.unet.config.num_frames`.
            tile_size (`int`, *optional*, defaults to 16):
                Number of frames denoised together in one tile. Must not exceed `num_frames`.
            tile_overlap (`int`, *optional*, defaults to 4):
                Number of frames by which consecutive tiles overlap. Must be smaller than `tile_size`.
            num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps.
            min_guidance_scale (`float`, *optional*, defaults to 1.0):
                The minimum guidance scale. Used for the classifier free guidance with first frame.
            max_guidance_scale (`float`, *optional*, defaults to 3.0):
                The maximum guidance scale. Used for the classifier free guidance with last frame.
            fps (`int`, *optional*, defaults to 7):
                Frames per second. The value minus one is used as micro-conditioning.
            controlnet_cond_scale (`float`, *optional*, defaults to 1.0):
                Passed to the controlnet as `conditioning_scale`.
            motion_bucket_id (`int`, *optional*, defaults to 127):
                The motion bucket ID. Used as conditioning for the generation.
            noise_aug_strength (`float`, *optional*, defaults to 0.02):
                Scale of the Gaussian noise added to the reference image before VAE encoding; also used
                as conditioning.
            image_only_indicator (`bool`, *optional*, defaults to False):
                Forwarded unchanged to the unet.
            decode_chunk_size (`int`, *optional*):
                The number of frames to decode at a time. Defaults to `num_frames`.
                Reduce `decode_chunk_size` to reduce memory usage.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of videos to generate per input image.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents for one tile. If not provided, they are sampled using the
                supplied random `generator`. The tile latents are repeated along the frame axis to cover
                `num_frames`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                Output format of the frames (`"pil"`, `"np"` or `"pt"`), or `"latent"` to skip decoding.
            callback_on_step_end (`Callable`, *optional*):
                A function called at the end of each denoising step as
                `callback_on_step_end(self, step, timestep, callback_kwargs)`. If the returned dict
                contains `"latents"`, that value replaces the current latents.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                Names of the local tensors passed to the callback in `callback_kwargs`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a `MimicMotionPipelineOutput` instead of the bare frames.
            device:
                Device on which the pipeline runs. Defaults to `self._execution_device`.

        Returns:
            `MimicMotionPipelineOutput` if `return_dict` is `True`, otherwise the generated frames (or
            latents when `output_type="latent"`) are returned directly.

        Raises:
            ValueError: If `image`, `height` or `width` is invalid (see `check_inputs`), if `tile_size`
                exceeds `num_frames`, or if `tile_overlap` is not smaller than `tile_size`.
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
        decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(image, height, width)
        # Both conditions would otherwise only surface much later as an IndexError on an empty tile
        # list (or an invalid `range` step), after all encoders have already been run.
        if tile_size > num_frames:
            raise ValueError(f"`tile_size` ({tile_size}) must not be larger than `num_frames` ({num_frames}).")
        if tile_overlap >= tile_size:
            raise ValueError(f"`tile_overlap` ({tile_overlap}) must be smaller than `tile_size` ({tile_size}).")

        # 2. Define call parameters
        if isinstance(image, PIL.Image.Image):
            batch_size = 1
        elif isinstance(image, list):
            batch_size = len(image)
        else:
            batch_size = image.shape[0]
        device = device if device is not None else self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        self._guidance_scale = max_guidance_scale

        # 3. Encode input image (the encoder is only kept on `device` while it is needed)
        self.image_encoder.to(device)
        image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance)
        self.image_encoder.cpu()

        # NOTE: Stable Diffusion Video was conditioned on fps - 1, which
        # is why it is reduced here.
        fps = fps - 1

        # 4. Encode input image using VAE
        image = self.image_processor.preprocess(image, height=height, width=width).to(device)
        noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype)
        image = image + noise_aug_strength * noise

        self.vae.to(device)
        image_latents = self._encode_vae_image(
            image,
            device=device,
            num_videos_per_prompt=num_videos_per_prompt,
            do_classifier_free_guidance=self.do_classifier_free_guidance,
        )
        image_latents = image_latents.to(image_embeddings.dtype)
        self.vae.cpu()

        # Repeat the image latents for each frame so we can concatenate them with the noise
        # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width]
        image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)

        pose_latents = self._encode_pose_image(
            image_pose, do_classifier_free_guidance=self.do_classifier_free_guidance,
        ).to(device)
        pose_latents = einops.rearrange(pose_latents, '(b f) c h w -> b f c h w', f=num_frames)

        # Reference keypoint features: sample the upsampled DIFT features at every valid keypoint
        # of the reference frame.
        # NOTE(review): dims 2 and 3 of `controlnet_image` are bound to `w` and `h` in that order. If
        # the tensor is laid out as (B, C, H, W) the names are swapped; they are used consistently
        # below, so this is left unchanged to preserve the sampled features — confirm intent.
        bz, _, w, h = controlnet_image.size()
        ref_point_emb = []
        ref_point = point_list[0]
        assert bz == 1, "only a `controlnet_image` batch size of 1 is supported"
        rescale_ref_dift = nn.Upsample(size=(h, w), mode='bilinear')(dift_feats[0].squeeze(2))
        for b in range(bz):
            # One 1280-dim embedding per keypoint slot; slots without a valid keypoint stay zero.
            init_embedding = torch.zeros((18, 1280))
            for point_idx, (keypoint, subset) in enumerate(zip(ref_point["candidate"], ref_point["subset"][0])):
                px, py = keypoint[0] * w, keypoint[1] * h
                # Clamp to the valid pixel range.
                point_x, point_y = max(min(int(px), w - 1), 0), max(min(int(py), h - 1), 0)
                if subset != -1:
                    point_embedding = rescale_ref_dift[b, :, point_y, point_x]
                    init_embedding[point_idx] = point_embedding
            ref_point_emb.append(init_embedding)
        ref_point_emb = torch.stack(ref_point_emb).to(device=self.controlnet.device)

        # 5. Get Added Time IDs
        added_time_ids = self._get_add_time_ids(
            fps,
            motion_bucket_id,
            noise_aug_strength,
            image_embeddings.dtype,
            batch_size,
            num_videos_per_prompt,
            self.do_classifier_free_guidance,
        )
        added_time_ids = added_time_ids.to(device)

        # 6. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None)

        # 7. Prepare latent variables: noise is sampled for a single tile and then repeated along
        # the frame axis so that every tile starts from the same noise pattern.
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            tile_size,
            num_channels_latents,
            height,
            width,
            image_embeddings.dtype,
            device,
            generator,
            latents,
        )  # [batch, tile_size, channels, h // vae_scale_factor, w // vae_scale_factor]
        latents = latents.repeat(1, num_frames // tile_size + 1, 1, 1, 1)[:, :num_frames]  # -> num_frames frames

        ref_point_emb = ref_point_emb.to(device, latents.dtype)

        # The flow conditions are duplicated (not zeroed) for the unconditional guidance branch.
        controlnet_flow = torch.cat([controlnet_flow] * 2) if self.do_classifier_free_guidance else controlnet_flow
        controlnet_flow = controlnet_flow.to(device, latents.dtype)

        traj_flow = torch.cat([traj_flow] * 2) if self.do_classifier_free_guidance else traj_flow
        traj_flow = traj_flow.to(device, latents.dtype)

        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, 0.0)

        # 9. Prepare guidance scale: linearly increasing from the first to the last frame
        guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
        guidance_scale = guidance_scale.to(device, latents.dtype)
        guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
        guidance_scale = _append_dims(guidance_scale, latents.ndim)

        self._guidance_scale = guidance_scale

        # 10. Denoising loop
        self._num_timesteps = len(timesteps)
        # Every tile holds frame 0 followed by `tile_size - 1` consecutive later frames.
        indices = [[0, *range(i + 1, min(i + tile_size, num_frames))] for i in
                   range(0, num_frames - tile_size + 1, tile_size - tile_overlap)]
        if indices[-1][-1] < num_frames - 1:
            # Add a final tile aligned with the end of the video so the last frames are covered.
            indices.append([0, *range(num_frames - tile_size + 1, num_frames)])

        self.pose_net.to(device)
        self.unet.to(device)
        self.controlnet.to(device)
        self.controlnet.eval()
        # `torch.cuda.device` only accepts CUDA devices, so skip the cache flush elsewhere.
        if torch.device(device).type == "cuda":
            with torch.cuda.device(device):
                torch.cuda.empty_cache()

        # Triangular blending weight over the frames of a tile (identical for every step and tile).
        weight = (torch.arange(tile_size, device=device) + 0.5) * 2. / tile_size
        weight = torch.minimum(weight, 2 - weight)

        with self.progress_bar(total=len(timesteps) * len(indices)) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # Concatenate image_latents over channels dimension
                latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)

                # predict the noise residual, accumulated tile by tile
                noise_pred = torch.zeros_like(image_latents)
                noise_pred_cnt = image_latents.new_zeros((num_frames,))
                for idx in indices:
                    # Flow entry k is addressed by frame k + 1, hence the shift; frame 0 has none.
                    flow_idx = [frame - 1 for frame in idx[1:]]
                    point_input = [point_list[frame] for frame in idx]
                    down_block_res_samples, mid_block_res_sample, _ = self.controlnet(
                        latent_model_input[:, idx],
                        t,
                        encoder_hidden_states=image_embeddings,
                        controlnet_cond=controlnet_image,
                        controlnet_flow=controlnet_flow[:, flow_idx],
                        traj_flow=traj_flow[:, flow_idx],
                        pose_latents=pose_latents[:, idx].flatten(0, 1),
                        point_list=point_input,
                        dift_feat=dift_feats[1],
                        ref_point_emb=ref_point_emb,
                        added_time_ids=added_time_ids,
                        conditioning_scale=controlnet_cond_scale,
                        guess_mode=False,
                        return_dict=False,
                    )

                    _noise_pred = self.unet(
                        latent_model_input[:, idx],
                        t,
                        encoder_hidden_states=image_embeddings,
                        added_time_ids=added_time_ids,
                        pose_latents=pose_latents[:, idx].flatten(0, 1),
                        down_block_additional_residuals=down_block_res_samples,
                        mid_block_additional_residual=mid_block_res_sample,
                        image_only_indicator=image_only_indicator,
                        return_dict=False,
                    )[0]
                    noise_pred[:, idx] += _noise_pred * weight[:, None, None, None]

                    noise_pred_cnt[idx] += weight
                    progress_bar.update()
                # Normalize by the accumulated weight of every frame.
                noise_pred.div_(noise_pred_cnt[:, None, None, None])

                # perform guidance
                if self.do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)

        # Free accelerator memory before the VAE decode; the controlnet is offloaded together
        # with the other denoising modules (it is moved back at the start of the next call).
        self.pose_net.cpu()
        self.unet.cpu()
        self.controlnet.cpu()

        if not output_type == "latent":
            self.vae.decoder.to(device)
            frames = self.decode_latents(latents, num_frames, decode_chunk_size)
            frames = tensor2vid(frames, self.image_processor, output_type=output_type)
        else:
            frames = latents

        self.maybe_free_model_hooks()

        if not return_dict:
            return frames

        return MimicMotionPipelineOutput(frames=frames)


================================================
FILE: mimicmotion/pipelines/pipeline_mimicmotion.py
================================================
import inspect
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Union

import PIL.Image
import einops
import numpy as np
import torch
from diffusers.image_processor import VaeImageProcessor, PipelineImageInput
from diffusers.models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps
from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion \
    import _resize_with_antialiasing, _append_dims
from diffusers.schedulers import EulerDiscreteScheduler
from diffusers.utils import BaseOutput, logging
from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

from ..modules.pose_net import PoseNet
import pdb
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


def _append_dims(x, target_dims):
    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
    dims_to_append = target_dims - x.ndim
    if dims_to_append < 0:
        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
    return x[(...,) + (None,) * dims_to_append]


# Adapted from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid
def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
    """Post-process a batch of videos frame by frame and collect the results.

    Args:
        video: 5-D tensor laid out as `(batch, channels, frames, height, width)`.
        processor: Object whose `postprocess(frames, output_type)` converts a
            `(frames, channels, height, width)` tensor into the requested format.
        output_type: One of `"np"`, `"pt"` or `"pil"`.

    Returns:
        For `"np"` / `"pt"` the per-video results stacked into a single array / tensor;
        for `"pil"` a list with one post-processed entry per video.

    Raises:
        ValueError: if `output_type` is not supported or `video` is not 5-dimensional.
    """
    # Validate up front so an unsupported type fails before any frame is converted.
    if output_type not in ("np", "pt", "pil"):
        raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']")
    if video.ndim != 5:
        raise ValueError(
            f"`video` has to be of shape (batch, channels, frames, height, width) but has {video.ndim} dims"
        )

    outputs = []
    for batch_idx in range(video.shape[0]):
        # (channels, frames, H, W) -> (frames, channels, H, W): each frame becomes one image.
        batch_vid = video[batch_idx].permute(1, 0, 2, 3)
        outputs.append(processor.postprocess(batch_vid, output_type))

    if output_type == "np":
        return np.stack(outputs)
    if output_type == "pt":
        return torch.stack(outputs)
    return outputs


@dataclass
class MimicMotionPipelineOutput(BaseOutput):
    r"""
    Output class for the MimicMotion pipeline.

    Args:
        frames (`List[List[PIL.Image.Image]]`, `np.ndarray` or `torch.Tensor`):
            List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size,
            num_frames, height, width, num_channels)`.
    """

    # Generated frames, in whichever container the caller requested.
    frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor]


class MimicMotionPipeline(DiffusionPipeline):
    r"""
    Pipeline to generate video from an input image using Stable Video Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKLTemporalDecoder`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
            Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K]
            (https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
        unet ([`UNetSpatioTemporalConditionModel`]):
            A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
        scheduler ([`EulerDiscreteScheduler`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
        feature_extractor ([`~transformers.CLIPImageProcessor`]):
            A `CLIPImageProcessor` to extract features from generated images.
        pose_net ([`PoseNet`]):
            A `PoseNet` to inject pose signals into unet.
    """

    model_cpu_offload_seq = "image_encoder->unet->vae"
    _callback_tensor_inputs = ["latents"]

    def __init__(
        self,
        vae: AutoencoderKLTemporalDecoder,
        image_encoder: CLIPVisionModelWithProjection,
        unet: UNetSpatioTemporalConditionModel,
        scheduler: EulerDiscreteScheduler,
        feature_extractor: CLIPImageProcessor,
        pose_net: PoseNet,
    ):
        """Register the pipeline components and build the image pre/post-processor.

        All six components are handed to `register_modules`; the VAE scale factor is then
        derived from the VAE config and used to configure a `VaeImageProcessor`.
        """
        super().__init__()

        components = dict(
            vae=vae,
            image_encoder=image_encoder,
            unet=unet,
            scheduler=scheduler,
            feature_extractor=feature_extractor,
            pose_net=pose_net,
        )
        self.register_modules(**components)

        # Scale factor is 2 ** (number of entries in the VAE's `block_out_channels` - 1).
        num_vae_blocks = len(self.vae.config.block_out_channels)
        self.vae_scale_factor = 2 ** (num_vae_blocks - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

    def _encode_image(
        self, 
        image: PipelineImageInput, 
        device: Union[str, torch.device], 
        num_videos_per_prompt: int, 
        do_classifier_free_guidance: bool):
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.image_processor.pil_to_numpy(image)
            image = self.image_processor.numpy_to_pt(image)

            # We normalize the image before resizing to match with the original implementation.
            # Then we unnormalize it after resizing.
            image = image * 2.0 - 1.0
            image = _resize_with_antialiasing(image, (224, 224))
            image = (image + 1.0) / 2.0

            # Normalize the image with for CLIP input
            image = self.feature_extractor(
                images=image,
                do_normalize=True,
                do_center_crop=False,
                do_resize=False,
                do_rescale=False,
                return_tensors="pt",
            ).pixel_values

        image = image.to(device=device, dtype=dtype)
        image_embeddings = self.image_encoder(image).image_embeds
        image_embeddings = image_embeddings.unsqueeze(1)

        # duplicate image embeddings for each generation per prompt, using mps friendly method
        bs_embed, seq_len, _ = image_embeddings.shape
        image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
        image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)

        if do_classifier_free_guidance:
            negative_image_embeddings = torch.zeros_like(image_embeddings)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            image_embeddings = torch.cat([negative_image_embeddings, image_embeddings])

        return image_embeddings

    def _encode_pose_image(
        self, 
        pose_image: torch.Tensor, 
        do_classifier_free_guidance: bool,
    ):
        # Get latents_pose
        pose_latents = self.pose_net(pose_image)

        if do_classifier_free_guidance:
            negative_pose_latents = torch.zeros_like(pose_latents)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            pose_latents = torch.cat([negative_pose_latents, pose_latents])

        return pose_latents
    
    def _encode_vae_image(
        self,
        image: torch.Tensor,
        device: Union[str, torch.device],
        num_videos_per_prompt: int,
        do_classifier_free_guidance: bool,
    ):
        image = image.to(device=device, dtype=self.vae.dtype)
        image_latents = self.vae.encode(image).latent_dist.mode()

        if do_classifier_free_guidance:
            negative_image_latents = torch.zeros_like(image_latents)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            image_latents = torch.cat([negative_image_latents, image_latents])

        # duplicate image_latents for each generation per prompt, using mps friendly method
        image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)

        return image_latents

    def _get_add_time_ids(
        self,
        fps: int,
        motion_bucket_id: int,
        noise_aug_strength: float,
        dtype: torch.dtype,
        batch_size: int,
        num_videos_per_prompt: int,
        do_classifier_free_guidance: bool,
    ):
        add_time_ids = [fps, motion_bucket_id, noise_aug_strength]

        passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features

        if expected_add_embed_dim != passed_add_embed_dim:
            raise ValueError(
                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, " \
                f"but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. " \
                f"Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
            )

        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
        add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)

        if do_classifier_free_guidance:
            add_time_ids = torch.cat([add_time_ids, add_time_ids])

        return add_time_ids

    def decode_latents(
        self,
        latents: torch.Tensor,
        num_frames: int,
        decode_chunk_size: int = 8,
    ):
        """Decode latents into frames, `decode_chunk_size` frames at a time.

        Args:
            latents: Tensor laid out as `[batch, frames, channels, height, width]`.
            num_frames: Number of frames per video, used to un-flatten the decoded batch.
            decode_chunk_size: How many frames are passed to the VAE per call.

        Returns:
            float32 tensor laid out as `[batch, channels, frames, height, width]`; the decoded
            chunks are moved to the CPU before being concatenated.
        """
        # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
        flat_latents = latents.flatten(0, 1)
        flat_latents = 1 / self.vae.config.scaling_factor * flat_latents

        # A compiled module hides the real forward behind `_orig_mod`.
        if is_compiled_module(self.vae):
            vae_forward = self.vae._orig_mod.forward
        else:
            vae_forward = self.vae.forward
        takes_num_frames = "num_frames" in inspect.signature(vae_forward).parameters

        # Decode in chunks to avoid OOM.
        decoded_chunks = []
        for start in range(0, flat_latents.shape[0], decode_chunk_size):
            chunk = flat_latents[start: start + decode_chunk_size]
            # `num_frames` is only forwarded when the VAE's forward signature has it.
            extra_kwargs = {"num_frames": chunk.shape[0]} if takes_num_frames else {}
            decoded_chunks.append(self.vae.decode(chunk, **extra_kwargs).sample.cpu())
        video = torch.cat(decoded_chunks, dim=0)

        # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
        video = video.reshape(-1, num_frames, *video.shape[1:]).permute(0, 2, 1, 3, 4)

        # Always cast to float32: negligible overhead and compatible with bfloat16.
        return video.float()

    def check_inputs(self, image, height, width):
        if (
                not isinstance(image, torch.Tensor)
                and not isinstance(image, PIL.Image.Image)
                and not isinstance(image, list)
        ):
            raise ValueError(
                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                f" {type(image)}"
            )

        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    def prepare_latents(
        self,
        batch_size: int,
        num_frames: int,
        num_channels_latents: int,
        height: int,
        width: int,
        dtype: torch.dtype,
        device: Union[str, torch.device],
        generator: torch.Generator,
        latents: Optional[torch.Tensor] = None,
    ):
        shape = (
            batch_size,
            num_frames,
            num_channels_latents // 2,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    @property
    def guidance_scale(self):
        """Guidance scale currently stored in `self._guidance_scale` (assigned inside `__call__`)."""
        return self._guidance_scale

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    @property
    def do_classifier_free_guidance(self):
        if isinstance(self.guidance_scale, (int, float)):
            return self.guidance_scale > 1
        return self.guidance_scale.max() > 1

    @property
    def num_timesteps(self):
        """Value stored in `self._num_timesteps`; `__call__` sets it to `len(timesteps)`."""
        return self._num_timesteps

    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    @torch.no_grad()
    def __call__(
        self,
        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
        image_pose: Union[torch.FloatTensor],
        height: int = 576,
        width: int = 1024,
        num_frames: Optional[int] = None,
        tile_size: Optional[int] = 16,
        tile_overlap: Optional[int] = 4,
        num_inference_steps: int = 25,
        min_guidance_scale: float = 1.0,
        max_guidance_scale: float = 3.0,
        fps: int = 7,
        motion_bucket_id: int = 127,
        noise_aug_strength: float = 0.02,
        image_only_indicator: bool = False,
        decode_chunk_size: Optional[int] = None,
        num_videos_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        return_dict: bool = True,
        device: Union[str, torch.device] = None,
    ):
        r"""
        The call function to the pipeline for generation.

        The video is denoised in overlapping temporal windows: every window holds frame 0 followed by
        `tile_size - 1` consecutive frames, and the per-window noise predictions are blended with a
        triangular weight before each scheduler step.

        Args:
            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
                Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
                [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/
                feature_extractor/preprocessor_config.json).
            image_pose (`torch.FloatTensor`):
                Pose conditioning images, passed to `self.pose_net`. The resulting pose latents are regrouped as
                `(batch, num_frames, ...)`, so the leading dimension has to be a multiple of `num_frames`
                (presumably one pose image per generated frame).
            height (`int`, *optional*, defaults to 576):
                The height in pixels of the generated image. A falsy value falls back to
                `self.unet.config.sample_size * self.vae_scale_factor`.
            width (`int`, *optional*, defaults to 1024):
                The width in pixels of the generated image. A falsy value falls back to
                `self.unet.config.sample_size * self.vae_scale_factor`.
            num_frames (`int`, *optional*):
                The number of video frames to generate. Defaults to `self.unet.config.num_frames`.
            tile_size (`int`, *optional*, defaults to 16):
                Number of frames denoised per UNet call. It is clamped to `num_frames`, so a clip shorter than
                one window is processed as a single window. The initial noise is drawn for `tile_size` frames
                and repeated along the frame axis to cover `num_frames`.
            tile_overlap (`int`, *optional*, defaults to 4):
                Number of frames by which consecutive windows overlap. Has to be smaller than `tile_size`
                whenever more than one window is needed.
            num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            min_guidance_scale (`float`, *optional*, defaults to 1.0):
                The minimum guidance scale. Used for the classifier free guidance with first frame.
            max_guidance_scale (`float`, *optional*, defaults to 3.0):
                The maximum guidance scale. Used for the classifier free guidance with last frame. The scale is
                interpolated linearly between the two values across the frames.
            fps (`int`, *optional*, defaults to 7):
                Frames per second. `fps - 1` is what is passed to the UNet as conditioning.
            motion_bucket_id (`int`, *optional*, defaults to 127):
                The motion bucket ID. Used as conditioning for the generation.
                The higher the number the more motion will be in the video.
            noise_aug_strength (`float`, *optional*, defaults to 0.02):
                The amount of noise added to the init image,
                the higher it is the less the video will look like the init image. Increase it for more motion.
            image_only_indicator (`bool`, *optional*, defaults to False):
                Whether to treat the inputs as batch of images instead of videos.
            decode_chunk_size (`int`, *optional*):
                The number of frames to decode at a time. Defaults to `num_frames` (all frames at once).
                Reduce `decode_chunk_size` to reduce memory usage.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of videos to generate per input image.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. If not provided, a latents tensor is generated by sampling using the supplied random
                `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated frames: `"pil"`, `"np"` or `"pt"`, or `"latent"` to skip
                decoding and return the denoised latents.
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step with the arguments
                `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
                It has to return a dict; a `"latents"` entry in it replaces the current latents.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                Names of the local tensors passed to the callback through `callback_kwargs`. You will only be
                able to include variables listed in the `._callback_tensor_inputs` attribute of your pipeline
                class.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to wrap the result in a [`MimicMotionPipelineOutput`].
            device:
                On which device the pipeline runs on. Defaults to `self._execution_device`.

        Returns:
            [`MimicMotionPipelineOutput`] if `return_dict` is `True`, otherwise the generated frames themselves
            (not wrapped in a tuple).

        Raises:
            ValueError: if the inputs fail `check_inputs`, or if `tile_overlap` is not smaller than `tile_size`
                while more than one window is required.
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
        decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(image, height, width)

        # A clip shorter than one window is handled as a single window covering all frames;
        # without the clamp no window would be generated at all.
        tile_size = min(tile_size, num_frames)
        tile_stride = tile_size - tile_overlap
        if tile_stride <= 0:
            if tile_size < num_frames:
                raise ValueError(
                    f"`tile_overlap` ({tile_overlap}) has to be smaller than `tile_size` ({tile_size})."
                )
            # Only one window exists, so the stride is never used to advance.
            tile_stride = 1

        # 2. Define call parameters
        if isinstance(image, PIL.Image.Image):
            batch_size = 1
        elif isinstance(image, list):
            batch_size = len(image)
        else:
            batch_size = image.shape[0]
        device = device if device is not None else self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        self._guidance_scale = max_guidance_scale

        # 3. Encode input image (the encoder is only kept on `device` while it is needed)
        self.image_encoder.to(device)
        image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance)
        self.image_encoder.cpu()

        # NOTE: Stable Diffusion Video was conditioned on fps - 1, which
        # is why it is reduced here.
        fps = fps - 1

        # 4. Encode input image using VAE, after adding a small amount of noise to it
        image = self.image_processor.preprocess(image, height=height, width=width).to(device)
        noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype)
        image = image + noise_aug_strength * noise

        self.vae.to(device)
        image_latents = self._encode_vae_image(
            image,
            device=device,
            num_videos_per_prompt=num_videos_per_prompt,
            do_classifier_free_guidance=self.do_classifier_free_guidance,
        )
        image_latents = image_latents.to(image_embeddings.dtype)
        self.vae.cpu()

        # Repeat the image latents for each frame so we can concatenate them with the noise
        # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width]
        image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)

        # 5. Encode the pose images and group them per video
        pose_latents = self._encode_pose_image(
            image_pose, do_classifier_free_guidance=self.do_classifier_free_guidance,
        ).to(device)
        pose_latents = einops.rearrange(pose_latents, '(b f) c h w -> b f c h w', f=num_frames)

        # 6. Get Added Time IDs
        added_time_ids = self._get_add_time_ids(
            fps,
            motion_bucket_id,
            noise_aug_strength,
            image_embeddings.dtype,
            batch_size,
            num_videos_per_prompt,
            self.do_classifier_free_guidance,
        )
        added_time_ids = added_time_ids.to(device)

        # 7. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None)

        # 8. Prepare latent variables: noise for one window, tiled along the frame axis
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            tile_size,
            num_channels_latents,
            height,
            width,
            image_embeddings.dtype,
            device,
            generator,
            latents,
        )  # [batch, tile_size, channels // 2, h // scale, w // scale]
        # -> [batch, num_frames, channels // 2, h // scale, w // scale]
        latents = latents.repeat(1, num_frames // tile_size + 1, 1, 1, 1)[:, :num_frames]

        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, 0.0)

        # 10. Prepare guidance scale: one value per frame, rising linearly from min to max
        guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
        guidance_scale = guidance_scale.to(device, latents.dtype)
        guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
        guidance_scale = _append_dims(guidance_scale, latents.ndim)

        self._guidance_scale = guidance_scale

        # 11. Denoising loop
        self._num_timesteps = len(timesteps)
        # Each window is frame 0 plus the next `tile_size - 1` frames after the window start.
        indices = [[0, *range(i + 1, min(i + tile_size, num_frames))] for i in
                   range(0, num_frames - tile_size + 1, tile_stride)]
        # Add a final window aligned with the end when the strided windows miss the last frames.
        if indices[-1][-1] < num_frames - 1:
            indices.append([0, *range(num_frames - tile_size + 1, num_frames)])

        self.pose_net.to(device)
        self.unet.to(device)

        # `torch.cuda.device` only accepts CUDA devices, so skip the cache flush elsewhere.
        if torch.device(device).type == "cuda":
            with torch.cuda.device(device):
                torch.cuda.empty_cache()

        # Triangular blending weight over the positions of a window (largest in the middle).
        # It does not depend on the timestep, so it is built once.
        weight = (torch.arange(tile_size, device=device) + 0.5) * 2. / tile_size
        weight = torch.minimum(weight, 2 - weight)

        with self.progress_bar(total=len(timesteps) * len(indices)) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # Concatenate image_latents over channels dimension
                latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)

                # predict the noise residual window by window, accumulating a weighted sum
                # per frame together with the total weight each frame received
                noise_pred = torch.zeros_like(image_latents)
                noise_pred_cnt = image_latents.new_zeros((num_frames,))
                for idx in indices:
                    _noise_pred = self.unet(
                        latent_model_input[:, idx],
                        t,
                        encoder_hidden_states=image_embeddings,
                        added_time_ids=added_time_ids,
                        pose_latents=pose_latents[:, idx].flatten(0, 1),
                        image_only_indicator=image_only_indicator,
                        return_dict=False,
                    )[0]
                    noise_pred[:, idx] += _noise_pred * weight[:, None, None, None]

                    noise_pred_cnt[idx] += weight
                    progress_bar.update()
                # normalise the weighted sum into a weighted average
                noise_pred.div_(noise_pred_cnt[:, None, None, None])

                # perform guidance
                if self.do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)

        self.pose_net.cpu()
        self.unet.cpu()

        if not output_type == "latent":
            self.vae.decoder.to(device)
            frames = self.decode_latents(latents, num_frames, decode_chunk_size)
            frames = tensor2vid(frames, self.image_processor, output_type=output_type)
        else:
            frames = latents

        self.maybe_free_model_hooks()

        if not return_dict:
            return frames

        return MimicMotionPipelineOutput(frames=frames)


================================================
FILE: mimicmotion/utils/__init__.py
================================================


================================================
FILE: mimicmotion/utils/dift_utils.py
================================================
import gc
from typing import Any, Dict, Optional, Union

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from diffusers import DDIMScheduler, StableDiffusionPipeline
from diffusers.models.unet_2d_condition import UNet2DConditionModel
from PIL import Image, ImageDraw
import pdb

class MyUNet2DConditionModel(UNet2DConditionModel):
    """`UNet2DConditionModel` whose forward pass returns intermediate up-block features.

    Instead of the final denoised sample, `forward` collects the (detached)
    output of selected up blocks and stops as soon as the last requested block
    has been evaluated.
    """

    def forward(
        self,
        sample: torch.FloatTensor,
        timestep: Union[torch.Tensor, float, int],
        up_ft_indices,
        encoder_hidden_states: torch.Tensor,
        class_labels: Optional[torch.Tensor] = None,
        timestep_cond: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None
    ):
        r"""
        Args:
            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
            up_ft_indices: collection of up-block indices whose outputs should be returned; it must
                support `in` and `np.max`. Blocks after the largest index are not executed.
            encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
            class_labels (`torch.Tensor`, *optional*): required when the model has a class embedding.
            timestep_cond (`torch.Tensor`, *optional*): forwarded to `self.time_embedding`.
            attention_mask (`torch.Tensor`, *optional*): converted below into an additive bias
                (0 where the mask is 1, -10000 where it is 0).
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
                `self.processor` in
                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).

        Returns:
            `dict` with a single key `'up_ft'` mapping each requested up-block index to the detached
            output tensor of that block.

        Raises:
            ValueError: if the model has a class embedding and `class_labels` is None.
        """
        # By default samples have to be AT least a multiple of the overall upsampling factor.
        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
        # However, the upsampling interpolation output size can be forced to fit any upsampling size
        # on the fly if necessary.
        default_overall_up_factor = 2**self.num_upsamplers

        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
        forward_upsample_size = False
        upsample_size = None

        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
            # logger.info("Forward upsample size to force interpolation output size.")
            forward_upsample_size = True

        # prepare attention_mask
        if attention_mask is not None:
            # turn a {0, 1} keep-mask into an additive bias and add a broadcast dimension
            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
            attention_mask = attention_mask.unsqueeze(1)

        # 0. center input if necessary
        if self.config.center_input_sample:
            sample = 2 * sample - 1.0

        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == 'mps'
            # pick the narrower dtype on MPS and the 64-bit one elsewhere
            if isinstance(timestep, float):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            # promote a 0-d tensor to shape (1,) so it can be expanded below
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timesteps = timesteps.expand(sample.shape[0])

        t_emb = self.time_proj(timesteps)

        # timesteps does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=self.dtype)

        emb = self.time_embedding(t_emb, timestep_cond)

        if self.class_embedding is not None:
            if class_labels is None:
                raise ValueError('class_labels should be provided when num_class_embeds > 0')

            if self.config.class_embed_type == 'timestep':
                class_labels = self.time_proj(class_labels)

            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
            emb = emb + class_emb

        # 2. pre-process
        sample = self.conv_in(sample)

        # 3. down
        # residuals of every down block are stacked so the up blocks can pop them in reverse order
        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, 'has_cross_attention') and downsample_block.has_cross_attention:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                    cross_attention_kwargs=cross_attention_kwargs,
                )
            else:
                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)

            down_block_res_samples += res_samples

        # 4. mid
        if self.mid_block is not None:
            sample = self.mid_block(
                sample,
                emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
            )

        # 5. up
        # maps up-block index -> detached block output, for the requested indices only
        up_ft = {}

        for i, upsample_block in enumerate(self.up_blocks):

            # nothing beyond the largest requested index is needed, so stop early
            if i > np.max(up_ft_indices):
                break

            is_final_block = i == len(self.up_blocks) - 1

            # take this block's skip connections off the end of the residual stack
            res_samples = down_block_res_samples[-len(upsample_block.resnets):]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

            # if we have not reached the final block and need to forward the
            # upsample size, we do it here
            if not is_final_block and forward_upsample_size:
                upsample_size = down_block_res_samples[-1].shape[2:]

            if hasattr(upsample_block, 'has_cross_attention') and upsample_block.has_cross_attention:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    encoder_hidden_states=encoder_hidden_states,
                    cross_attention_kwargs=cross_attention_kwargs,
                    upsample_size=upsample_size,
                    attention_mask=attention_mask,
                )
            else:
                sample = upsample_block(
                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
                )

            if i in up_ft_indices:
                # detach so the stored feature does not keep the autograd graph alive
                up_ft[i] = sample.detach()

        output = {}
        output['up_ft'] = up_ft

        return output


class OneStepSDPipeline(StableDiffusionPipeline):
    """Stable Diffusion pipeline that noises an image once and runs a single U-Net pass."""

    @torch.no_grad()
    def __call__(
        self,
        img_tensor,
        t,
        up_ft_indices,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None
    ):
        """Encode `img_tensor`, add scheduler noise at step `t`, and return the U-Net output.

        Args:
            img_tensor: image batch handed to `self.vae.encode`.
            t: timestep value; converted to a long tensor on the execution device.
            up_ft_indices: forwarded positionally to the U-Net as its third argument.
            prompt_embeds: passed to the U-Net as `encoder_hidden_states`.
            cross_attention_kwargs: passed through to the U-Net unchanged.

        Returns:
            Whatever `self.unet` returns for the noised latents.
        """
        exec_device = self._execution_device

        # VAE posterior sample, scaled by the VAE's configured scaling factor
        posterior = self.vae.encode(img_tensor).latent_dist
        clean_latents = posterior.sample() * self.vae.config.scaling_factor

        timestep = torch.tensor(t, dtype=torch.long, device=exec_device)
        gaussian = torch.randn_like(clean_latents).to(exec_device)
        noisy_latents = self.scheduler.add_noise(clean_latents, gaussian, timestep)

        return self.unet(
            noisy_latents,
            timestep,
            up_ft_indices,
            encoder_hidden_states=prompt_embeds,
            cross_attention_kwargs=cross_attention_kwargs,
        )


class SDFeaturizer:
    """Extracts Stable Diffusion U-Net features for an image using a one-step noising pipeline."""

    def __init__(self, sd_id='pretrained_models/stable-diffusion-v1-4', weight_dtype=torch.float32):
        """Load the U-Net and pipeline from `sd_id`, move them to CUDA and cache the empty-prompt embedding.

        Args:
            sd_id: path or hub id passed to `from_pretrained` (the "fp16" weight variant is requested).
            weight_dtype: dtype the loaded U-Net and pipeline are cast to.
        """
        unet = MyUNet2DConditionModel.from_pretrained(sd_id, subfolder='unet', variant="fp16").to(weight_dtype)
        onestep_pipe = OneStepSDPipeline.from_pretrained(sd_id, unet=unet, safety_checker=None, variant="fp16").to(weight_dtype)
        # only the encoder half of the VAE is used by the one-step pipeline
        onestep_pipe.vae.decoder = None
        onestep_pipe.scheduler = DDIMScheduler.from_pretrained(sd_id, subfolder='scheduler')
        gc.collect()
        onestep_pipe = onestep_pipe.to('cuda')
        onestep_pipe.enable_attention_slicing()
        self.pipe = onestep_pipe

        null_prompt = ''
        self.null_prompt_embeds = self.pipe.encode_prompt(
            prompt=null_prompt,
            device='cuda',
            num_images_per_prompt=1,
            do_classifier_free_guidance=False)[0] # [1, 77, dim]

    @torch.no_grad()
    def forward(self,
                img_tensor,
                prompt,
                t=(261, 0),
                up_ft_index=(1, 2),
                ensemble_size=8):
        '''
        Run two one-step feature extractions on the same image.

        The first pass uses the embedding of `prompt`, timestep `t[0]` and up block
        `up_ft_index[0]`; the second uses the cached empty-prompt embedding,
        timestep `t[1]` and up block `up_ft_index[1]`. Each pass averages the
        features over the `ensemble_size` repeated copies of the image.

        Args:
            img_tensor: should be a single torch tensor in the shape of [1, C, H, W] or [C, H, W]
            prompt: the prompt to use, a string
            t: pair of time steps (first pass, second pass); any indexable sequence works
            up_ft_index: pair of up-block indices (first pass, second pass)
            ensemble_size: the number of repeated images used in the batch to extract features
        Return:
            (unet_ft1, unet_ft2): two tensors, each with a leading dimension of 1
            (the mean over the ensemble is taken with keepdim=True)
        '''
        first_t, second_t = t[0], t[1]
        first_block, second_block = up_ft_index[0], up_ft_index[1]

        img_tensor = img_tensor.repeat(ensemble_size, 1, 1, 1).cuda() # ensem, c, h, w
        prompt_embeds = self.pipe.encode_prompt(
            prompt=prompt,
            device='cuda',
            num_images_per_prompt=1,
            do_classifier_free_guidance=False)[0] # [1, 77, dim]
        prompt_embeds = prompt_embeds.repeat(ensemble_size, 1, 1)

        # pass 1: prompt-conditioned features
        unet_ft1 = self.pipe(
            img_tensor=img_tensor,
            t=first_t,
            up_ft_indices=[first_block],
            prompt_embeds=prompt_embeds)
        unet_ft1 = unet_ft1['up_ft'][first_block] # ensem, c, h, w
        unet_ft1 = unet_ft1.mean(0, keepdim=True) # 1,c,h,w

        # pass 2: features conditioned on the empty prompt
        null_prompt_embeds = self.null_prompt_embeds.repeat(ensemble_size, 1, 1)
        unet_ft2 = self.pipe(
            img_tensor=img_tensor,
            t=second_t,
            up_ft_indices=[second_block],
            prompt_embeds=null_prompt_embeds)
        unet_ft2 = unet_ft2['up_ft'][second_block] # ensem, c, h, w
        unet_ft2 = unet_ft2.mean(0, keepdim=True) # 1,c,h,w

        return unet_ft1, unet_ft2


class DIFT_Demo:
    """Matches a point in a source feature map against a target feature map by cosine similarity."""

    def __init__(self, source_img, source_dift, source_img_size):
        """Store the source image, its feature map (NCHW) and the size the features are upsampled to."""
        self.source_img = source_img
        self.source_dift = source_dift  # NCHW
        self.source_img_size = source_img_size

    @torch.no_grad()
    def query(self, target_img, target_dift, target_img_size, query_point, target_point, visualize=False):
        """Compare the source feature at `query_point` with every target location.

        Both points are indexed as (row, col) after rounding. Source and target
        features are bilinearly upsampled to `self.source_img_size` and
        `target_img_size` respectively before comparison.

        Args:
            target_img: image blended with the heat map; only used when `visualize` is true.
            target_dift: target feature map (NCHW).
            target_img_size: size the target features are upsampled to.
            query_point: (row, col) location in the upsampled source features.
            target_point: (row, col) location in the upsampled target features.
            visualize: when true, also build an overlay image of the similarity map.

        Returns:
            (target feature vector at `target_point`, cosine similarity at
            `target_point`, overlay image or None).
        """
        channels = self.source_dift.size(1)
        src_row = int(np.round(query_point[0]))
        src_col = int(np.round(query_point[1]))

        src_up = nn.Upsample(size=self.source_img_size, mode='bilinear')(self.source_dift)
        query_vec = src_up[0, :, src_row, src_col].view(1, channels, 1, 1)  # 1, C, 1, 1

        tgt_up = nn.Upsample(size=target_img_size, mode='bilinear')(target_dift)
        similarity = nn.CosineSimilarity(dim=1)(query_vec, tgt_up).cpu().numpy()  # N, H, W
        sim_map = similarity[0]

        # (row, col) of the best-matching target location
        best_row, best_col = np.unravel_index(sim_map.argmax(), sim_map.shape)
        tgt_row = int(np.round(target_point[0]))
        tgt_col = int(np.round(target_point[1]))

        blended_image = None
        if visualize:
            lo, hi = np.min(sim_map), np.max(sim_map)
            normalized = (sim_map - lo) / (hi - lo)

            colormap = plt.get_cmap('viridis')
            heat_rgb = (colormap(normalized) * 255)[..., :3].astype(np.uint8)

            alpha, radius, color = 0.5, 3, (0, 255, 0)
            blended_image = Image.blend(target_img, Image.fromarray(heat_rgb), alpha=alpha)
            canvas = ImageDraw.Draw(blended_image)
            # mark the best match, then the requested target point
            for col, row in ((best_col, best_row), (tgt_col, tgt_row)):
                canvas.ellipse((col - radius, row - radius, col + radius, row + radius), fill=color)

        dift_feat = tgt_up[0, :, tgt_row, tgt_col]
        confidence = similarity[0, tgt_row, tgt_col]
        return dift_feat, confidence, blended_image


================================================
FILE: mimicmotion/utils/flow_utils.py
================================================
from PIL import Image, ImageOps
import scipy.ndimage as ndimage
import cv2
import random
import numpy as np
from scipy.ndimage.filters import maximum_filter
from scipy import signal
cv2.ocl.setUseOpenCL(False)
import torch
import torch.nn as nn
class ForwardWarp(nn.Module):
    """Forward-warp (splat) an image along a flow field.

    Every source pixel is scattered to the four integer positions surrounding
    its displaced location, weighted by an unnormalized Gaussian of the
    distance to each corner. All intermediate tensors are created on the
    device of the input image, so the module works on CPU as well as CUDA.
    """

    def __init__(
        self,
    ):
        super(ForwardWarp, self).__init__()

    def forward(self, img, flo):
        """
        -img: image (N, C, H, W)
        -flo: optical flow (N, 2, H, W); channel 0 is added to the column
              index and channel 1 to the row index of each pixel

        Returns (imgw, o): the accumulated weighted image and the accumulated
        weights, both (N, C, H, W).
        """

        # (x1, y1)		(x1, y2)
        # +---------------+
        # |				  |
        # |	o(x, y) 	  |
        # |				  |
        # |				  |
        # |				  |
        # |				  |
        # +---------------+
        # (x2, y1)		(x2, y2)

        _, C, _, _ = img.size()

        # split the flow into its two components (x shifts rows, y shifts columns here)
        y = flo[:, 0:1, :, :]
        x = flo[:, 1:2, :, :]

        x = x.repeat(1, C, 1, 1)
        y = y.repeat(1, C, 1, 1)

        # Four corners of the square: (x1, y1), (x1, y2), (x2, y1), (x2, y2)
        x1 = torch.floor(x)
        x2 = x1 + 1
        y1 = torch.floor(y)
        y2 = y1 + 1

        # firstly, get gaussian weights
        w11, w12, w21, w22 = self.get_gaussian_weights(x, y, x1, x2, y1, y2)

        # secondly, sample each weighted corner
        img11, o11 = self.sample_one(img, x1, y1, w11)
        img12, o12 = self.sample_one(img, x1, y2, w12)
        img21, o21 = self.sample_one(img, x2, y1, w21)
        img22, o22 = self.sample_one(img, x2, y2, w22)

        imgw = img11 + img12 + img21 + img22
        o = o11 + o12 + o21 + o22

        return imgw, o

    def get_gaussian_weights(self, x, y, x1, x2, y1, y2):
        """Return exp(-squared distance) from (x, y) to each of the four corners."""
        w11 = torch.exp(-((x - x1) ** 2 + (y - y1) ** 2))
        w12 = torch.exp(-((x - x1) ** 2 + (y - y2) ** 2))
        w21 = torch.exp(-((x - x2) ** 2 + (y - y1) ** 2))
        w22 = torch.exp(-((x - x2) ** 2 + (y - y2) ** 2))

        return w11, w12, w21, w22

    def sample_one(self, img, shiftx, shifty, weight):
        """
        Scatter-add `img * weight` and `weight` to the shifted pixel positions.

        Input:
                -img (N, C, H, W)
                -shiftx, shifty (N, C, H, W): integer-valued row / column shifts
                -weight (N, C, H, W)
        Output:
                -(img_warp, one_warp), both (N, C, H, W); contributions whose
                 destination falls outside the image are dropped
        """

        N, C, H, W = img.size()
        device = img.device
        full_shape = (N, C, H, W)

        # flatten all
        flat_shiftx = shiftx.reshape(-1)
        flat_shifty = shifty.reshape(-1)
        flat_weight = weight.reshape(-1)
        flat_img = img.reshape(-1)

        # row / column / batch / channel index of every source element
        flat_basex = torch.arange(H, device=device).view(1, 1, H, 1).expand(full_shape).reshape(-1)
        flat_basey = torch.arange(W, device=device).view(1, 1, 1, W).expand(full_shape).reshape(-1)
        idxn = torch.arange(N, device=device).view(N, 1, 1, 1).expand(full_shape).reshape(-1)
        idxc = torch.arange(C, device=device).view(1, C, 1, 1).expand(full_shape).reshape(-1)

        # destination row / column of every source element
        idxx = flat_shiftx.long() + flat_basex
        idxy = flat_shifty.long() + flat_basey

        # keep only destinations that land inside the image
        mask = (idxx >= 0) & (idxx < H) & (idxy >= 0) & (idxy < W)

        # linear index into the flattened (N, C, H, W) output
        ids = idxn * C * H * W + idxc * H * W + idxx * W + idxy
        ids_mask = torch.masked_select(ids, mask)

        # accumulate=True is required: several sources may hit the same destination
        values = torch.masked_select(flat_img * flat_weight, mask)
        img_warp = torch.zeros(N * C * H * W, dtype=values.dtype, device=device)
        img_warp.put_(ids_mask, values, accumulate=True)

        weights = torch.masked_select(flat_weight, mask)
        one_warp = torch.zeros(N * C * H * W, dtype=weights.dtype, device=device)
        one_warp.put_(ids_mask, weights, accumulate=True)

        return img_warp.view(N, C, H, W), one_warp.view(N, C, H, W)

def get_edge(data, blur=False):
    """Return the per-pixel Sobel gradient magnitude of `data`, summed over channels.

    Args:
        data: (H, W, C) array.
        blur: when true, apply a 3x3 Gaussian blur (sigma 1) with OpenCV first.

    Returns:
        (H, W) array: the sum over channels of sqrt(gx**2 + gy**2), where gx and gy
        are 'same'-size convolutions with symmetric boundary handling.
    """
    if blur:
        data = cv2.GaussianBlur(data, (3, 3), 1.)

    kernel_x = np.array([[1,0,-1],[2,0,-2],[1,0,-1]]).astype(np.float32)
    kernel_y = kernel_x.T

    def _magnitude(channel):
        grad_x = signal.convolve2d(channel, kernel_x, boundary='symm', mode='same')
        grad_y = signal.convolve2d(channel, kernel_y, boundary='symm', mode='same')
        return np.sqrt(grad_x**2 + grad_y**2)

    return sum(_magnitude(data[:, :, k]) for k in range(data.shape[2]))

def get_max(score, bbox):
    """Return the maximum of `score` inside `bbox`, clamped to the array bounds.

    `bbox` is indexed as (row_start, row_end, col_start, col_end); the end
    indices are exclusive.
    """
    n_rows, n_cols = score.shape[0], score.shape[1]
    row_window = slice(max(0, bbox[0]), min(n_rows, bbox[1]))
    col_window = slice(max(0, bbox[2]), min(n_cols, bbox[3]))
    return score[row_window, col_window].max()

def nms(score, ks):
    """Non-maximum suppression: zero every entry that is below the maximum of its ks x ks window.

    Args:
        score: array of scores.
        ks: window side length; must be odd.

    Returns:
        A copy of `score` in which non-maximal entries are set to 0.
    """
    assert ks % 2 == 1
    window = np.ones((ks, ks))
    neighborhood_max = maximum_filter(score, footprint=window)
    kept = score.copy()
    suppressed = score < neighborhood_max
    kept[suppressed] = 0.
    return kept

def image_flow_crop(img1, img2, flow, crop_size, phase):
    """Crop two images and their flow field to `crop_size`, zero-padding first when too small.

    Args:
        img1, img2: images exposing `height`, `width` and `crop` (padding uses `ImageOps.expand`).
        flow: (H, W, 2) array aligned with `img1`.
        crop_size: (crop_height, crop_width).
        phase: 'train' picks a random crop offset; any other value centers the crop.

    Returns:
        (cropped img1, cropped img2, cropped flow, (row_offset, col_offset)).
    """
    assert len(crop_size) == 2
    crop_h, crop_w = crop_size[0], crop_size[1]
    src_h, src_w = img1.height, img1.width

    pad_h = max(crop_h - src_h, 0)
    pad_w = max(crop_w - src_w, 0)
    pad_top = int(pad_h / 2)
    pad_left = int(pad_w / 2)

    if pad_h > 0 or pad_w > 0:
        # embed the flow into a zero canvas and pad the images with black by the same amounts
        padded_flow = np.zeros((src_h + pad_h, src_w + pad_w, 2), dtype=np.float32)
        padded_flow[pad_top:pad_top + src_h, pad_left:pad_left + src_w, :] = flow
        flow = padded_flow
        border = (pad_left, pad_top, pad_w - pad_left, pad_h - pad_top)
        img1 = ImageOps.expand(img1, border=border, fill=(0,0,0))
        img2 = ImageOps.expand(img2, border=border, fill=(0,0,0))

    slack_h = img1.height - crop_h
    slack_w = img1.width - crop_w
    if phase == 'train':
        hoff = int(np.random.rand() * slack_h)
        woff = int(np.random.rand() * slack_w)
    else:
        hoff = slack_h // 2
        woff = slack_w // 2

    box = (woff, hoff, woff + crop_w, hoff + crop_h)
    cropped1 = img1.crop(box)
    cropped2 = img2.crop(box)
    cropped_flow = flow[hoff:hoff + crop_h, woff:woff + crop_w, :]
    return cropped1, cropped2, cropped_flow, (hoff, woff)

def image_crop(img, crop_size):
    """Center-crop `img` to `crop_size`, padding with black first when the image is too small.

    Args:
        img: image exposing `height`, `width` and `crop`.
        crop_size: (crop_height, crop_width).

    Returns:
        (cropped image, (left_padding, top_padding)).
    """
    crop_h, crop_w = crop_size[0], crop_size[1]
    pad_h = max(crop_h - img.height, 0)
    pad_w = max(crop_w - img.width, 0)
    pad_top = int(pad_h / 2)
    pad_left = int(pad_w / 2)

    needs_padding = pad_h > 0 or pad_w > 0
    if needs_padding:
        border = (pad_left, pad_top, pad_w - pad_left, pad_h - pad_top)
        img = ImageOps.expand(img, border=border, fill=(0,0,0))

    top = (img.height - crop_h) // 2
    left = (img.width - crop_w) // 2
    cropped = img.crop((left, top, left + crop_w, top + crop_h))
    return cropped, (pad_left, pad_top)

def image_flow_resize(img1, img2, flow, short_size=None, long_size=None):
    """Resize two images and their flow so the short (or long) side equals the given size.

    Exactly one of `short_size` / `long_size` must be given. Images are resized
    with bicubic interpolation; the flow is resized bilinearly and its values
    are multiplied by the height ratio.

    Returns:
        (resized img1, resized img2, resized flow, new_height / old_height).
    """
    assert (short_size is None) ^ (long_size is None)
    w, h = img1.width, img1.height

    use_short = short_size is not None
    target = short_size if use_short else long_size
    # the width is pinned to `target` when it is the side being constrained
    width_is_pinned = (w < h) == use_short
    if width_is_pinned:
        neww, newh = target, int(target / float(w) * h)
    else:
        neww, newh = int(target / float(h) * w), target

    new_size = (neww, newh)
    img1 = img1.resize(new_size, Image.BICUBIC)
    img2 = img2.resize(new_size, Image.BICUBIC)
    ratio = float(newh) / h
    resized_flow = cv2.resize(flow.copy(), new_size, interpolation=cv2.INTER_LINEAR)
    flow = resized_flow * ratio
    return img1, img2, flow, ratio

def image_resize(img, short_size=None, long_size=None):
    """Bicubically resize `img` so its short (or long) side equals the given size.

    Exactly one of `short_size` / `long_size` must be given.

    Returns:
        (resized image, [original_width, original_height]).
    """
    assert (short_size is None) ^ (long_size is None)
    w, h = img.width, img.height

    use_short = short_size is not None
    target = short_size if use_short else long_size
    # the width is pinned to `target` when it is the side being constrained
    width_is_pinned = (w < h) == use_short
    if width_is_pinned:
        neww, newh = target, int(target / float(w) * h)
    else:
        neww, newh = int(target / float(h) * w), target

    resized = img.resize((neww, newh), Image.BICUBIC)
    return resized, [w, h]


def image_pose_crop(img, posemap, crop_size, scale):
    """Center-crop an image and the matching region of a pose map stored at 1/`scale` resolution.

    Args:
        img: image exposing `height`, `width` and `crop`; must be at least `crop_size`.
        posemap: array whose first two axes are rows and columns at 1/`scale` resolution.
        crop_size: (crop_height, crop_width).
        scale: integer downscale factor of `posemap` relative to `img`.

    Returns:
        (cropped image, cropped pose map).
    """
    assert len(crop_size) == 2
    crop_h, crop_w = crop_size[0], crop_size[1]
    assert crop_h <= img.height
    assert crop_w <= img.width

    top = (img.height - crop_h) // 2
    left = (img.width - crop_w) // 2
    img = img.crop((left, top, left + crop_w, top + crop_h))

    # same window expressed in pose-map coordinates
    map_top, map_left = top // scale, left // scale
    map_rows = slice(map_top, map_top + crop_h // scale)
    map_cols = slice(map_left, map_left + crop_w // scale)
    posemap = posemap[map_rows, map_cols, :]
    return img, posemap

def neighbor_elim(ph, pw, d):
    """Randomly thin out points so that no two survivors are closer than `d` on both axes.

    For every pair of still-valid, distinct points whose row distance and
    column distance are both below `d`, one of the two (chosen by a coin flip)
    is dropped.

    Args:
        ph, pw: 1-D arrays of row and column coordinates.
        d: distance threshold applied to each axis separately.

    Returns:
        (ph, pw) restricted to the surviving points.
    """
    keep = np.ones((len(ph))).astype(np.int64)
    # pairwise absolute differences along each axis
    row_gap = np.fabs(ph[:, np.newaxis] - ph[np.newaxis, :])
    col_gap = np.fabs(pw[:, np.newaxis] - pw[np.newaxis, :])
    first, second = np.where((row_gap < d) & (col_gap < d))

    for a, b in zip(first, second):
        if not (keep[a] and keep[b] and a != b):
            continue
        loser = a if np.random.rand() > 0.5 else b
        keep[loser] = 0

    survivors = np.where(keep==1)
    return ph[survivors], pw[survivors]

def remove_border(mask):
    """Zero the first and last row and column of `mask` in place (returns None)."""
    last_row = mask.shape[0] - 1
    last_col = mask.shape[1] - 1
    for row in (0, last_row):
        mask[row, :] = 0
    for col in (0, last_col):
        mask[:, col] = 0

def flow_sampler(flow, strategy=('grid',), bg_ratio=1./6400, nms_ks=15, max_num_guide=-1, guidepoint=None):
    """Sample sparse guidance points from a dense flow field.

    Args:
        flow: (H, W, 2) flow array.
        strategy: iterable of strategy names; the points of all listed strategies are combined.
            'full' short-circuits and returns the whole field. The others are
            'grid', 'uniform', 'gradnms', 'watershed', 'single' and 'specified'.
        bg_ratio: sampling density for 'grid' (stride = int(sqrt(1 / bg_ratio))) and
            'uniform' (int(bg_ratio * H * W) random points).
        nms_ks: odd window size used for non-maximum suppression in 'gradnms' / 'watershed'.
        max_num_guide: maximum number of points kept after a random shuffle; -1 keeps all.
        guidepoint: array of points for 'specified'; column 0 is used as the column index
            and column 1 as the row index.

    Returns:
        (sparse, mask): `sparse` has the dtype and shape of `flow` and holds the flow at the
        selected points (zero elsewhere); `mask` is an int64 array of the same shape that is
        1 at the selected points.
    """
    assert bg_ratio >= 0 and bg_ratio <= 1, "sampling ratio must be in (0, 1]"
    for s in strategy:
        assert s in ['grid', 'uniform', 'gradnms', 'watershed', 'single', 'full', 'specified'], "No such strategy: {}".format(s)
    h = flow.shape[0]
    w = flow.shape[1]
    ds = max(1, max(h, w) // 400) # reduce computation

    if 'full' in strategy:
        sparse = flow.copy()
        # np.int was removed in NumPy 1.24; int64 matches the mask dtype used below
        mask = np.ones(flow.shape, dtype=np.int64)
        return sparse, mask

    pts_h = []
    pts_w = []
    if 'grid' in strategy:
        stride = int(np.sqrt(1./bg_ratio))
        # center the grid inside the image
        mesh_start_h = int((h - h // stride * stride) / 2)
        mesh_start_w = int((w - w // stride * stride) / 2)
        mesh = np.meshgrid(np.arange(mesh_start_h, h, stride), np.arange(mesh_start_w, w, stride))
        pts_h.append(mesh[0].flat)
        pts_w.append(mesh[1].flat)
    if 'uniform' in strategy:
        pts_h.append(np.random.randint(0, h, int(bg_ratio * h * w)))
        pts_w.append(np.random.randint(0, w, int(bg_ratio * h * w)))
    if "gradnms" in strategy:
        # difference between a coarse and a fine box-filtered edge map, then NMS
        ks = w // ds // 20
        edge = get_edge(flow[::ds,::ds,:])
        kernel = np.ones((ks, ks), dtype=np.float32) / (ks * ks)
        subkernel = np.ones((ks//2, ks//2), dtype=np.float32) / (ks//2 * ks//2)
        score = signal.convolve2d(edge, kernel, boundary='symm', mode='same')
        subscore = signal.convolve2d(edge, subkernel, boundary='symm', mode='same')
        score = score / score.max() - subscore / subscore.max()
        nms_res = nms(score, nms_ks)
        pth, ptw = np.where(nms_res > 0.1)
        pts_h.append(pth * ds)
        pts_w.append(ptw * ds)
    if "watershed" in strategy:
        # local maxima of the distance to the nearest flow edge
        edge = get_edge(flow[::ds,::ds,:])
        edge /= max(edge.max(), 0.01)
        edge = (edge > 0.1).astype(np.float32)
        watershed = ndimage.distance_transform_edt(1-edge)
        nms_res = nms(watershed, nms_ks)
        remove_border(nms_res)
        pth, ptw = np.where(nms_res > 0)
        pth, ptw = neighbor_elim(pth, ptw, (nms_ks-1)/2)
        pts_h.append(pth * ds)
        pts_w.append(ptw * ds)
    if "single" in strategy:
        # one random pixel with non-zero flow
        pth, ptw = np.where((flow[:,:,0] != 0) | (flow[:,:,1] != 0))
        randidx = np.random.randint(len(pth))
        pts_h.append(pth[randidx:randidx+1])
        pts_w.append(ptw[randidx:randidx+1])
    if 'specified' in strategy:
        assert guidepoint is not None, "if using \"specified\", switch \"with_info\" on."
        pts_h.append(guidepoint[:,1])
        pts_w.append(guidepoint[:,0])

    pts_h = np.concatenate(pts_h)
    pts_w = np.concatenate(pts_w)

    # shuffle, then keep at most max_num_guide points (-1 means no limit)
    num_points = len(pts_h)
    if max_num_guide == -1:
        num_keep = num_points
    else:
        num_keep = max(0, min(max_num_guide, num_points))
    selidx = np.random.permutation(num_points)[:num_keep]
    pts_h = pts_h[selidx]
    pts_w = pts_w[selidx]

    sparse = np.zeros(flow.shape, dtype=flow.dtype)
    mask = np.zeros(flow.shape, dtype=np.int64)

    sparse[:, :, 0][(pts_h, pts_w)] = flow[:, :, 0][(pts_h, pts_w)]
    sparse[:, :, 1][(pts_h, pts_w)] = flow[:, :, 1][(pts_h, pts_w)]

    mask[:,:,0][(pts_h, pts_w)] = 1
    mask[:,:,1][(pts_h, pts_w)] = 1
    return sparse, mask

def image_flow_aug(img1, img2, flow, flip_horizon=True):
    """Randomly (probability 0.5) mirror both images and the flow left-to-right.

    When flipped, the flow columns are reversed and channel 0 is negated.
    With `flip_horizon` false the inputs are returned untouched and no random
    number is drawn.

    Returns:
        (img1, img2, flow), flipped or unchanged.
    """
    if not flip_horizon or not random.random() < 0.5:
        return img1, img2, flow

    mirrored1 = img1.transpose(Image.FLIP_LEFT_RIGHT)
    mirrored2 = img2.transpose(Image.FLIP_LEFT_RIGHT)
    mirrored_flow = flow[:,::-1,:].copy()
    mirrored_flow[:,:,0] = -mirrored_flow[:,:,0]
    return mirrored1, mirrored2, mirrored_flow

def flow_aug(flow, reverse=True, scale=True, rotate=True):
    """Randomly augment a flow field by sign reversal, scaling and rotation.

    Args:
        flow: (H, W, 2) float array.
        reverse: when true, negate the flow with probability 0.5.
        scale: when true, multiply the flow by a factor drawn uniformly from [0.5, 2.0].
        rotate: when true, rotate every flow vector by one common random angle
            with probability 0.5.

    Returns:
        The augmented flow. NOTE: when neither reversal nor scaling produced a
        new array, the rotation writes into the array that was passed in.
    """
    if reverse:
        if random.random() < 0.5:
            flow = -flow
    if scale:
        rand_scale = random.uniform(0.5, 2.0)
        flow = flow * rand_scale
    if rotate and random.random() < 0.5:
        magnitude = np.sqrt(np.square(flow[:,:,0]) + np.square(flow[:,:,1]))
        # arctan2 keeps the quadrant and is defined for zero vectors; arctan(v / u)
        # produced NaN for zero flow and was off by pi whenever u < 0
        alpha = np.arctan2(flow[:,:,1], flow[:,:,0])
        theta = random.uniform(0, np.pi*2)
        flow[:,:,0] = magnitude * np.cos(alpha + theta)
        flow[:,:,1] = magnitude * np.sin(alpha + theta)
    return flow

def draw_gaussian(img, pt, sigma, type='Gaussian'):
    """Write a (6 * sigma + 1)-wide blob centered at `pt` into `img`, in place.

    The part of the blob that falls inside the image overwrites the existing
    pixel values.

    Args:
        img: 2-D array that is modified in place.
        pt: (x, y) center, i.e. (column, row).
        sigma: blob scale; the patch covers +/- 3 * sigma around the center.
        type: 'Gaussian' (peak value 1) or 'Cauchy'.

    Returns:
        `img` (the same object). It is returned unchanged when the patch lies
        entirely outside the image.

    Raises:
        ValueError: if the patch overlaps the image and `type` is not a supported name.
    """
    # Check that any part of the gaussian is in-bounds
    ul = [int(pt[0] - 3 * sigma), int(pt[1] - 3 * sigma)]
    br = [int(pt[0] + 3 * sigma + 1), int(pt[1] + 3 * sigma + 1)]
    if (ul[0] >= img.shape[1] or ul[1] >= img.shape[0] or
            br[0] < 0 or br[1] < 0):
        # If not, just return the image as is
        return img

    # Generate the patch on a local grid centered at (x0, y0)
    size = 6 * sigma + 1
    x = np.arange(0, size, 1, float)
    y = x[:, np.newaxis]
    x0 = y0 = size // 2
    dist_sq = (x - x0) ** 2 + (y - y0) ** 2
    # The gaussian is not normalized, we want the center value to equal 1
    if type == 'Gaussian':
        g = np.exp(- dist_sq / (2 * sigma ** 2))
    elif type == 'Cauchy':
        g = sigma / ((dist_sq + sigma ** 2) ** 1.5)
    else:
        # previously fell through and failed later with an UnboundLocalError
        raise ValueError("Unsupported type: {!r} (expected 'Gaussian' or 'Cauchy')".format(type))

    # Usable gaussian range
    g_x = max(0, -ul[0]), min(br[0], img.shape[1]) - ul[0]
    g_y = max(0, -ul[1]), min(br[1], img.shape[0]) - ul[1]
    # Image range
    img_x = max(0, ul[0]), min(br[0], img.shape[1])
    img_y = max(0, ul[1]), min(br[1], img.shape[0])

    img[img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[0]:g_y[1], g_x[0]:g_x[1]]
    return img


================================================
FILE: mimicmotion/utils/geglu_patch.py
================================================
import diffusers.models.activations


def patch_geglu_inplace():
    """Replace `GEGLU.forward` in diffusers with a variant that multiplies in place.

    The in-place `mul_` avoids allocating a separate output tensor for the
    gated product, which saves GPU memory.
    """
    def forward(self, hidden_states):
        # project once, then split the result into the value half and the gate half
        hidden_states = self.proj(hidden_states)
        hidden_states, gate = hidden_states.chunk(2, dim=-1)
        activated_gate = self.gelu(gate)
        return hidden_states.mul_(activated_gate)

    setattr(diffusers.models.activations.GEGLU, "forward", forward)


================================================
FILE: mimicmotion/utils/loader.py
================================================
import logging

import torch
import torch.utils.checkpoint
from diffusers.models import AutoencoderKLTemporalDecoder
from diffusers.schedulers import EulerDiscreteScheduler
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

from ..modules.unet import UNetSpatioTemporalConditionModel
from ..modules.pose_net import PoseNet
from ..modules.controlnet import ControlNetSVDModel
from ..pipelines.pipeline_mimicmotion import MimicMotionPipeline
from ..pipelines.pipeline_ctrl import Ctrl_Pipeline


logger = logging.getLogger(__name__)

class MimicMotionModel(torch.nn.Module):
    """Container module holding the components used to assemble the pipelines."""

    def __init__(self, base_model_path):
        """Build the base model components from a pretrained SVD checkpoint directory.

        The U-Net is instantiated from its config only (no weights are loaded
        for it here) and the pose net is freshly constructed; the VAE and image
        encoder are loaded as fp16 pretrained weights.

        Args:
            base_model_path (str): pretrained svd model path
        """
        super().__init__()
        half_precision = {"torch_dtype": torch.float16, "variant": "fp16"}

        unet_config = UNetSpatioTemporalConditionModel.load_config(base_model_path, subfolder="unet")
        self.unet = UNetSpatioTemporalConditionModel.from_config(unet_config)

        self.vae = AutoencoderKLTemporalDecoder.from_pretrained(
            base_model_path, subfolder="vae", **half_precision)
        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
            base_model_path, subfolder="image_encoder", **half_precision)

        self.noise_scheduler = EulerDiscreteScheduler.from_pretrained(
            base_model_path, subfolder="scheduler")
        self.feature_extractor = CLIPImageProcessor.from_pretrained(
            base_model_path, subfolder="feature_extractor")

        # pose net emits features with the U-Net's first block width
        first_block_channels = self.unet.config.block_out_channels[0]
        self.pose_net = PoseNet(noise_latent_channels=first_block_channels)

def create_ctrl_pipeline(infer_config, device):
    """Build a `Ctrl_Pipeline` with a ControlNet and load the pretrained weights.

    Args:
        infer_config: object exposing `base_model_path`, `ckpt_path` and `controlnet_path`.
        device (str or torch.device): "cpu" or "cuda:{device_id}". NOTE(review): not used
            by this function; the ControlNet is placed on the U-Net's current device.

    Returns:
        The assembled `Ctrl_Pipeline`.
    """
    models = MimicMotionModel(infer_config.base_model_path)
    base_weights = torch.load(infer_config.ckpt_path, map_location="cpu")
    models.load_state_dict(base_weights, strict=False)

    # the ControlNet is derived from the U-Net, then its own checkpoint is loaded on top
    controlnet = ControlNetSVDModel.from_unet(models.unet).to(device=models.unet.device)
    controlnet_weights = torch.load(infer_config.controlnet_path, map_location="cpu")
    controlnet.load_state_dict(controlnet_weights, strict=False)

    components = dict(
        vae=models.vae,
        image_encoder=models.image_encoder,
        unet=models.unet,
        controlnet=controlnet,
        scheduler=models.noise_scheduler,
        feature_extractor=models.feature_extractor,
        pose_net=models.pose_net,
    )
    return Ctrl_Pipeline(**components)

def create_pipeline(infer_config, device):
    """Build a `MimicMotionPipeline` and load the pretrained weights.

    Args:
        infer_config: object exposing `base_model_path` and `ckpt_path`.
        device (str or torch.device): "cpu" or "cuda:{device_id}". NOTE(review): not used
            by this function; the models stay wherever they were constructed.

    Returns:
        The assembled `MimicMotionPipeline`.
    """
    models = MimicMotionModel(infer_config.base_model_path)
    checkpoint = torch.load(infer_config.ckpt_path, map_location="cpu")
    models.load_state_dict(checkpoint, strict=False)

    components = dict(
        vae=models.vae,
        image_encoder=models.image_encoder,
        unet=models.unet,
        scheduler=models.noise_scheduler,
        feature_extractor=models.feature_extractor,
        pose_net=models.pose_net,
    )
    return MimicMotionPipeline(**components)


================================================
FILE: mimicmotion/utils/utils.py
================================================
import logging
from pathlib import Path
import av
from PIL import Image
import os
from scipy.interpolate import PchipInterpolator
import numpy as np
import pdb
import torch
import torch.nn.functional as F
from torchvision.io import write_video

logger = logging.getLogger(__name__)

@torch.no_grad()
def get_cmp_flow(cmp, frames, sparse_optical_flow, mask):
    '''
    Densify sparse flow frame by frame with `cmp.run`.

        cmp: object whose `run(frame, sparse_flow, mask)` returns the dense flow for one frame
        frames: [b, t, 3, 384, 384] tensor
        sparse_optical_flow: [b, t, 2, 384, 384] tensor
        mask: [b, t, 2, 384, 384] {0, 1} tensor

    Inputs are cast to float32 for `cmp.run`; the result is reshaped to
    [b, t, 2, 384, 384] and cast back to the dtype of `frames`.
    '''
    out_dtype = frames.dtype
    b, t, c, h, w = sparse_optical_flow.shape
    assert h == 384 and w == 384

    # merge batch and time so each row is a single frame
    flat_frames = frames.flatten(0, 1)
    flat_flow = sparse_optical_flow.flatten(0, 1)
    flat_mask = mask.flatten(0, 1)

    per_frame = [
        cmp.run(
            flat_frames[i:i+1].float(),
            flat_flow[i:i+1].float(),
            flat_mask[i:i+1].float(),
        )
        for i in range(b*t)
    ]
    dense = torch.cat(per_frame, dim=0).reshape(b, t, 2, h, w)
    return dense.to(dtype=out_dtype)


def sample_optical_flow(A, B, h, w):
    """Scatter per-point values ``B`` into a dense map at the locations ``A``.

    Args:
        A: tensor of shape (b, l, k, >=2). Component 0 indexes the ``h`` axis
            of the output and component 1 the ``w`` axis; both are truncated
            to integers and clamped into range.
        B: values written at those locations, assignable to a (b, l, k, 2)
            selection. Its dtype and device are used for the outputs.
        h, w: spatial size of the dense map.

    Returns:
        tuple: ``(sparse_optical_flow, mask)``, both of shape (b, l, h, w, 2).
        The mask is uint8, 1 where a point was written and 0 elsewhere,
        duplicated over the last axis.
    """
    b, l, k, _ = A.shape

    dense = torch.zeros((b, l, h, w, 2), dtype=B.dtype, device=B.device)
    hit = torch.zeros((b, l, h, w), dtype=torch.uint8, device=B.device)

    # Integer, in-bounds indices along the h and w axes respectively.
    h_index = torch.clip(A[..., 0].long(), 0, h - 1)
    w_index = torch.clip(A[..., 1].long(), 0, w - 1)

    # Matching batch / frame indices for every one of the k points.
    batch_index = torch.arange(b).view(b, 1, 1).repeat(1, l, k)
    frame_index = torch.arange(l).view(1, l, 1).repeat(b, 1, k)

    dense[batch_index, frame_index, h_index, w_index] = B
    hit[batch_index, frame_index, h_index, w_index] = 1

    return dense, hit.unsqueeze(-1).repeat(1, 1, 1, 1, 2)


@torch.no_grad()
def get_sparse_flow(poses, h, w, t):
    """Build a sparse flow map from keypoints, relative to the first frame.

    Args:
        poses: 4-D keypoint tensor indexed as (batch, frame, point, coord);
            presumably coord is (x, y) in pixels -- confirm against caller.
        h, w: spatial size of the output maps.
        t: number of frames in ``poses`` (used for the repeat counts).

    Returns:
        tuple: ``(sparse_optical_flow, mask)`` permuted to
        (batch, t - 1, 2, h, w). Each of the ``t - 1`` slices holds the
        displacement of every keypoint from frame 0 to a later frame, written
        at the keypoint's frame-0 location.
    """
    # Swap the two coordinates so component 0 addresses the h axis when sampling.
    swapped = torch.flip(poses, dims=[3])
    first_frame = swapped[:, 0:1]

    # Forward optical flow: displacement of every later frame w.r.t. frame 0.
    displacement = (swapped - first_frame.repeat(1, t, 1, 1))[:, 1:]
    anchors = first_frame.repeat(1, t - 1, 1, 1)

    # Restore the original coordinate order of the stored flow vectors.
    displacement = torch.flip(displacement, dims=[3])

    flow_map, flow_mask = sample_optical_flow(anchors, displacement, h, w)

    return flow_map.permute(0, 1, 4, 2, 3), flow_mask.permute(0, 1, 4, 2, 3)

def sample_inputs_flow(first_frame, poses, poses_subset):
    """Prepare sparse flow inputs at native resolution and at 384 x 384.

    Args:
        first_frame: 4-D image tensor; its last two dimensions are taken as
            height and width.
        poses: keypoint tensor forwarded to ``get_sparse_flow``; dimension 1
            is the number of frames.
        poses_subset: accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(controlnet_image, sparse_optical_flow, mask,
        first_frame_384, sparse_optical_flow_384, mask_384)``. When the frame
        is already 384 x 384 the last three entries are the very same objects
        as the first three.
    """
    _, _, ph, pw = first_frame.shape
    num_frames = poses.shape[1]

    sparse_optical_flow, mask = get_sparse_flow(poses, ph, pw, num_frames)

    if ph == 384 and pw == 384:
        # Already at the target resolution: reuse the native-size results.
        return first_frame, sparse_optical_flow, mask, first_frame, sparse_optical_flow, mask

    first_frame_384 = F.interpolate(first_frame, (384, 384))

    # Rescale coordinate 0 by width and coordinate 1 by height into a 384 grid.
    poses_384 = torch.zeros_like(poses)
    poses_384[:, :, :, 0] = poses[:, :, :, 0] / pw * 384
    poses_384[:, :, :, 1] = poses[:, :, :, 1] / ph * 384

    sparse_optical_flow_384, mask_384 = get_sparse_flow(poses_384, 384, 384, num_frames)

    return first_frame, sparse_optical_flow, mask, first_frame_384, sparse_optical_flow_384, mask_384

def pose2track(points_list, height, width):
    """Convert per-frame pose dicts into per-keypoint pixel tracks.

    Args:
        points_list: one dict per frame with keys ``'candidate'``,
            ``'subset'`` and ``'score'``. ``candidate[i]`` holds the (x, y) of
            keypoint ``i`` (scaled here by ``width`` / ``height``, so
            presumably normalised to [0, 1]); ``subset[0][i] == -1`` marks
            keypoint ``i`` as missing.
        height, width: frame size in pixels used for scaling and clamping.

    Returns:
        tuple: ``track_points`` of shape (18, f, 2) with clamped pixel
        coordinates (zeros for missing keypoints) and ``track_points_subsets``
        of shape (18, f, 1) holding the keypoint index, or -1 when missing.
    """
    num_frames = len(points_list)
    track_points = np.zeros((18, num_frames, 2))  # 18 x f x 2
    track_points_subsets = np.zeros((18, num_frames, 1))  # 18 x f x 1
    for frame_idx in range(num_frames):
        pose = points_list[frame_idx]
        candidates = pose['candidate']
        subsets = pose['subset'][0]
        _scores = pose['score']  # looked up but not used
        for joint in range(18):
            if subsets[joint] == -1:
                track_points_subsets[joint, frame_idx] = -1
                continue
            # Scale to pixels and clamp into the image.
            track_points[joint, frame_idx, 0] = max(min(candidates[joint][0] * width, width - 1), 0)
            track_points[joint, frame_idx, 1] = max(min(candidates[joint][1] * height, height - 1), 0)
            track_points_subsets[joint, frame_idx] = joint

    return track_points, track_points_subsets

def pose2track_batch(points_list, height, width, batch_size):
    """Batched variant of pose-to-track conversion.

    Args:
        points_list: one dict per frame with keys ``'candidate'``,
            ``'subset'`` and ``'score'``, each indexable by batch element.
            ``candidate[b][i]`` holds the (x, y) of keypoint ``i`` (scaled
            here by ``width`` / ``height``, so presumably normalised to
            [0, 1]); ``subset[b][0][i] == -1`` marks the keypoint as missing.
        height, width: frame size in pixels used for scaling and clamping.
        batch_size: number of batch elements to read.

    Returns:
        tuple: ``track_points`` of shape (batch_size, 18, f, 2) with clamped
        pixel coordinates (zeros for missing keypoints) and
        ``track_points_subsets`` of shape (batch_size, 18, f, 1) holding the
        keypoint index, or -1 when missing.
    """
    num_frames = len(points_list)
    track_points = np.zeros((batch_size, 18, num_frames, 2))  # b x 18 x f x 2
    track_points_subsets = np.zeros((batch_size, 18, num_frames, 1))  # b x 18 x f x 1
    for batch_idx in range(batch_size):
        for frame_idx in range(num_frames):
            pose = points_list[frame_idx]
            candidates = pose['candidate'][batch_idx]
            subsets = pose['subset'][batch_idx][0]
            _scores = pose['score'][batch_idx]  # looked up but not used
            for joint in range(18):
                if subsets[joint] == -1:
                    track_points_subsets[batch_idx, joint, frame_idx] = -1
                    continue
                # Scale to pixels and clamp into the image.
                track_points[batch_idx, joint, frame_idx, 0] = max(min(candidates[joint][0] * width, width - 1), 0)
                track_points[batch_idx, joint, frame_idx, 1] = max(min(candidates[joint][1] * height, height - 1), 0)
                track_points_subsets[batch_idx, joint, frame_idx] = joint

    return track_points, track_points_subsets

def points_to_flows_batch(points_list, model_length, height, width, batch_size):
    """Rasterise keypoint tracks into sparse frame-to-frame flow maps (batched).

    For every batch element, keypoint and frame ``i`` in which the keypoint is
    present, the displacement from frame ``i`` to frame ``i + 1`` is written
    at the keypoint's pixel location in frame ``i``.

    Args:
        points_list: per-frame pose dicts, as consumed by ``pose2track_batch``.
        model_length: number of frames the flow should cover; the result holds
            ``model_length - 1`` flow maps.
        height, width: frame size in pixels.
        batch_size: number of batch elements.

    Returns:
        np.ndarray: array of shape
        (batch_size, model_length - 1, height, width, 2), zero wherever no
        keypoint starts.

    Tracks shorter than ``model_length`` are padded by repeating their last
    point, and the padded frames are marked as "keypoint missing" (-1) so
    they never act as a flow source. The previous ``track + [last] * n``
    padding was applied to a NumPy array (or a tuple), which performed
    broadcast addition / raised TypeError instead of appending points.
    """
    track_points, track_points_subsets = pose2track_batch(points_list, height, width, batch_size)
    input_drag = np.zeros((batch_size, model_length - 1, height, width, 2))
    for batch_idx in range(batch_size):
        for splited_track, points_subset in zip(track_points[batch_idx], track_points_subsets[batch_idx]):
            splited_track = np.asarray(splited_track, dtype=float)
            points_subset = np.asarray(points_subset).reshape(-1)
            if len(splited_track) == 0:
                # No frames at all: nothing to rasterise for this keypoint.
                continue
            if len(splited_track) == 1:  # stationary point
                # Give a single-frame track a one-pixel displacement target.
                splited_track = np.stack([splited_track[0], splited_track[0] + 1])
            missing_points = model_length - len(splited_track)
            if missing_points > 0:
                tail = np.repeat(splited_track[-1:], missing_points, axis=0)
                splited_track = np.concatenate([splited_track, tail], axis=0)
            missing_flags = (model_length - 1) - len(points_subset)
            if missing_flags > 0:
                padding = np.full(missing_flags, -1, dtype=points_subset.dtype)
                points_subset = np.concatenate([points_subset, padding])
            for i in range(model_length - 1):
                if points_subset[i] != -1:
                    start_point = splited_track[i]
                    end_point = splited_track[i + 1]
                    row, col = int(start_point[1]), int(start_point[0])
                    input_drag[batch_idx, i, row, col, 0] = end_point[0] - start_point[0]
                    input_drag[batch_idx, i, row, col, 1] = end_point[1] - start_point[1]
    return input_drag

def points_to_flows(points_list, model_length, height, width):
    """Rasterise keypoint tracks into sparse frame-to-frame flow maps.

    For every keypoint and frame ``i`` in which the keypoint is present, the
    displacement from frame ``i`` to frame ``i + 1`` is written at the
    keypoint's pixel location in frame ``i``.

    Args:
        points_list: per-frame pose dicts, as consumed by ``pose2track``.
        model_length: number of frames the flow should cover; the result holds
            ``model_length - 1`` flow maps.
        height, width: frame size in pixels.

    Returns:
        np.ndarray: array of shape (model_length - 1, height, width, 2), zero
        wherever no keypoint starts.

    Tracks shorter than ``model_length`` are padded by repeating their last
    point, and the padded frames are marked as "keypoint missing" (-1) so
    they never act as a flow source. The previous ``track + [last] * n``
    padding was applied to a NumPy array (or a tuple), which performed
    broadcast addition / raised TypeError instead of appending points.
    """
    track_points, track_points_subsets = pose2track(points_list, height, width)
    input_drag = np.zeros((model_length - 1, height, width, 2))

    for splited_track, points_subset in zip(track_points, track_points_subsets):
        splited_track = np.asarray(splited_track, dtype=float)
        points_subset = np.asarray(points_subset).reshape(-1)
        if len(splited_track) == 0:
            # No frames at all: nothing to rasterise for this keypoint.
            continue
        if len(splited_track) == 1:  # stationary point
            # Give a single-frame track a one-pixel displacement target.
            splited_track = np.stack([splited_track[0], splited_track[0] + 1])
        missing_points = model_length - len(splited_track)
        if missing_points > 0:
            tail = np.repeat(splited_track[-1:], missing_points, axis=0)
            splited_track = np.concatenate([splited_track, tail], axis=0)
        missing_flags = (model_length - 1) - len(points_subset)
        if missing_flags > 0:
            padding = np.full(missing_flags, -1, dtype=points_subset.dtype)
            points_subset = np.concatenate([points_subset, padding])
        for i in range(model_length - 1):
            if points_subset[i] != -1:
                start_point = splited_track[i]
                end_point = splited_track[i + 1]
                row, col = int(start_point[1]), int(start_point[0])
                input_drag[i, row, col, 0] = end_point[0] - start_point[0]
                input_drag[i, row, col, 1] = end_point[1] - start_point[1]
    return input_drag

def interpolate_trajectory(points, n_points):
    """Resample a 2-D trajectory to ``n_points`` samples with PCHIP.

    The input points are treated as evenly spaced on [0, 1]; x and y are
    interpolated independently and evaluated at ``n_points`` evenly spaced
    parameters.

    Args:
        points: sequence of points; only components 0 (x) and 1 (y) are used.
        n_points (int): number of output samples.

    Returns:
        list: ``n_points`` ``(x, y)`` tuples.
    """
    xs = []
    ys = []
    for pt in points:
        xs.append(pt[0])
        ys.append(pt[1])

    knots = np.linspace(0, 1, len(points))
    x_curve = PchipInterpolator(knots, xs)
    y_curve = PchipInterpolator(knots, ys)

    samples = np.linspace(0, 1, n_points)
    return list(zip(x_curve(samples), y_curve(samples)))


def bivariate_Gaussian(kernel_size, sig_x, sig_y, theta, grid=None, isotropic=True):
    """Build a normalized 2-D Gaussian kernel, isotropic or anisotropic.

    When ``isotropic`` is true only ``sig_x`` matters; ``sig_y`` and ``theta``
    are ignored.

    Args:
        kernel_size (int): side length K of the kernel.
        sig_x (float): standard deviation along x.
        sig_y (float): standard deviation along y (anisotropic mode only).
        theta (float): rotation in radians (anisotropic mode only).
        grid (ndarray, optional): coordinate grid of shape (K, K, 2) as
            produced by :func:`mesh_grid`. Built on demand when None.
        isotropic (bool): choose the isotropic covariance.

    Returns:
        kernel (ndarray): kernel whose entries sum to 1.
    """
    if grid is None:
        grid = mesh_grid(kernel_size)[0]

    if not isotropic:
        covariance = sigma_matrix2(sig_x, sig_y, theta)
    else:
        covariance = np.array([[sig_x**2, 0], [0, sig_x**2]])

    unnormalized = pdf2(covariance, grid)
    return unnormalized / np.sum(unnormalized)

def mesh_grid(kernel_size):
    """Create a coordinate grid centred on zero.

    Args:
        kernel_size (int): side length of the grid.

    Returns:
        xy (ndarray): (kernel_size, kernel_size, 2) array of (x, y) pairs.
        xx (ndarray): (kernel_size, kernel_size) x coordinates.
        yy (ndarray): (kernel_size, kernel_size) y coordinates.
    """
    axis = np.arange(-kernel_size // 2 + 1., kernel_size // 2 + 1.)
    xx, yy = np.meshgrid(axis, axis)
    count = kernel_size * kernel_size
    # Pair every x with its y, then fold back into the 2-D layout.
    pairs = np.column_stack((xx.reshape(count), yy.reshape(count)))
    xy = pairs.reshape(kernel_size, kernel_size, 2)
    return xy, xx, yy


def pdf2(sigma_matrix, grid):
    """Evaluate an un-normalized bivariate Gaussian on a coordinate grid.

    Args:
        sigma_matrix (ndarray): (2, 2) covariance matrix.
        grid (ndarray): (K, K, 2) coordinates, e.g. from :func:`mesh_grid`.

    Returns:
        kernel (ndarray): (K, K) values of ``exp(-0.5 * x^T S^-1 x)``.
    """
    precision = np.linalg.inv(sigma_matrix)
    # Quadratic form x^T S^-1 x for every grid point.
    projected = np.dot(grid, precision)
    quadratic = np.sum(projected * grid, 2)
    return np.exp(-0.5 * quadratic)


def sigma_matrix2(sig_x, sig_y, theta):
    """Build the rotated 2 x 2 covariance matrix ``U D U^T``.

    Args:
        sig_x (float): standard deviation along the first principal axis.
        sig_y (float): standard deviation along the second principal axis.
        theta (float): rotation angle in radians.

    Returns:
        ndarray: rotated covariance matrix.
    """
    cos_t = np.cos(theta)
    sin_t = np.sin(theta)
    variances = np.array([[sig_x**2, 0], [0, sig_y**2]])
    rotation = np.array([[cos_t, -sin_t], [sin_t, cos_t]])
    rotated_back = np.dot(variances, rotation.T)
    return np.dot(rotation, rotated_back)


def save_to_mp4(frames, save_path, fps=7):
    """Write a frame tensor to ``save_path`` as a video.

    Args:
        frames: tensor laid out as (f, c, h, w); it is permuted to
            (f, h, w, c) before being handed to ``write_video``.
        save_path: output file path; missing parent directories are created.
        fps: frame rate passed to ``write_video``.
    """
    channels_last = frames.permute(0, 2, 3, 1)  # (f, c, h, w) -> (f, h, w, c)
    target_dir = Path(save_path).parent
    target_dir.mkdir(parents=True, exist_ok=True)
    write_video(save_path, channels_last, fps=fps)

def read_frames(video_path):
    """Decode every frame of the first video stream into PIL images.

    Args:
        video_path: path (or other source accepted by ``av.open``) of the
            video to read.

    Returns:
        list: one RGB ``PIL.Image`` per decoded frame, in decode order.

    Raises:
        StopIteration: if the container has no video stream.

    The container is always closed before returning, including when decoding
    fails part-way, so the underlying file handle is not leaked.
    """
    container = av.open(video_path)
    try:
        video_stream = next(s for s in container.streams if s.type == "video")
        frames = []
        for packet in container.demux(video_stream):
            for frame in packet.decode():
                image = Image.frombytes(
                    "RGB",
                    (frame.width, frame.height),
                    frame.to_rgb().to_ndarray(),
                )
                frames.append(image)
    finally:
        container.close()

    return frames

def get_fps(video_path):
    """Return the average frame rate of the first video stream.

    Args:
        video_path: path (or other source accepted by ``av.open``) of the
            video to inspect.

    Returns:
        The stream's ``average_rate`` as reported by PyAV.

    Raises:
        StopIteration: if the container has no video stream.

    The container is closed on every path, including when no video stream is
    found.
    """
    container = av.open(video_path)
    try:
        video_stream = next(s for s in container.streams if s.type == "video")
        fps = video_stream.average_rate
    finally:
        container.close()
    return fps


def save_videos_from_pil(pil_images, path, fps=8):
    """Save a sequence of PIL images as an ``.mp4`` or ``.gif`` file.

    Args:
        pil_images: non-empty sequence of PIL images; the size of the first
            image defines the video resolution.
        path: output path. The suffix selects the format. The parent
            directory is created when the path has one; a bare filename is
            written to the current directory.
        fps: frames per second.

    Raises:
        ValueError: if the suffix is neither ``.mp4`` nor ``.gif``.
    """
    save_fmt = Path(path).suffix
    # os.makedirs("") raises, so only create a directory when one is named.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    width, height = pil_images[0].size

    if save_fmt == ".mp4":
        # Imported here so the GIF path does not require PyAV.
        import av

        codec = "libx264"
        container = av.open(path, "w")
        try:
            stream = container.add_stream(codec, rate=fps)

            stream.width = width
            stream.height = height
            stream.pix_fmt = 'yuv420p'
            stream.bit_rate = 10000000
            stream.options["crf"] = "18"

            for pil_image in pil_images:
                av_frame = av.VideoFrame.from_image(pil_image)
                container.mux(stream.encode(av_frame))
            # Flush any frames still buffered in the encoder.
            container.mux(stream.encode())
        finally:
            # Close even if encoding fails so the output handle is released.
            container.close()

    elif save_fmt == ".gif":
        pil_images[0].save(
            fp=path,
            format="GIF",
            append_images=pil_images[1:],
            save_all=True,
            duration=(1 / fps * 1000),  # per-frame duration in milliseconds
            loop=0,
        )
    else:
        raise ValueError("Unsupported file type. Use .mp4 or .gif.")


================================================
FILE: mimicmotion/utils/visualizer.py
================================================

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import os
import numpy as np
import imageio
import torch

from matplotlib import cm
import torch.nn.functional as F
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw


def read_video_from_path(path):
    """Load every frame of a video into a single stacked array.

    Args:
        path: video location passed to ``imageio.get_reader``.

    Returns:
        The frames stacked along a new first axis with ``np.stack``, or None
        (after printing the error) when the reader cannot be opened.
    """
    try:
        reader = imageio.get_reader(path)
    except Exception as e:
        print("Error opening video file: ", e)
        return None
    return np.stack([np.array(im) for im in reader])


def draw_circle(rgb, coord, radius, color=(255, 0, 0), visible=True):
    """Draw a circle on ``rgb`` in place and return it.

    Args:
        rgb: image accepted by ``ImageDraw.Draw``.
        coord: centre of the circle; components 0 and 1 are used.
        radius: circle radius.
        color: outline colour, also used as fill colour when ``visible``.
        visible: filled circle when truthy, outline only otherwise.
    """
    canvas = ImageDraw.Draw(rgb)
    center_x, center_y = coord[0], coord[1]
    # Bounding box of the circle: upper-left and lower-right corners.
    bounding_box = [
        (center_x - radius, center_y - radius),
        (center_x + radius, center_y + radius),
    ]
    if visible:
        fill_color = tuple(color)
    else:
        fill_color = None
    canvas.ellipse(bounding_box, fill=fill_color, outline=tuple(color))
    return rgb


def draw_line(rgb, coord_y, coord_x, color, linewidth):
    """Draw a straight segment from ``coord_y`` to ``coord_x`` on ``rgb``.

    Despite their names, both ``coord_y`` and ``coord_x`` are two-component
    points: the start and the end of the segment. The image is modified in
    place and returned.
    """
    start_x, start_y = coord_y[0], coord_y[1]
    end_x, end_y = coord_x[0], coord_x[1]
    canvas = ImageDraw.Draw(rgb)
    canvas.line((start_x, start_y, end_x, end_y), fill=tuple(color), width=linewidth)
    return rgb


def add_weighted(rgb, alpha, original, beta, gamma):
    """Blend two arrays as ``rgb * alpha + original * beta + gamma``.

    The result is cast to uint8.
    """
    blended = rgb * alpha + original * beta
    blended = blended + gamma
    return blended.astype("uint8")


class Visualizer:
    """Draw point tracks on a video and optionally save the result.

    Tracks are coloured according to ``mode`` ("rainbow", "cool" or
    "optical_flow"); the rendered frames can be written to an mp4 file under
    ``save_dir`` or logged through a tensorboard-style writer.
    """

    def __init__(
        self,
        save_dir: str = "./results",
        grayscale: bool = False,
        pad_value: int = 0,
        fps: int = 8,
        mode: str = "rainbow",  # 'cool', 'optical_flow'
        linewidth: int = 2,
        show_first_frame: int = 0,
        tracks_leave_trace: int = 0,  # -1 for infinite
    ):
        """Store rendering options.

        Args:
            save_dir: directory that receives saved videos.
            grayscale: convert the video to grayscale before drawing.
            pad_value: border (in pixels) added on every side of each frame.
            fps: frame rate of saved / logged videos.
            mode: colouring scheme. A colour map is only created for
                "rainbow" and "cool".
            linewidth: width of trace lines; point radius is twice this value.
            show_first_frame: how many times the first frame is repeated at
                the start of the output (0 keeps the frames unchanged).
            tracks_leave_trace: number of past frames to draw as a trace;
                0 disables traces, a negative value keeps the full history.
        """
        self.mode = mode
        self.save_dir = save_dir
        # pyplot.get_cmap is used instead of matplotlib.cm.get_cmap, which was
        # removed from the cm module in Matplotlib 3.9.
        if mode == "rainbow":
            self.color_map = plt.get_cmap("gist_rainbow")
        elif mode == "cool":
            self.color_map = plt.get_cmap(mode)
        self.show_first_frame = show_first_frame
        self.grayscale = grayscale
        self.tracks_leave_trace = tracks_leave_trace
        self.pad_value = pad_value
        self.linewidth = linewidth
        self.fps = fps

    def visualize(
        self,
        video: torch.Tensor,  # (B,T,C,H,W)
        tracks: torch.Tensor,  # (B,T,N,2)
        visibility: torch.Tensor = None,  # (B, T, N, 1) bool
        gt_tracks: torch.Tensor = None,  # (B,T,N,2)
        segm_mask: torch.Tensor = None,  # (B,1,H,W)
        filename: str = "video",
        writer=None,  # tensorboard Summary Writer, used for visualization during training
        step: int = 0,
        query_frame: int = 0,
        save_video: bool = True,
        compensate_for_camera_motion: bool = False,
    ):
        """Render ``tracks`` on ``video`` and optionally save the result.

        Only the first batch element is drawn. ``segm_mask`` is required when
        ``compensate_for_camera_motion`` is set; it is sampled at the track
        positions of ``query_frame`` to label each track.

        Returns:
            torch.Tensor: the rendered video as produced by
            ``draw_tracks_on_video``.
        """
        if compensate_for_camera_motion:
            assert segm_mask is not None
        if segm_mask is not None:
            # Per-track segmentation label, read at the query-frame positions.
            coords = tracks[0, query_frame].round().long()
            segm_mask = segm_mask[0, query_frame][coords[:, 1], coords[:, 0]].long()

        video = F.pad(
            video,
            (self.pad_value, self.pad_value, self.pad_value, self.pad_value),
            "constant",
            255,
        )
        # Shift the tracks by the same border that was added to the frames.
        tracks = tracks + self.pad_value

        if self.grayscale:
            transform = transforms.Grayscale()
            video = transform(video)
            video = video.repeat(1, 1, 3, 1, 1)

        res_video = self.draw_tracks_on_video(
            video=video,
            tracks=tracks,
            visibility=visibility,
            segm_mask=segm_mask,
            gt_tracks=gt_tracks,
            query_frame=query_frame,
            compensate_for_camera_motion=compensate_for_camera_motion,
        )
        if save_video:
            self.save_video(res_video, filename=filename, writer=writer, step=step)
        return res_video

    def save_video(self, video, filename, writer=None, step=0):
        """Log ``video`` through ``writer`` or write it to ``save_dir``.

        With a writer, the whole video is logged via ``add_video``. Without
        one, ``<save_dir>/<filename>.mp4`` is written; note that the first two
        frames and the last frame are left out of the file.
        """
        if writer is not None:
            writer.add_video(
                filename,
                video.to(torch.uint8),
                global_step=step,
                fps=self.fps,
            )
        else:
            os.makedirs(self.save_dir, exist_ok=True)
            wide_list = list(video.unbind(1))
            wide_list = [wide[0].permute(1, 2, 0).cpu().numpy() for wide in wide_list]

            # Prepare the video file path
            save_path = os.path.join(self.save_dir, f"{filename}.mp4")

            # Create a writer object
            video_writer = imageio.get_writer(save_path, fps=self.fps)

            # Write frames to the video file (frames [2:-1] only)
            for frame in wide_list[2:-1]:
                video_writer.append_data(frame)

            video_writer.close()

    def draw_tracks_on_video(
        self,
        video: torch.Tensor,
        tracks: torch.Tensor,
        visibility: torch.Tensor = None,
        segm_mask: torch.Tensor = None,
        gt_tracks=None,
        query_frame: int = 0,
        compensate_for_camera_motion=False,
    ):
        """Draw traces and points for every track onto the video frames.

        Args:
            video: (B, T, 3, H, W) tensor; only batch element 0 is used.
            tracks: (B, T, N, 2) tensor of point positions.
            visibility: optional per-point visibility, indexed ``[0, t, i]``.
            segm_mask: optional per-track labels of length N; values > 0 are
                treated as foreground.
            gt_tracks: optional ground-truth tracks drawn as red crosses.
            query_frame: first frame on which points are drawn.
            compensate_for_camera_motion: subtract the mean motion of the
                background tracks and draw only foreground tracks.

        Returns:
            torch.Tensor: uint8 tensor of shape (1, T', 3, H, W).
        """
        B, T, C, H, W = video.shape
        _, _, N, D = tracks.shape

        assert D == 2
        assert C == 3
        video = video[0].permute(0, 2, 3, 1).byte().detach().cpu().numpy()  # S, H, W, C
        tracks = tracks[0].long().detach().cpu().numpy()  # S, N, 2
        if gt_tracks is not None:
            gt_tracks = gt_tracks[0].detach().cpu().numpy()

        res_video = []

        # process input video
        for rgb in video:
            res_video.append(rgb.copy())
        vector_colors = np.zeros((T, N, 3))

        if self.mode == "optical_flow":
            import flow_vis

            vector_colors = flow_vis.flow_to_color(tracks - tracks[query_frame][None])
        elif segm_mask is None:
            if self.mode == "rainbow":
                # colour by the y position of each track in the query frame
                y_min, y_max = (
                    tracks[query_frame, :, 1].min(),
                    tracks[query_frame, :, 1].max(),
                )
                norm = plt.Normalize(y_min, y_max)
                for n in range(N):
                    color = self.color_map(norm(tracks[query_frame, n, 1]))
                    color = np.array(color[:3])[None] * 255
                    vector_colors[:, n] = np.repeat(color, T, axis=0)
            else:
                # color changes with time
                for t in range(T):
                    color = np.array(self.color_map(t / T)[:3])[None] * 255
                    vector_colors[t] = np.repeat(color, N, axis=0)
        else:
            if self.mode == "rainbow":
                # background tracks are white, foreground coloured by y position
                vector_colors[:, segm_mask <= 0, :] = 255

                y_min, y_max = (
                    tracks[0, segm_mask > 0, 1].min(),
                    tracks[0, segm_mask > 0, 1].max(),
                )
                norm = plt.Normalize(y_min, y_max)
                for n in range(N):
                    if segm_mask[n] > 0:
                        color = self.color_map(norm(tracks[0, n, 1]))
                        color = np.array(color[:3])[None] * 255
                        vector_colors[:, n] = np.repeat(color, T, axis=0)

            else:
                # color changes with segm class
                segm_mask = segm_mask.cpu()
                color = np.zeros((segm_mask.shape[0], 3), dtype=np.float32)
                color[segm_mask > 0] = np.array(self.color_map(1.0)[:3]) * 255.0
                color[segm_mask <= 0] = np.array(self.color_map(0.0)[:3]) * 255.0
                vector_colors = np.repeat(color[None], T, axis=0)

        #  draw tracks
        if self.tracks_leave_trace != 0:
            for t in range(query_frame + 1, T):
                # a negative trace length keeps the whole history
                first_ind = (
                    max(0, t - self.tracks_leave_trace) if self.tracks_leave_trace >= 0 else 0
                )
                curr_tracks = tracks[first_ind : t + 1]
                curr_colors = vector_colors[first_ind : t + 1]
                if compensate_for_camera_motion:
                    # mean background displacement relative to the current frame
                    diff = (
                        tracks[first_ind : t + 1, segm_mask <= 0]
                        - tracks[t : t + 1, segm_mask <= 0]
                    ).mean(1)[:, None]

                    curr_tracks = curr_tracks - diff
                    curr_tracks = curr_tracks[:, segm_mask > 0]
                    curr_colors = curr_colors[:, segm_mask > 0]

                res_video[t] = self._draw_pred_tracks(
                    res_video[t],
                    curr_tracks,
                    curr_colors,
                )
                if gt_tracks is not None:
                    res_video[t] = self._draw_gt_tracks(res_video[t], gt_tracks[first_ind : t + 1])

        #  draw points
        for t in range(query_frame, T):
            img = Image.fromarray(np.uint8(res_video[t]))
            for i in range(N):
                coord = (tracks[t, i, 0], tracks[t, i, 1])
                visible = True
                if visibility is not None:
                    visible = visibility[0, t, i]
                # points with a zero coordinate are skipped
                if coord[0] != 0 and coord[1] != 0:
                    if not compensate_for_camera_motion or (
                        compensate_for_camera_motion and segm_mask[i] > 0
                    ):
                        img = draw_circle(
                            img,
                            coord=coord,
                            radius=int(self.linewidth * 2),
                            color=vector_colors[t, i].astype(int),
                            visible=visible,
                        )
            res_video[t] = np.array(img)

        #  construct the final rgb sequence
        if self.show_first_frame > 0:
            res_video = [res_video[0]] * self.show_first_frame + res_video[1:]
        return torch.from_numpy(np.stack(res_video)).permute(0, 3, 1, 2)[None].byte()

    def _draw_pred_tracks(
        self,
        rgb: np.ndarray,  # H x W x 3
        tracks: np.ndarray,  # T x N x 2
        vector_colors: np.ndarray,
        alpha: float = 0.5,
    ):
        """Draw the trace segments of every track onto one frame.

        ``alpha`` is recomputed per segment as ``(s / T) ** 2``, so the value
        passed in is not used. Blending with the undrawn frame is applied
        only when ``tracks_leave_trace`` is positive.
        """
        T, N, _ = tracks.shape
        rgb = Image.fromarray(np.uint8(rgb))
        for s in range(T - 1):
            vector_color = vector_colors[s]
            original = rgb.copy()
            alpha = (s / T) ** 2
            for i in range(N):
                coord_y = (int(tracks[s, i, 0]), int(tracks[s, i, 1]))
                coord_x = (int(tracks[s + 1, i, 0]), int(tracks[s + 1, i, 1]))
                if coord_y[0] != 0 and coord_y[1] != 0:
                    rgb = draw_line(
                        rgb,
                        coord_y,
                        coord_x,
                        vector_color[i].astype(int),
                        self.linewidth,
                    )
            if self.tracks_leave_trace > 0:
                rgb = Image.fromarray(
                    np.uint8(add_weighted(np.array(rgb), alpha, np.array(original), 1 - alpha, 0))
                )
        rgb = np.array(rgb)
        return rgb

    def _draw_gt_tracks(
        self,
        rgb: np.ndarray,  # H x W x 3,
        gt_tracks: np.ndarray,  # T x N x 2
    ):
        """Mark every ground-truth point with a red cross on one frame.

        Points with a non-positive coordinate are skipped.
        """
        T, N, _ = gt_tracks.shape
        color = np.array((211, 0, 0))
        rgb = Image.fromarray(np.uint8(rgb))
        for t in range(T):
            for i in range(N):
                # Use a separate name: rebinding ``gt_tracks`` here would
                # break the indexing on every following iteration.
                gt_point = gt_tracks[t][i]
                #  draw a red cross
                if gt_point[0] > 0 and gt_point[1] > 0:
                    length = self.linewidth * 3
                    coord_y = (int(gt_point[0]) + length, int(gt_point[1]) + length)
                    coord_x = (int(gt_point[0]) - length, int(gt_point[1]) - length)
                    rgb = draw_line(
                        rgb,
                        coord_y,
                        coord_x,
                        color,
                        self.linewidth,
                    )
                    coord_y = (int(gt_point[0]) - length, int(gt_point[1]) + length)
                    coord_x = (int(gt_point[0]) + length, int(gt_point[1]) - length)
                    rgb = draw_line(
                        rgb,
                        coord_y,
                        coord_x,
                        color,
                        self.linewidth,
                    )
        rgb = np.array(rgb)
        return rgb


########## optical flow visualization ########## 

# Flow components whose magnitude exceeds this value are treated as unknown
# by flow_to_image and zeroed before colouring.
UNKNOWN_FLOW_THRESH = 1e7
# NOTE(review): SMALLFLOW and LARGEFLOW are not referenced by the flow
# helpers in this module; presumably kept from the original VCN code.
SMALLFLOW = 0.0
LARGEFLOW = 1e8


def vis_flow_to_video(optical_flow, num_frames):
    '''
    Colour-code a sequence of flow fields into a video tensor.

    optical_flow: sequence of H x W x C flow fields, indexed by frame
    num_frames: exclusive upper bound of the frame indices that are read

    Entries 1 .. num_frames - 1 are converted with flow_to_image; entry 0 is
    never read. Returns a float tensor of shape (num_frames - 1) x H x W x 3.
    '''
    colored = [
        torch.Tensor(flow_to_image(optical_flow[idx]))  # H x W x 3
        for idx in range(1, num_frames)
    ]
    return torch.stack(colored, dim=0)  # T-1 x H x W x 3


# from https://github.com/gengshan-y/VCN
def flow_to_image(flow):
    """
    Convert a flow field into a Middlebury colour-coded image.

    Components larger in magnitude than UNKNOWN_FLOW_THRESH are treated as
    unknown: they are zeroed in ``flow`` itself (the input is modified in
    place) and rendered black. The remaining flow is normalised by its
    largest magnitude before colouring.

    :param flow: optical flow map, H x W x 2 (u in channel 0, v in channel 1)
    :return: optical flow image in middlebury color, uint8
    """
    u = flow[:, :, 0]
    v = flow[:, :, 1]

    unknown = (abs(u) > UNKNOWN_FLOW_THRESH) | (abs(v) > UNKNOWN_FLOW_THRESH)
    u[unknown] = 0
    v[unknown] = 0

    magnitude = np.sqrt(u ** 2 + v ** 2)
    maxrad = max(-1, np.max(magnitude))

    # eps keeps the division finite for an all-zero flow field
    scale = maxrad + np.finfo(float).eps
    img = compute_color(u / scale, v / scale)

    unknown_rgb = np.repeat(unknown[:, :, np.newaxis], 3, axis=2)
    img[unknown_rgb] = 0

    return np.uint8(img)


def compute_color(u, v):
    """
    Map normalised flow components to colours on the Middlebury wheel.

    NaN entries are zeroed in ``u`` and ``v`` in place and rendered black.

    :param u: optical flow horizontal map
    :param v: optical flow vertical map
    :return: optical flow in color code, H x W x 3 with values in 0..255
    """
    height, width = u.shape
    img = np.zeros([height, width, 3])
    nan_mask = np.isnan(u) | np.isnan(v)
    u[nan_mask] = 0
    v[nan_mask] = 0

    colorwheel = make_color_wheel()
    ncols = colorwheel.shape[0]

    rad = np.sqrt(u ** 2 + v ** 2)

    # Flow direction mapped to a fractional, 1-based position on the wheel.
    angle = np.arctan2(-v, -u) / np.pi
    fk = (angle + 1) / 2 * (ncols - 1) + 1

    k0 = np.floor(fk).astype(int)
    k1 = k0 + 1
    k1[k1 == ncols + 1] = 1  # wrap around the wheel
    f = fk - k0

    in_range = rad <= 1
    out_of_range = np.logical_not(in_range)

    for channel in range(colorwheel.shape[1]):
        wheel_channel = colorwheel[:, channel]
        col0 = wheel_channel[k0 - 1] / 255
        col1 = wheel_channel[k1 - 1] / 255
        col = (1 - f) * col0 + f * col1

        # The smaller the flow, the brighter the colour, so static or
        # slow-moving regions stand out in the visualisation.
        col[in_range] = 1 - rad[in_range] * (1 - col[in_range])

        # The larger the flow, the darker the colour.
        col[out_of_range] *= 0.75
        img[:, :, channel] = np.uint8(np.floor(255 * col * (1 - nan_mask)))

    return img


def make_color_wheel():
    """
    Build the Middlebury colour wheel.

    The wheel is a sequence of six colour transitions; within each one
    channel stays at 255 while another ramps linearly up or down.

    :return: Color wheel, array of shape (55, 3) with values in 0..255
    """
    # (length, channel held at 255, ramping channel, ramp goes down)
    transitions = (
        (15, 0, 1, False),  # RY: red -> yellow
        (6, 1, 0, True),    # YG: yellow -> green
        (4, 1, 2, False),   # GC: green -> cyan
        (11, 2, 1, True),   # CB: cyan -> blue
        (13, 2, 0, False),  # BM: blue -> magenta
        (6, 0, 2, True),    # MR: magenta -> red
    )

    ncols = sum(length for length, _, _, _ in transitions)
    colorwheel = np.zeros([ncols, 3])

    start = 0
    for length, full_channel, ramp_channel, falling in transitions:
        stop = start + length
        ramp = np.floor(255 * np.arange(0, length) / length)
        colorwheel[start:stop, full_channel] = 255
        colorwheel[start:stop, ramp_channel] = 255 - ramp if falling else ramp
        start = stop

    return colorwheel


================================================
FILE: requirements.txt
================================================
accelerate
torch
torchvision
Pillow
numpy
omegaconf
decord
einops
matplotlib
diffusers==0.27.0
scipy
av==12.0.0
imageio
opencv_contrib_python
transformers
huggingface_hub==0.25.2
onnxruntime


================================================
FILE: scripts/test.sh
================================================
# Run inference_ctrl.py with only GPU 0 visible, using the inference config
# configs/test.yaml and the run name "test".
CUDA_VISIBLE_DEVICES=0 python inference_ctrl.py \
    --inference_config configs/test.yaml \
    --name test \
    # --no_use_float16 for V100