Repository: a1600012888/PhysDreamer Branch: main Commit: 95beb71f9f1b Files: 176 Total size: 1.4 MB Directory structure: gitextract_1v1oa6eg/ ├── .gitignore ├── README.md ├── physdreamer/ │ ├── field_components/ │ │ ├── encoding.py │ │ └── mlp.py │ ├── fields/ │ │ ├── mul_offset_field.py │ │ ├── mul_se3_field.py │ │ ├── offset_field.py │ │ ├── se3_field.py │ │ └── triplane_field.py │ ├── gaussian_3d/ │ │ ├── README.md │ │ ├── arguments/ │ │ │ └── __init__.py │ │ ├── gaussian_renderer/ │ │ │ ├── __init__.py │ │ │ ├── depth_uv_render.py │ │ │ ├── feat_render.py │ │ │ ├── flow_depth_render.py │ │ │ └── render.py │ │ ├── scene/ │ │ │ ├── __init__.py │ │ │ ├── cameras.py │ │ │ ├── colmap_loader.py │ │ │ ├── dataset_readers.py │ │ │ ├── gaussian_model.py │ │ │ ├── mesh.py │ │ │ └── mesh_utils.py │ │ └── utils/ │ │ ├── camera_utils.py │ │ ├── general_utils.py │ │ ├── graphics_utils.py │ │ ├── image_utils.py │ │ ├── loss_utils.py │ │ ├── rigid_body_utils.py │ │ ├── sh_utils.py │ │ └── system_utils.py │ ├── losses/ │ │ └── smoothness_loss.py │ ├── operators/ │ │ ├── dct.py │ │ ├── np_operators.py │ │ └── rotation.py │ ├── utils/ │ │ ├── camera_utils.py │ │ ├── colmap_utils.py │ │ ├── config.py │ │ ├── img_utils.py │ │ ├── io_utils.py │ │ ├── optimizer.py │ │ ├── print_utils.py │ │ ├── pytorch_mssim.py │ │ ├── svd_helpper.py │ │ └── torch_utils.py │ └── warp_mpm/ │ ├── README.md │ ├── gaussian_sim_utils.py │ ├── mpm_data_structure.py │ ├── mpm_solver_diff.py │ ├── mpm_utils.py │ └── warp_utils.py ├── projects/ │ ├── inference/ │ │ ├── README.md │ │ ├── config_demo.py │ │ ├── configs/ │ │ │ ├── alocasia.py │ │ │ ├── carnation.py │ │ │ ├── hat.py │ │ │ └── telephone.py │ │ ├── demo.py │ │ ├── local_utils.py │ │ └── run.sh │ └── uncleaned_train/ │ ├── .gitignore │ ├── README.md │ ├── exp_motion/ │ │ └── train/ │ │ ├── config.yml │ │ ├── config_demo.py │ │ ├── convert_gaussian_to_mesh.py │ │ ├── fast_train_velocity.py │ │ ├── interface.py │ │ ├── local_utils.py │ │ ├── 
model_config.py │ │ └── train_material.py │ ├── motionrep/ │ │ ├── datatools/ │ │ │ ├── _convert_fbx_to_mesh.py │ │ │ ├── blender_deforming_things4d.py │ │ │ ├── blender_install_packages.py │ │ │ ├── blender_render_imgs.py │ │ │ ├── deforming_things4d.py │ │ │ ├── dragon_animation.py │ │ │ ├── fbx_to_mesh.py │ │ │ ├── fbx_to_mesh_flag.py │ │ │ ├── render_blender_annimations.py │ │ │ ├── render_fbx_first_frame.py │ │ │ ├── render_obj.py │ │ │ ├── render_obj_external_texture.py │ │ │ ├── test_colmap_camera.py │ │ │ └── transform_obj_for_blender.py │ │ ├── diffusion/ │ │ │ ├── builder.py │ │ │ ├── discretizer.py │ │ │ ├── draft.py │ │ │ ├── gaussian_diffusion.py │ │ │ ├── losses.py │ │ │ ├── resample.py │ │ │ ├── respace.py │ │ │ ├── sigma_sampling.py │ │ │ ├── sv_diffusion_engine.py │ │ │ ├── svd_conditioner.py │ │ │ ├── svd_sds_engine.py │ │ │ ├── svd_sds_engine_backup.py │ │ │ ├── svd_sds_wdecoder_engine.py │ │ │ └── video_diffusion_loss.py │ │ ├── field_components/ │ │ │ ├── encoding.py │ │ │ └── mlp.py │ │ ├── fields/ │ │ │ ├── dct_trajectory_field.py │ │ │ ├── discrete_field.py │ │ │ ├── mul_offset_field.py │ │ │ ├── mul_se3_field.py │ │ │ ├── offset_field.py │ │ │ ├── se3_field.py │ │ │ ├── triplane_field.py │ │ │ └── video_triplane_disp_field.py │ │ ├── gaussian_3d/ │ │ │ ├── arguments/ │ │ │ │ └── __init__.py │ │ │ ├── gaussian_renderer/ │ │ │ │ ├── __init__.py │ │ │ │ ├── depth_uv_render.py │ │ │ │ ├── feat_render.py │ │ │ │ ├── flow_depth_render.py │ │ │ │ ├── motion_renderer.py │ │ │ │ └── render.py │ │ │ ├── scene/ │ │ │ │ ├── __init__.py │ │ │ │ ├── cameras.py │ │ │ │ ├── colmap_loader.py │ │ │ │ ├── dataset_readers.py │ │ │ │ ├── gaussian_model.py │ │ │ │ ├── mesh.py │ │ │ │ └── mesh_utils.py │ │ │ └── utils/ │ │ │ ├── camera_utils.py │ │ │ ├── general_utils.py │ │ │ ├── graphics_utils.py │ │ │ ├── image_utils.py │ │ │ ├── loss_utils.py │ │ │ ├── rigid_body_utils.py │ │ │ ├── sh_utils.py │ │ │ └── system_utils.py │ │ ├── losses/ │ │ │ ├── se3_loss.py │ 
│ │ └── smoothness_loss.py │ │ ├── operators/ │ │ │ ├── dct.py │ │ │ ├── np_operators.py │ │ │ └── rotation.py │ │ └── utils/ │ │ ├── camera_utils.py │ │ ├── colmap_utils.py │ │ ├── config.py │ │ ├── dct.py │ │ ├── flow_utils.py │ │ ├── img_utils.py │ │ ├── io_utils.py │ │ ├── optimizer.py │ │ ├── peft_utils.py │ │ ├── print_utils.py │ │ ├── pytorch_mssim.py │ │ ├── svd_helpper.py │ │ └── torch_utils.py │ └── thirdparty_code/ │ └── warp_mpm/ │ ├── backup/ │ │ ├── convert_gaussian_to_mesh.py │ │ ├── diff_warp_utils.py │ │ ├── engine_utils.py │ │ ├── grad_test.py │ │ ├── mpm_solver_warp.py │ │ ├── mpm_solver_warp_diff.py │ │ ├── mpm_utils.py │ │ ├── run_gaussian.py │ │ ├── run_gaussian_static.py │ │ ├── run_sand.py │ │ ├── sim_grad.py │ │ ├── solver_grad_test.py │ │ ├── test_inverse_sim.py │ │ ├── test_sim.py │ │ ├── warp_rewrite.py │ │ └── warp_utils.py │ ├── backup_jan10/ │ │ ├── gaussian_sim_utils.py │ │ ├── mpm_data_structure.py │ │ ├── mpm_solver_diff.py │ │ ├── mpm_utils.py │ │ └── warp_utils.py │ ├── gaussian_sim_utils.py │ ├── mpm_data_structure.py │ ├── mpm_solver_diff.py │ ├── mpm_utils.py │ └── warp_utils.py ├── requirements.txt └── setup.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.pyc models/ data/ output/ wandb/ ================================================ FILE: README.md ================================================ # PhysDreamer: Physics-Based Interaction with 3D Objects via Video Generation [[website](https://physdreamer.github.io/)] ![teaser-figure](figures/figure_teaser.png) ## Useage ### Setup enviroment Install diff-gaussian-rasterization at: https://github.com/graphdeco-inria/diff-gaussian-rasterization ```bash conda create -n physdreamer python conda activate physdreamer pip install -r requirements.txt python setup.py install ``` ### Download the scenes 
and optimized models from Hugging Face Download the scenes and optimized velocity and material fields from: https://huggingface.co/datasets/YunjinZhang/PhysDreamer/tree/main Put folders of these scenes to `data/physics_dreamer/xxx`, e.g. `data/physics_dreamer/carnations` Put pretrained models to `./models`. See `dataset_dir` and `model_list` in `inference/configs/carnation.py` to match the path of dataset and pretrained models. ### Run inference ```bash cd projects/inference bash run.sh ``` ## Acknowledgement This codebase used lots of source code from: 1. https://github.com/graphdeco-inria/gaussian-splatting 2. https://github.com/zeshunzong/warp-mpm 3. https://github.com/PingchuanMa/NCLaw We thank the authors of these projects. ## Citations ``` @article{zhang2024physdreamer, title={{PhysDreamer}: Physics-Based Interaction with 3D Objects via Video Generation}, author={Tianyuan Zhang and Hong-Xing Yu and Rundi Wu and Brandon Y. Feng and Changxi Zheng and Noah Snavely and Jiajun Wu and William T. Freeman}, journal={arxiv}, year={2024} } ``` ================================================ FILE: physdreamer/field_components/encoding.py ================================================ import torch import torch.nn.functional as F from jaxtyping import Float, Int, Shaped from torch import Tensor, nn from typing import Optional, Sequence, Tuple, List from physdreamer.losses.smoothness_loss import ( compute_plane_smoothness, compute_plane_tv, ) class TemporalKplanesEncoding(nn.Module): """ Args: resolutions (Sequence[int]): xyzt resolutions. 
""" def __init__( self, resolutions: Sequence[int], feat_dim: int = 32, init_a: float = 0.1, init_b: float = 0.5, reduce="sum", # Literal["sum", "product", "cat"] = "sum", ): super().__init__() self.resolutions = resolutions if reduce == "cat": feat_dim = feat_dim // 3 self.feat_dim = feat_dim self.reduce = reduce self.in_dim = 4 self.plane_coefs = nn.ParameterList() self.coo_combs = [[0, 3], [1, 3], [2, 3]] # [(x, t), (y, t), (z, t)] for coo_comb in self.coo_combs: # [feat_dim, time_resolution, spatial_resolution] new_plane_coef = nn.Parameter( torch.empty( [ self.feat_dim, resolutions[coo_comb[1]], resolutions[coo_comb[0]], # flip? ] ) ) # when init to ones? nn.init.uniform_(new_plane_coef, a=init_a, b=init_b) self.plane_coefs.append(new_plane_coef) def forward(self, inp: Float[Tensor, "*bs 4"]): output = 1.0 if self.reduce == "product" else 0.0 if self.reduce == "cat": output = [] for ci, coo_comb in enumerate(self.coo_combs): grid = self.plane_coefs[ci].unsqueeze(0) # [1, feature_dim, reso1, reso2] coords = inp[..., coo_comb].view(1, 1, -1, 2) # [1, 1, flattened_bs, 2] interp = F.grid_sample( grid, coords, align_corners=True, padding_mode="border" ) # [1, output_dim, 1, flattened_bs] interp = interp.view(self.feat_dim, -1).T # [flattened_bs, output_dim] if self.reduce == "product": output = output * interp elif self.reduce == "sum": output = output + interp elif self.reduce == "cat": output.append(interp) if self.reduce == "cat": # [flattened_bs, output_dim * 3] output = torch.cat(output, dim=-1) return output def compute_temporal_smoothness( self, ): ret_loss = 0.0 for plane_coef in self.plane_coefs: ret_loss += compute_plane_smoothness(plane_coef) return ret_loss def compute_plane_tv( self, ): ret_loss = 0.0 for plane_coef in self.plane_coefs: ret_loss += compute_plane_tv(plane_coef) return ret_loss def visualize( self, ) -> Tuple[Float[Tensor, "3 H W"]]: """Visualize the encoding as a RGB images Returns: Tuple[Float[Tensor, "3 H W"]] """ pass @staticmethod 
    def functional_forward(
        plane_coefs: List[Float[Tensor, "feat_dim H W"]],
        inp: Float[Tensor, "*bs 4"],
        reduce: str = "sum",
        coo_combs: Optional[List[List[int]]] = [[0, 3], [1, 3], [2, 3]],
    ):
        """Stateless version of ``forward`` using externally supplied planes.

        Infers feat_dim from each plane rather than from module state.

        NOTE(review): ``coo_combs`` uses a mutable default argument; it is
        never mutated here, but a ``None`` sentinel would be safer.
        """
        assert reduce in ["sum", "product", "cat"]
        output = 1.0 if reduce == "product" else 0.0
        if reduce == "cat":
            output = []
        for ci, coo_comb in enumerate(coo_combs):
            grid = plane_coefs[ci].unsqueeze(0)  # [1, feature_dim, reso1, reso2]
            feat_dim = grid.shape[1]
            coords = inp[..., coo_comb].view(1, 1, -1, 2)  # [1, 1, flattened_bs, 2]
            interp = F.grid_sample(
                grid, coords, align_corners=True, padding_mode="border"
            )  # [1, output_dim, 1, flattened_bs]
            interp = interp.view(feat_dim, -1).T  # [flattened_bs, output_dim]
            if reduce == "product":
                output = output * interp
            elif reduce == "sum":
                output = output + interp
            elif reduce == "cat":
                output.append(interp)
        if reduce == "cat":
            # [flattened_bs, output_dim * 3]
            output = torch.cat(output, dim=-1)
        return output


class TriplanesEncoding(nn.Module):
    """Purely spatial tri-plane encoding (no time axis).

    Args:
        resolutions (Sequence[int]): xyz resolutions.
    """

    def __init__(
        self,
        resolutions: Sequence[int],
        feat_dim: int = 32,
        init_a: float = 0.1,
        init_b: float = 0.5,
        reduce="sum",  # Literal["sum", "product", "cat"] = "sum",
    ):
        super().__init__()
        self.resolutions = resolutions
        if reduce == "cat":
            # Unlike TemporalKplanesEncoding, feat_dim is deliberately NOT
            # divided by 3 here, so "cat" yields a 3x wider output.
            feat_dim = feat_dim  # // 3
        self.feat_dim = feat_dim
        self.reduce = reduce
        self.in_dim = 3

        self.plane_coefs = nn.ParameterList()
        self.coo_combs = [[0, 1], [0, 2], [1, 2]]  # [(x, y), (x, z), (y, z)]
        for coo_comb in self.coo_combs:
            new_plane_coef = nn.Parameter(
                torch.empty(
                    [
                        self.feat_dim,
                        resolutions[coo_comb[1]],
                        resolutions[coo_comb[0]],
                    ]
                )
            )
            # when init to ones?
            nn.init.uniform_(new_plane_coef, a=init_a, b=init_b)
            self.plane_coefs.append(new_plane_coef)

    def forward(self, inp: Float[Tensor, "*bs 3"]):
        """Sample the xy/xz/yz planes at the input coords and merge.

        NOTE(review): coords are assumed pre-normalized to [-1, 1] for
        grid_sample — confirm at call sites.
        """
        output = 1.0 if self.reduce == "product" else 0.0
        if self.reduce == "cat":
            output = []
        for ci, coo_comb in enumerate(self.coo_combs):
            grid = self.plane_coefs[ci].unsqueeze(0)  # [1, feature_dim, reso1, reso2]
            coords = inp[..., coo_comb].view(1, 1, -1, 2)  # [1, 1, flattened_bs, 2]
            interp = F.grid_sample(
                grid, coords, align_corners=True, padding_mode="border"
            )  # [1, output_dim, 1, flattened_bs]
            interp = interp.view(self.feat_dim, -1).T  # [flattened_bs, output_dim]
            if self.reduce == "product":
                output = output * interp
            elif self.reduce == "sum":
                output = output + interp
            elif self.reduce == "cat":
                output.append(interp)
        if self.reduce == "cat":
            # [flattened_bs, output_dim * 3]
            output = torch.cat(output, dim=-1)
        return output

    def compute_plane_tv(
        self,
    ):
        # Sum of the imported compute_plane_tv penalty over all planes.
        ret_loss = 0.0
        for plane_coef in self.plane_coefs:
            ret_loss += compute_plane_tv(plane_coef)
        return ret_loss


class PlaneEncoding(nn.Module):
    """Single learnable 2D plane encoding.

    Args:
        resolutions (Sequence[int]): xyz resolutions.
    """

    def __init__(
        self,
        resolutions: Sequence[int],  # [y_res, x_res]
        feat_dim: int = 32,
        init_a: float = 0.1,
        init_b: float = 0.5,
    ):
        super().__init__()
        self.resolutions = resolutions
        self.feat_dim = feat_dim
        self.in_dim = 2

        self.plane_coefs = nn.ParameterList()
        # A single (x, y) plane.
        self.coo_combs = [[0, 1]]
        for coo_comb in self.coo_combs:
            new_plane_coef = nn.Parameter(
                torch.empty(
                    [
                        self.feat_dim,
                        resolutions[coo_comb[1]],
                        resolutions[coo_comb[0]],
                    ]
                )
            )
            # when init to ones?
            nn.init.uniform_(new_plane_coef, a=init_a, b=init_b)
            self.plane_coefs.append(new_plane_coef)

    def forward(self, inp: Float[Tensor, "*bs 2"]):
        """Bilinearly sample the single plane at the given 2D coords.

        The loop runs once (one coo_comb); ``output`` is the last interp.
        """
        for ci, coo_comb in enumerate(self.coo_combs):
            grid = self.plane_coefs[ci].unsqueeze(0)  # [1, feature_dim, reso1, reso2]
            coords = inp[..., coo_comb].view(1, 1, -1, 2)  # [1, 1, flattened_bs, 2]
            interp = F.grid_sample(
                grid, coords, align_corners=True, padding_mode="border"
            )  # [1, output_dim, 1, flattened_bs]
            interp = interp.view(self.feat_dim, -1).T  # [flattened_bs, output_dim]
            output = interp
        return output

    def compute_plane_tv(
        self,
    ):
        # Sum of the imported compute_plane_tv penalty over the plane(s).
        ret_loss = 0.0
        for plane_coef in self.plane_coefs:
            ret_loss += compute_plane_tv(plane_coef)
        return ret_loss


class TemporalNeRFEncoding(nn.Module):
    """Sinusoidal (NeRF-style) frequency encoding, optionally log-spaced."""

    def __init__(
        self,
        in_dim,  # : int,
        num_frequencies: int,
        min_freq_exp: float,
        max_freq_exp: float,
        log_scale: bool = False,
        include_input: bool = False,
    ) -> None:
        super().__init__()

        self.in_dim = in_dim
        self.num_frequencies = num_frequencies
        self.min_freq = min_freq_exp
        self.max_freq = max_freq_exp
        self.log_scale = log_scale
        self.include_input = include_input

    def get_out_dim(self) -> int:
        # 2 values (sin/cos pair) per frequency per input dim, plus the raw
        # input when include_input is set.
        # NOTE(review): forward (below) never appends the raw input, so
        # include_input=True makes this disagree with forward's actual
        # output width — confirm against callers.
        if self.in_dim is None:
            raise ValueError("Input dimension has not been set")
        out_dim = self.in_dim * self.num_frequencies * 2
        if self.include_input:
            out_dim += self.in_dim
        return out_dim

    def forward(
        self,
        in_tensor: Float[Tensor, "*bs input_dim"],
    ) -> Float[Tensor, "*bs output_dim"]:
        """Calculates NeRF encoding. If covariances are provided the encodings will be integrated as proposed
            in mip-NeRF.

        Args:
            in_tensor: For best performance, the input tensor should be between 0 and 1.
            covs: Covariances of input points.
Returns: Output values will be between -1 and 1 """ scaled_in_tensor = 2 * torch.pi * in_tensor # scale to [0, 2pi] # freqs = 2 ** torch.linspace( freqs = torch.linspace( self.min_freq, self.max_freq, self.num_frequencies, device=in_tensor.device ) if self.log_scale: freqs = 2**freqs scaled_inputs = ( scaled_in_tensor[..., None] * freqs ) # [..., "input_dim", "num_scales"] scaled_inputs = scaled_inputs.view( *scaled_inputs.shape[:-2], -1 ) # [..., "input_dim" * "num_scales"] encoded_inputs = torch.sin( torch.cat([scaled_inputs, scaled_inputs + torch.pi / 2.0], dim=-1) ) return encoded_inputs ================================================ FILE: physdreamer/field_components/mlp.py ================================================ """ Mostly from nerfstudio: https://github.com/nerfstudio-project/nerfstudio/blob/main/nerfstudio/field_components/mlp.py """ from typing import Optional, Set, Tuple, Union import torch from jaxtyping import Float from torch import Tensor, nn class MLP(nn.Module): def __init__( self, in_dim: int, num_layers: int, layer_width: int, out_dim: Optional[int] = None, skip_connections: Optional[Tuple[int]] = None, activation: Optional[nn.Module] = nn.ReLU(), out_activation: Optional[nn.Module] = None, zero_init = False, ) -> None: super().__init__() self.in_dim = in_dim assert self.in_dim > 0 self.out_dim = out_dim if out_dim is not None else layer_width self.num_layers = num_layers self.layer_width = layer_width self.skip_connections = skip_connections self._skip_connections: Set[int] = ( set(skip_connections) if skip_connections else set() ) self.activation = activation self.out_activation = out_activation self.net = None self.zero_init = zero_init self.build_nn_modules() def build_nn_modules(self) -> None: """Initialize multi-layer perceptron.""" layers = [] if self.num_layers == 1: layers.append(nn.Linear(self.in_dim, self.out_dim)) else: for i in range(self.num_layers - 1): if i == 0: assert ( i not in self._skip_connections ), "Skip 
connection at layer 0 doesn't make sense." layers.append(nn.Linear(self.in_dim, self.layer_width)) elif i in self._skip_connections: layers.append( nn.Linear(self.layer_width + self.in_dim, self.layer_width) ) else: layers.append(nn.Linear(self.layer_width, self.layer_width)) layers.append(nn.Linear(self.layer_width, self.out_dim)) self.layers = nn.ModuleList(layers) if self.zero_init: torch.nn.init.zeros_(self.layers[-1].weight) torch.nn.init.zeros_(self.layers[-1].bias) def pytorch_fwd( self, in_tensor: Float[Tensor, "*bs in_dim"] ) -> Float[Tensor, "*bs out_dim"]: """Process input with a multilayer perceptron. Args: in_tensor: Network input Returns: MLP network output """ x = in_tensor for i, layer in enumerate(self.layers): # as checked in `build_nn_modules`, 0 should not be in `_skip_connections` if i in self._skip_connections: x = torch.cat([in_tensor, x], -1) x = layer(x) if self.activation is not None and i < len(self.layers) - 1: x = self.activation(x) if self.out_activation is not None: x = self.out_activation(x) return x def forward( self, in_tensor: Float[Tensor, "*bs in_dim"] ) -> Float[Tensor, "*bs out_dim"]: return self.pytorch_fwd(in_tensor) ================================================ FILE: physdreamer/fields/mul_offset_field.py ================================================ import torch import torch.nn.functional as F from jaxtyping import Float, Int, Shaped from torch import Tensor, nn from typing import Literal, Optional, Sequence, Tuple, List from physdreamer.field_components.encoding import ( TemporalKplanesEncoding, TriplanesEncoding, ) from physdreamer.field_components.mlp import MLP from physdreamer.operators.rotation import rotation_6d_to_matrix, quaternion_to_matrix from physdreamer.data.scene_box import SceneBox class MulTemporalKplanesOffsetfields(nn.Module): """Multiple Temporal Kplanes SE(3) fields. Decoder is shared, but plane coefs are different. Args: aabb: axis-aligned bounding box. aabb[0] is the minimum (x,y,z) point. 
aabb[1] is the maximum (x,y,z) point. resolutions: resolutions of the kplanes. in an order of [x, y, z ,t]. """ def __init__( self, aabb: Float[Tensor, "2 3"], resolutions_list: Sequence[int], feat_dim: int = 64, init_a: float = 0.1, init_b: float = 0.5, reduce: Literal["sum", "product", "cat"] = "sum", num_decoder_layers=2, decoder_hidden_size=64, add_spatial_triplane: bool = True, ): super().__init__() self.register_buffer("aabb", aabb) self.output_dim = 3 self.temporal_kplanes_encoding_list = nn.ModuleList( [ TemporalKplanesEncoding(resolutions, feat_dim, init_a, init_b, reduce) for resolutions in resolutions_list ] ) self.add_spatial_triplane = add_spatial_triplane if add_spatial_triplane: self.spatial_kplanes_encoding_list = nn.ModuleList( [ TriplanesEncoding( resolutions[:-1], feat_dim, init_a, init_b, reduce ) for resolutions in resolutions_list ] ) feat_dim = feat_dim * 2 self.decoder = MLP( feat_dim, num_decoder_layers, layer_width=decoder_hidden_size, out_dim=self.output_dim, skip_connections=None, activation=nn.ReLU(), out_activation=None, ) def forward( self, inp: Float[Tensor, "*bs 4"], dataset_indx: Int[Tensor, "1"] ) -> Tuple[Float[Tensor, "*bs 3 3"], Float[Tensor, "*bs 3"]]: inpx, inpt = inp[:, :3], inp[:, 3:] # shift to [-1, 1] inpx = SceneBox.get_normalized_positions(inpx, self.aabb) * 2.0 - 1.0 inpt = inpt * 2.0 - 1.0 inp = torch.cat([inpx, inpt], dim=-1) # for loop in batch dimension output = self.temporal_kplanes_encoding_list[dataset_indx](inp) if self.add_spatial_triplane: spatial_output = self.spatial_kplanes_encoding_list[dataset_indx](inp) output = torch.cat([output, spatial_output], dim=-1) output = self.decoder(output) return output def compute_smoothess_loss( self, ): temporal_smoothness_loss = 0.0 for temporal_kplanes_encoding in self.temporal_kplanes_encoding_list: temporal_smoothness_loss += ( temporal_kplanes_encoding.compute_temporal_smoothness() ) smothness_loss = 0.0 for temporal_kplanes_encoding in 
self.temporal_kplanes_encoding_list: smothness_loss += temporal_kplanes_encoding.compute_plane_tv() if self.add_spatial_triplane: for spatial_kplanes_encoding in self.spatial_kplanes_encoding_list: smothness_loss += spatial_kplanes_encoding.compute_plane_tv() return smothness_loss, temporal_smoothness_loss def compute_loss( self, inp: Float[Tensor, "*bs 4"], trajectory: Float[Tensor, "*bs 3"], loss_func, ): inpx, inpt = inp[:, :3], inp[:, 3:] output = self(inp) rec_traj = inpx + output rec_loss = loss_func(rec_traj, trajectory) return rec_loss def arap_loss(self, inp): pass ================================================ FILE: physdreamer/fields/mul_se3_field.py ================================================ import torch import torch.nn.functional as F from jaxtyping import Float, Int, Shaped from torch import Tensor, nn from typing import Literal, Optional, Sequence, Tuple from physdreamer.field_components.encoding import ( TemporalKplanesEncoding, TriplanesEncoding, ) from physdreamer.field_components.mlp import MLP from physdreamer.operators.rotation import rotation_6d_to_matrix, quaternion_to_matrix from physdreamer.data.scene_box import SceneBox class MulTemporalKplanesSE3fields(nn.Module): """Multiple Temporal Kplanes SE(3) fields. Args: aabb: axis-aligned bounding box. aabb[0] is the minimum (x,y,z) point. aabb[1] is the maximum (x,y,z) point. resolutions: resolutions of the kplanes. in an order of [x, y, z ,t]. 
""" def __init__( self, aabb: Float[Tensor, "2 3"], resolutions_list: Sequence[int], feat_dim: int = 64, init_a: float = 0.1, init_b: float = 0.5, reduce: Literal["sum", "product", "cat"] = "sum", num_decoder_layers=2, decoder_hidden_size=64, rotation_type: Literal["quaternion", "6d"] = "6d", add_spatial_triplane: bool = True, ): super().__init__() self.register_buffer("aabb", aabb) output_dim_dict = {"quaternion": 4 + 3, "6d": 6 + 3} self.output_dim = output_dim_dict[rotation_type] self.rotation_type = rotation_type self.temporal_kplanes_encoding_list = nn.ModuleList( [ TemporalKplanesEncoding(resolutions, feat_dim, init_a, init_b, reduce) for resolutions in resolutions_list ] ) self.add_spatial_triplane = add_spatial_triplane if add_spatial_triplane: self.spatial_kplanes_encoding_list = nn.ModuleList( [ TriplanesEncoding( resolutions[:-1], feat_dim, init_a, init_b, reduce ) for resolutions in resolutions_list ] ) feat_dim = feat_dim * 2 self.decoder = MLP( feat_dim, num_decoder_layers, layer_width=decoder_hidden_size, out_dim=self.output_dim, skip_connections=None, activation=nn.ReLU(), out_activation=None, ) def forward( self, inp: Float[Tensor, "*bs 4"], dataset_indx: Int[Tensor, "1"] ) -> Tuple[Float[Tensor, "*bs 3 3"], Float[Tensor, "*bs 3"]]: inpx, inpt = inp[:, :3], inp[:, 3:] # shift to [-1, 1] inpx = SceneBox.get_normalized_positions(inpx, self.aabb) * 2.0 - 1.0 inpt = inpt * 2.0 - 1.0 inp = torch.cat([inpx, inpt], dim=-1) # for loop in batch dimension output = self.temporal_kplanes_encoding_list[dataset_indx](inp) if self.add_spatial_triplane: spatial_output = self.spatial_kplanes_encoding_list[dataset_indx](inp) output = torch.cat([output, spatial_output], dim=-1) output = self.decoder(output) if self.rotation_type == "6d": rotation_6d, translation = output[:, :6], output[:, 6:] R_mat = rotation_6d_to_matrix(rotation_6d) elif self.rotation_type == "quaternion": quat, translation = output[:, :4], output[:, 4:] # tanh and normalize quat = torch.tanh(quat) 
R_mat = quaternion_to_matrix(quat) return R_mat, translation def compute_smoothess_loss( self, ): temporal_smoothness_loss = 0.0 for temporal_kplanes_encoding in self.temporal_kplanes_encoding_list: temporal_smoothness_loss += ( temporal_kplanes_encoding.compute_temporal_smoothness() ) smothness_loss = 0.0 for temporal_kplanes_encoding in self.temporal_kplanes_encoding_list: smothness_loss += temporal_kplanes_encoding.compute_plane_tv() if self.add_spatial_triplane: for spatial_kplanes_encoding in self.spatial_kplanes_encoding_list: smothness_loss += spatial_kplanes_encoding.compute_plane_tv() return smothness_loss, temporal_smoothness_loss def compute_loss( self, inp: Float[Tensor, "*bs 4"], trajectory: Float[Tensor, "*bs 3"], loss_func, ): inpx, inpt = inp[:, :3], inp[:, 3:] R, t = self(inp) rec_traj = torch.bmm(R, inpx.unsqueeze(-1)).squeeze(-1) + t rec_loss = loss_func(rec_traj, trajectory) return rec_loss ================================================ FILE: physdreamer/fields/offset_field.py ================================================ import torch import torch.nn.functional as F from jaxtyping import Float, Int, Shaped from torch import Tensor, nn from typing import Literal, Optional, Sequence, Tuple, List from physdreamer.field_components.encoding import ( TemporalKplanesEncoding, TriplanesEncoding, ) from physdreamer.field_components.mlp import MLP from physdreamer.operators.rotation import rotation_6d_to_matrix, quaternion_to_matrix from physdreamer.data.scene_box import SceneBox class TemporalKplanesOffsetfields(nn.Module): """Temporal Offsets fields. Args: aabb: axis-aligned bounding box. aabb[0] is the minimum (x,y,z) point. aabb[1] is the maximum (x,y,z) point. resolutions: resolutions of the kplanes. in an order of [x, y, z ,t]. 
""" def __init__( self, aabb: Float[Tensor, "2 3"], resolutions: Sequence[int], feat_dim: int = 64, init_a: float = 0.1, init_b: float = 0.5, reduce: Literal["sum", "product", "cat"] = "sum", num_decoder_layers=2, decoder_hidden_size=64, add_spatial_triplane: bool = True, zero_init: bool = True, ): super().__init__() self.register_buffer("aabb", aabb) self.output_dim = 3 self.temporal_kplanes_encoding = TemporalKplanesEncoding( resolutions, feat_dim, init_a, init_b, reduce ) self.add_spatial_triplane = add_spatial_triplane if add_spatial_triplane: self.spatial_kplanes_encoding = TriplanesEncoding( resolutions[:-1], feat_dim, init_a, init_b, reduce ) feat_dim = feat_dim * 2 self.decoder = MLP( feat_dim, num_decoder_layers, layer_width=decoder_hidden_size, out_dim=self.output_dim, skip_connections=None, activation=nn.ReLU(), out_activation=None, zero_init=zero_init, ) def forward( self, inp: Float[Tensor, "*bs 4"] ) -> Tuple[Float[Tensor, "*bs 3 3"], Float[Tensor, "*bs 3"]]: inpx, inpt = inp[:, :3], inp[:, 3:] # shift to [-1, 1] inpx = SceneBox.get_normalized_positions(inpx, self.aabb) * 2.0 - 1.0 inpt = inpt * 2.0 - 1.0 inp = torch.cat([inpx, inpt], dim=-1) output = self.temporal_kplanes_encoding(inp) if self.add_spatial_triplane: spatial_output = self.spatial_kplanes_encoding(inpx) output = torch.cat([output, spatial_output], dim=-1) output = self.decoder(output) return output def compute_smoothess_loss( self, ): smothness_loss = self.temporal_kplanes_encoding.compute_plane_tv() temporal_smoothness_loss = ( self.temporal_kplanes_encoding.compute_temporal_smoothness() ) if self.add_spatial_triplane: smothness_loss += self.spatial_kplanes_encoding.compute_plane_tv() return smothness_loss + temporal_smoothness_loss def compute_loss( self, inp: Float[Tensor, "*bs 4"], trajectory: Float[Tensor, "*bs 3"], loss_func, ): inpx, inpt = inp[:, :3], inp[:, 3:] output = self(inp) rec_traj = inpx + output rec_loss = loss_func(rec_traj, trajectory) return rec_loss def 
arap_loss(self, inp): pass def forward_with_plane_coefs( self, plane_coefs: List[Float[Tensor, "feat_dim H W"]], inp: Float[Tensor, "*bs 4"], ): """ Args: pass """ inpx, inpt = inp[:, :3], inp[:, 3:] # shift to [-1, 1] inpx = SceneBox.get_normalized_positions(inpx, self.aabb) * 2.0 - 1.0 inpt = inpt * 2.0 - 1.0 inp = torch.cat([inpx, inpt], dim=-1) output = self.temporal_kplanes_encoding.functional_forward( plane_coefs, inp, reduce=self.temporal_kplanes_encoding.reduce ) if self.add_spatial_triplane: spatial_output = self.spatial_kplanes_encoding(inpx) output = torch.cat([output, spatial_output], dim=-1) output = self.decoder(output) return output ================================================ FILE: physdreamer/fields/se3_field.py ================================================ import torch import torch.nn.functional as F from jaxtyping import Float, Int, Shaped from torch import Tensor, nn from typing import Literal, Optional, Sequence, Tuple from physdreamer.field_components.encoding import ( TemporalKplanesEncoding, TriplanesEncoding, ) from physdreamer.field_components.mlp import MLP from physdreamer.operators.rotation import rotation_6d_to_matrix, quaternion_to_matrix from physdreamer.data.scene_box import SceneBox class TemporalKplanesSE3fields(nn.Module): """Temporal Kplanes SE(3) fields. Args: aabb: axis-aligned bounding box. aabb[0] is the minimum (x,y,z) point. aabb[1] is the maximum (x,y,z) point. resolutions: resolutions of the kplanes. in an order of [x, y, z ,t]. 
""" def __init__( self, aabb: Float[Tensor, "2 3"], resolutions: Sequence[int], feat_dim: int = 64, init_a: float = 0.1, init_b: float = 0.5, reduce: Literal["sum", "product", "cat"] = "sum", num_decoder_layers=2, decoder_hidden_size=64, rotation_type: Literal["quaternion", "6d"] = "6d", add_spatial_triplane: bool = True, zero_init: bool = True, ): super().__init__() self.register_buffer("aabb", aabb) output_dim_dict = {"quaternion": 4 + 3, "6d": 6 + 3} self.output_dim = output_dim_dict[rotation_type] self.rotation_type = rotation_type self.temporal_kplanes_encoding = TemporalKplanesEncoding( resolutions, feat_dim, init_a, init_b, reduce ) self.add_spatial_triplane = add_spatial_triplane if add_spatial_triplane: self.spatial_kplanes_encoding = TriplanesEncoding( resolutions[:-1], feat_dim, init_a, init_b, reduce ) feat_dim = feat_dim * 2 self.decoder = MLP( feat_dim, num_decoder_layers, layer_width=decoder_hidden_size, out_dim=self.output_dim, skip_connections=None, activation=nn.ReLU(), out_activation=None, zero_init=zero_init, ) def forward( self, inp: Float[Tensor, "*bs 4"], compute_smoothess_loss: bool = False, ) -> Tuple[Float[Tensor, "*bs 3 3"], Float[Tensor, "*bs 3"]]: if compute_smoothess_loss: smothness_loss, temporal_smoothness_loss = self.compute_smoothess_loss() return smothness_loss + temporal_smoothness_loss inpx, inpt = inp[:, :3], inp[:, 3:] # shift to [-1, 1] inpx = SceneBox.get_normalized_positions(inpx, self.aabb) * 2.0 - 1.0 inpt = inpt * 2.0 - 1.0 inp = torch.cat([inpx, inpt], dim=-1) output = self.temporal_kplanes_encoding(inp) if self.add_spatial_triplane: spatial_output = self.spatial_kplanes_encoding(inpx) output = torch.cat([output, spatial_output], dim=-1) output = self.decoder(output) if self.rotation_type == "6d": rotation_6d, translation = output[:, :6], output[:, 6:] R_mat = rotation_6d_to_matrix(rotation_6d) elif self.rotation_type == "quaternion": quat, translation = output[:, :4], output[:, 4:] # tanh and normalize quat = 
torch.tanh(quat) R_mat = quaternion_to_matrix(quat) # --------------- remove below --------------- # # add normalization # r = quat # norm = torch.sqrt( # r[:, 0] * r[:, 0] # + r[:, 1] * r[:, 1] # + r[:, 2] * r[:, 2] # + r[:, 3] * r[:, 3] # ) # q = r / norm[:, None] # R_mat = q # --------------- remove above --------------- # return R_mat, translation def compute_smoothess_loss( self, ): smothness_loss = self.temporal_kplanes_encoding.compute_plane_tv() temporal_smoothness_loss = ( self.temporal_kplanes_encoding.compute_temporal_smoothness() ) if self.add_spatial_triplane: smothness_loss += self.spatial_kplanes_encoding.compute_plane_tv() return smothness_loss, temporal_smoothness_loss def compute_loss( self, inp: Float[Tensor, "*bs 4"], trajectory: Float[Tensor, "*bs 3"], loss_func, ): inpx, inpt = inp[:, :3], inp[:, 3:] R, t = self(inp) rec_traj = torch.bmm(R, inpx.unsqueeze(-1)).squeeze(-1) + t rec_loss = loss_func(rec_traj, trajectory) return rec_loss ================================================ FILE: physdreamer/fields/triplane_field.py ================================================ import torch import torch.nn.functional as F from jaxtyping import Float, Int, Shaped from torch import Tensor, nn from typing import Optional, Sequence, Tuple, List from physdreamer.field_components.encoding import TriplanesEncoding from physdreamer.field_components.mlp import MLP from physdreamer.data.scene_box import SceneBox class TriplaneFields(nn.Module): """Temporal Kplanes SE(3) fields. Args: aabb: axis-aligned bounding box. aabb[0] is the minimum (x,y,z) point. aabb[1] is the maximum (x,y,z) point. resolutions: resolutions of the kplanes. 
def compute_entropy(p):
    """Mean Shannon entropy of the probability rows of ``p`` (reduced over dim 1).

    A small epsilon inside the log guards against log(0) for hard assignments.
    """
    log_p = torch.log(p + 1e-5)
    per_row_entropy = -torch.sum(p * log_p, dim=1)
    return per_row_entropy.mean()
in an order of [x, y, z] """ def __init__( self, aabb: Float[Tensor, "2 3"], resolutions: Sequence[int], feat_dim: int = 64, init_a: float = 0.1, init_b: float = 0.5, reduce="sum", #: Literal["sum", "product", "cat"] = "sum", num_decoder_layers=2, decoder_hidden_size=64, output_dim: int = 96, zero_init: bool = False, num_cls: int = 3, ): super().__init__() self.register_buffer("aabb", aabb) self.output_dim = output_dim self.num_cls = num_cls self.kplanes_encoding = TriplanesEncoding( resolutions, feat_dim, init_a, init_b, reduce ) self.decoder = MLP( feat_dim, num_decoder_layers, layer_width=decoder_hidden_size, out_dim=self.num_cls, skip_connections=None, activation=nn.ReLU(), out_activation=None, zero_init=zero_init, ) self.cls_embedding = torch.nn.Embedding(num_cls, output_dim) def forward( self, inp: Float[Tensor, "*bs 3"] ) -> Tuple[Float[Tensor, "*bs 3 3"], Float[Tensor, "1"]]: # shift to [-1, 1] inpx = SceneBox.get_normalized_positions(inp, self.aabb) * 2.0 - 1.0 output = self.kplanes_encoding(inpx) output = self.decoder(output) prob = F.softmax(output, dim=-1) entropy = compute_entropy(prob) cls_index = torch.tensor([0, 1, 2]).to(inp.device) cls_emb = self.cls_embedding(cls_index) output = torch.matmul(prob, cls_emb) return output, entropy def compute_smoothess_loss( self, ): smothness_loss = self.kplanes_encoding.compute_plane_tv() return smothness_loss ================================================ FILE: physdreamer/gaussian_3d/README.md ================================================ This folder is mainly a copy paste from https://github.com/graphdeco-inria/gaussian-splatting We add some function to render the applied external force. ================================================ FILE: physdreamer/gaussian_3d/arguments/__init__.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. 
class GroupParams:
    """Plain namespace object holding extracted parameter values."""

    pass


class ParamGroup:
    """Registers this object's attributes as CLI arguments on ``parser``.

    Subclasses set attributes in their ``__init__`` before calling
    ``super().__init__``. An attribute named with a leading underscore also
    gets a one-letter shorthand flag (``_images`` -> ``--images`` / ``-i``).
    Boolean attributes become ``store_true`` flags.
    """

    def __init__(self, parser: ArgumentParser, name: str, fill_none=False):
        group = parser.add_argument_group(name)
        for attr, default in vars(self).items():
            # Leading underscore marks attributes that get a shorthand flag.
            use_short = attr.startswith("_")
            if use_short:
                attr = attr[1:]
            # Capture the type before (optionally) nulling the default.
            kind = type(default)
            if fill_none:
                default = None

            flags = ["--" + attr]
            if use_short:
                flags.append("-" + attr[0:1])

            if kind == bool:
                group.add_argument(*flags, default=default, action="store_true")
            else:
                group.add_argument(*flags, default=default, type=kind)

    def extract(self, args):
        """Copy the values belonging to this group out of parsed ``args``."""
        result = GroupParams()
        declared = vars(self)
        for field, value in vars(args).items():
            if field in declared or ("_" + field) in declared:
                setattr(result, field, value)
        return result
def get_combined_args(parser : ArgumentParser):
    """Merge command-line arguments with the ``cfg_args`` file saved in the model dir.

    Training saves its launch arguments as a repr'd ``Namespace`` in
    ``<model_path>/cfg_args``; this reloads them and lets any explicitly
    provided (non-None) command-line value override the saved one.
    """
    cmdlne_string = sys.argv[1:]
    cfgfile_string = "Namespace()"
    args_cmdline = parser.parse_args(cmdlne_string)

    try:
        cfgfilepath = os.path.join(args_cmdline.model_path, "cfg_args")
        print("Looking for config file in", cfgfilepath)
        with open(cfgfilepath) as cfg_file:
            print("Config file found: {}".format(cfgfilepath))
            cfgfile_string = cfg_file.read()
    except TypeError:
        # model_path is None -> os.path.join raises TypeError.
        # NOTE(review): a missing file raises FileNotFoundError, which is NOT
        # caught here; also the message below looks truncated — verify intent.
        print("Config file not found at")
        pass
    # NOTE(review): eval() of on-disk text — safe only for trusted model dirs.
    args_cfgfile = eval(cfgfile_string)

    # Command line wins over the saved config whenever a value was supplied.
    merged_dict = vars(args_cfgfile).copy()
    for k,v in vars(args_cmdline).items():
        if v != None:
            merged_dict[k] = v
    return Namespace(**merged_dict)
We will use it to make pytorch return gradients of the 2D (screen-space) means screenspace_points = ( torch.zeros_like( pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda" ) + 0 ) try: screenspace_points.retain_grad() except: pass # Set up rasterization configuration tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) raster_settings = GaussianRasterizationSettings( image_height=int(viewpoint_camera.image_height), image_width=int(viewpoint_camera.image_width), tanfovx=tanfovx, tanfovy=tanfovy, bg=bg_color, scale_modifier=scaling_modifier, viewmatrix=viewpoint_camera.world_view_transform, projmatrix=viewpoint_camera.full_proj_transform, sh_degree=pc.active_sh_degree, campos=viewpoint_camera.camera_center, prefiltered=False, debug=pipe.debug, ) rasterizer = GaussianRasterizer(raster_settings=raster_settings) means3D = pc.get_xyz means2D = screenspace_points opacity = pc.get_opacity # If precomputed 3d covariance is provided, use it. If not, then it will be computed from # scaling / rotation by the rasterizer. scales = None rotations = None cov3D_precomp = None if pipe.compute_cov3D_python: cov3D_precomp = pc.get_covariance(scaling_modifier) else: scales = pc.get_scaling rotations = pc.get_rotation # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer. 
shs = None colors_precomp = None # project point motion to 2D using camera: w2c = viewpoint_camera.world_view_transform.transpose(0, 1) cam_plane_2_img = viewpoint_camera.cam_plane_2_img # [2, 2] R = w2c[:3, :3].unsqueeze(0) # [1, 3, 3] t = w2c[:3, 3].unsqueeze(0) # [1, 3] # [N, 3, 1] pts = torch.cat([pc._xyz, torch.ones_like(pc._xyz[:, 0:1])], dim=-1) pts_cam = w2c.unsqueeze(0) @ pts.unsqueeze(-1) # [N, 4, 1] # pts_cam = R @ (pc._xyz.unsqueeze(-1)) + t[:, None] depth = pts_cam[:, 2, 0] # [N] # print("depth", depth.shape, depth.max(), depth.mean(), depth.min()) # [N, 2] pts_cam_xy = pts_cam[:, :2, 0] / depth.unsqueeze(-1) pts_cam_xy_pixel = cam_plane_2_img.unsqueeze(0) @ pts_cam_xy.unsqueeze( -1 ) # [N, 2, 1] pts_cam_xy_pixel = pts_cam_xy_pixel.squeeze(-1) # [N, 2] colors_precomp = torch.cat( [pts_cam_xy_pixel, depth.unsqueeze(dim=-1)], dim=-1 ) # [N, 3] # print("converted 2D motion precompute: ", colors_precomp.shape, shs, colors_precomp.max(), colors_precomp.min(), colors_precomp.mean()) # Rasterize visible Gaussians to image, obtain their radii (on screen). rendered_image, radii = rasterizer( means3D=means3D, means2D=means2D, shs=shs, colors_precomp=colors_precomp, opacities=opacity, scales=scales, rotations=rotations, cov3D_precomp=cov3D_precomp, ) # Those Gaussians that were frustum culled or had a radius of 0 were not visible. # They will be excluded from value updates used in the splitting criteria. 
return { "render": rendered_image, "visibility_filter": radii > 0, "radii": radii, "pts_depth": depth, "pts_cam_xy_pixel": pts_cam_xy_pixel, } ================================================ FILE: physdreamer/gaussian_3d/gaussian_renderer/feat_render.py ================================================ import torch from physdreamer.gaussian_3d.scene.gaussian_model import GaussianModel import math from diff_gaussian_rasterization import ( GaussianRasterizationSettings, GaussianRasterizer, ) from typing import Callable def render_feat_gaussian( viewpoint_camera, pc: GaussianModel, pipe, bg_color: torch.Tensor, points_feat: torch.Tensor, scaling_modifier=1.0, ): """ Render the scene. Background tensor (bg_color) must be on GPU! Args: point_disp: [N, 3] """ # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means screenspace_points = ( torch.zeros_like( pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda" ) + 0 ) try: screenspace_points.retain_grad() except: pass # Set up rasterization configuration tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) raster_settings = GaussianRasterizationSettings( image_height=int(viewpoint_camera.image_height), image_width=int(viewpoint_camera.image_width), tanfovx=tanfovx, tanfovy=tanfovy, bg=bg_color, scale_modifier=scaling_modifier, viewmatrix=viewpoint_camera.world_view_transform, projmatrix=viewpoint_camera.full_proj_transform, sh_degree=pc.active_sh_degree, campos=viewpoint_camera.camera_center, prefiltered=False, debug=pipe.debug, ) rasterizer = GaussianRasterizer(raster_settings=raster_settings) means3D = pc.get_xyz means2D = screenspace_points opacity = pc.get_opacity # If precomputed 3d covariance is provided, use it. If not, then it will be computed from # scaling / rotation by the rasterizer. 
scales = None rotations = None cov3D_precomp = None if pipe.compute_cov3D_python: cov3D_precomp = pc.get_covariance(scaling_modifier) else: scales = pc.get_scaling rotations = pc.get_rotation # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer. shs = None colors_precomp = points_feat assert (points_feat.shape[1] == 3) and (points_feat.shape[0] == means3D.shape[0]) # print("converted 2D motion precompute: ", colors_precomp.shape, shs, colors_precomp.max(), colors_precomp.min(), colors_precomp.mean()) # Rasterize visible Gaussians to image, obtain their radii (on screen). rendered_image, radii = rasterizer( means3D=means3D, means2D=means2D, shs=shs, colors_precomp=colors_precomp, opacities=opacity, scales=scales, rotations=rotations, cov3D_precomp=cov3D_precomp, ) # Those Gaussians that were frustum culled or had a radius of 0 were not visible. # They will be excluded from value updates used in the splitting criteria. return { "render": rendered_image, "visibility_filter": radii > 0, "radii": radii, } ================================================ FILE: physdreamer/gaussian_3d/gaussian_renderer/flow_depth_render.py ================================================ import torch from physdreamer.gaussian_3d.scene.gaussian_model import GaussianModel import math from diff_gaussian_rasterization import ( GaussianRasterizationSettings, GaussianRasterizer, ) from typing import Callable def render_flow_depth_w_gaussian( viewpoint_camera, pc: GaussianModel, pipe, point_disp: torch.Tensor, bg_color: torch.Tensor, scaling_modifier=1.0, ): """ Render the scene. Background tensor (bg_color) must be on GPU! Args: point_disp: [N, 3] """ # Create zero tensor. 
We will use it to make pytorch return gradients of the 2D (screen-space) means screenspace_points = ( torch.zeros_like( pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda" ) + 0 ) try: screenspace_points.retain_grad() except: pass # Set up rasterization configuration tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) raster_settings = GaussianRasterizationSettings( image_height=int(viewpoint_camera.image_height), image_width=int(viewpoint_camera.image_width), tanfovx=tanfovx, tanfovy=tanfovy, bg=bg_color, scale_modifier=scaling_modifier, viewmatrix=viewpoint_camera.world_view_transform, projmatrix=viewpoint_camera.full_proj_transform, sh_degree=pc.active_sh_degree, campos=viewpoint_camera.camera_center, prefiltered=False, debug=pipe.debug, ) rasterizer = GaussianRasterizer(raster_settings=raster_settings) means3D = pc.get_xyz means2D = screenspace_points opacity = pc.get_opacity # If precomputed 3d covariance is provided, use it. If not, then it will be computed from # scaling / rotation by the rasterizer. scales = None rotations = None cov3D_precomp = None if pipe.compute_cov3D_python: cov3D_precomp = pc.get_covariance(scaling_modifier) else: scales = pc.get_scaling rotations = pc.get_rotation # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer. 
shs = None colors_precomp = None # project point motion to 2D using camera: w2c = viewpoint_camera.world_view_transform.transpose(0, 1) cam_plane_2_img = viewpoint_camera.cam_plane_2_img # [2, 2] R = w2c[:3, :3].unsqueeze(0) # [1, 3, 3] t = w2c[:3, 3].unsqueeze(0) # [1, 3] # [N, 3, 1] pts = torch.cat([pc._xyz, torch.ones_like(pc._xyz[:, 0:1])], dim=-1) pts_cam = w2c.unsqueeze(0) @ pts.unsqueeze(-1) # [N, 4, 1] # pts_cam = R @ (pc._xyz.unsqueeze(-1)) + t[:, None] depth = pts_cam[:, 2, 0] # [N] # print("depth", depth.shape, depth.max(), depth.mean(), depth.min()) point_disp_pad = torch.cat( [point_disp, torch.zeros_like(point_disp[:, 0:1])], dim=-1 ) # [N, 4] pts_motion = w2c.unsqueeze(0) @ point_disp_pad.unsqueeze(-1) # [N, 4, 1] # [N, 2] pts_motion_xy = pts_motion[:, :2, 0] / depth.unsqueeze(-1) pts_motion_xy_pixel = cam_plane_2_img.unsqueeze(0) @ pts_motion_xy.unsqueeze( -1 ) # [N, 2, 1] pts_motion_xy_pixel = pts_motion_xy_pixel.squeeze(-1) # [N, 2] colors_precomp = torch.cat( [pts_motion_xy_pixel, depth.unsqueeze(dim=-1)], dim=-1 ) # [N, 3] # print("converted 2D motion precompute: ", colors_precomp.shape, shs, colors_precomp.max(), colors_precomp.min(), colors_precomp.mean()) # Rasterize visible Gaussians to image, obtain their radii (on screen). rendered_image, radii = rasterizer( means3D=means3D, means2D=means2D, shs=shs, colors_precomp=colors_precomp, opacities=opacity, scales=scales, rotations=rotations, cov3D_precomp=cov3D_precomp, ) # Those Gaussians that were frustum culled or had a radius of 0 were not visible. # They will be excluded from value updates used in the splitting criteria. 
def render_gaussian(
    viewpoint_camera,
    pc: GaussianModel,
    pipe,
    bg_color: torch.Tensor,
    scaling_modifier=1.0,
    override_color=None,
    cov3D_precomp=None,
):
    """
    Render the scene from ``viewpoint_camera``.

    Background tensor (bg_color) must be on GPU!

    Args:
        pc: the Gaussian point cloud to render.
        pipe: pipeline flags (convert_SHs_python, compute_cov3D_python, debug).
        scaling_modifier: global multiplier on Gaussian scales.
        override_color: optional [N, 3] precomputed colors; bypasses SH.
        cov3D_precomp: optional precomputed 3D covariances; bypasses
            scaling/rotation handling.

    Returns:
        dict with "render", "viewspace_points", "visibility_filter", "radii".
    """
    # Zero tensor so pytorch returns gradients of the 2D (screen-space) means.
    screenspace_points = (
        torch.zeros_like(
            pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda"
        )
        + 0
    )
    try:
        screenspace_points.retain_grad()
    except Exception:
        # retain_grad is best-effort (fails outside autograd); don't mask
        # unrelated errors with a bare except.
        pass

    # Set up rasterization configuration.
    tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
    tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)

    raster_settings = GaussianRasterizationSettings(
        image_height=int(viewpoint_camera.image_height),
        image_width=int(viewpoint_camera.image_width),
        tanfovx=tanfovx,
        tanfovy=tanfovy,
        bg=bg_color,
        scale_modifier=scaling_modifier,
        viewmatrix=viewpoint_camera.world_view_transform,
        projmatrix=viewpoint_camera.full_proj_transform,
        sh_degree=pc.active_sh_degree,
        campos=viewpoint_camera.camera_center,
        prefiltered=False,
        debug=pipe.debug,
    )
    rasterizer = GaussianRasterizer(raster_settings=raster_settings)

    means3D = pc.get_xyz
    means2D = screenspace_points
    opacity = pc.get_opacity

    # If a precomputed 3D covariance is provided use it; otherwise compute it
    # in python when requested, else hand scales/rotations to the rasterizer.
    # BUGFIX: the previous condition
    #     if pipe.compute_cov3D_python or cov3D_precomp is None: ...
    #     elif cov3D_precomp is None: ...
    # made the elif unreachable, so scales/rotations were never forwarded and
    # python covariance was computed even when not requested. This restores
    # the upstream gaussian-splatting semantics.
    scales = None
    rotations = None
    if cov3D_precomp is None:
        if pipe.compute_cov3D_python:
            cov3D_precomp = pc.get_covariance(scaling_modifier)
        else:
            scales = pc.get_scaling
            rotations = pc.get_rotation

    # Colors: explicit override > python SH evaluation > rasterizer SH path.
    shs = None
    colors_precomp = None
    if override_color is None:
        if pipe.convert_SHs_python:
            shs_view = pc.get_features.transpose(1, 2).view(
                -1, 3, (pc.max_sh_degree + 1) ** 2
            )
            dir_pp = pc.get_xyz - viewpoint_camera.camera_center.repeat(
                pc.get_features.shape[0], 1
            )
            dir_pp_normalized = dir_pp / dir_pp.norm(dim=1, keepdim=True)
            sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized)
            colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
        else:
            shs = pc.get_features
    else:
        colors_precomp = override_color

    # Rasterize visible Gaussians to image, obtain their radii (on screen).
    rendered_image, radii = rasterizer(
        means3D=means3D,
        means2D=means2D,
        shs=shs,
        colors_precomp=colors_precomp,
        opacities=opacity,
        scales=scales,
        rotations=rotations,
        cov3D_precomp=cov3D_precomp,
    )

    # Frustum-culled / radius-0 Gaussians were not visible; visibility_filter
    # excludes them from the splitting criteria updates.
    return {
        "render": rendered_image,
        "viewspace_points": screenspace_points,
        "visibility_filter": radii > 0,
        "radii": radii,
    }
def render_arrow_in_screen_back(viewpoint_camera, points_3d):
    """Project 3D world points to pixel coordinates, returned in (x, y) order.

    Args:
        points_3d: [N, 3] world-space points.

    Returns:
        [N, 2] pixel coordinates (x, y).
    """
    # World-to-camera: the stored transform is transposed, undo that.
    w2c = viewpoint_camera.world_view_transform.transpose(0, 1)
    # [2, 2] camera-plane-to-pixel matrix.
    # BUGFIX: this value was immediately overwritten with the [4, 4]
    # projection matrix (debug leftover), which shape-mismatches the [2, 2]
    # matmul below; the stray `from IPython import embed; embed()` breakpoint
    # left in the function is removed as well.
    cam_plane_2_img = viewpoint_camera.cam_plane_2_img

    # Homogeneous coordinates: [N, 4] -> camera space [N, 4, 1].
    pts = torch.cat([points_3d, torch.ones_like(points_3d[:, 0:1])], dim=-1)
    pts_cam = w2c.unsqueeze(0) @ pts.unsqueeze(-1)  # [N, 4, 1]

    depth = pts_cam[:, 2, 0]  # [N]

    # Perspective divide, then map camera-plane coords to pixels. [N, 2]
    pts_cam_xy = pts_cam[:, :2, 0] / depth.unsqueeze(-1)
    pts_cam_xy_pixel = cam_plane_2_img.unsqueeze(0) @ pts_cam_xy.unsqueeze(
        -1
    )  # [N, 2, 1]
    pts_cam_xy_pixel = pts_cam_xy_pixel.squeeze(-1)  # [N, 2]

    # [N, 2] yx => xy
    pts_cam_xy_pixel = torch.cat(
        [pts_cam_xy_pixel[:, [1]], pts_cam_xy_pixel[:, [0]]], dim=-1
    )
    return pts_cam_xy_pixel
C4 = [
    2.5033429417967046,
    -1.7701307697799304,
    0.9461746957575601,
    -0.6690465435572892,
    0.10578554691520431,
    -0.6690465435572892,
    0.47308734787878004,
    -1.7701307697799304,
    0.6258357354491761,
]


def eval_sh(deg, sh, dirs):
    """
    Evaluate spherical harmonics at unit directions
    using hardcoded SH polynomials.
    Works with torch/np/jnp.
    ... Can be 0 or more batch dimensions.
    Args:
        deg: int SH deg. Currently, 0-4 supported
        sh: jnp.ndarray SH coeffs [..., C, (deg + 1) ** 2]
        dirs: jnp.ndarray unit directions [..., 3]
    Returns:
        [..., C]
    """
    assert deg <= 4 and deg >= 0
    coeff = (deg + 1) ** 2
    # There must be at least enough coefficients for the requested degree.
    assert sh.shape[-1] >= coeff

    # Band 0: constant term.
    result = C0 * sh[..., 0]
    if deg > 0:
        x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
        # Band 1: linear terms (note the -y, +z, -x ordering convention).
        result = (
            result - C1 * y * sh[..., 1] + C1 * z * sh[..., 2] - C1 * x * sh[..., 3]
        )

        if deg > 1:
            xx, yy, zz = x * x, y * y, z * z
            xy, yz, xz = x * y, y * z, x * z
            # Band 2: quadratic polynomials.
            result = (
                result
                + C2[0] * xy * sh[..., 4]
                + C2[1] * yz * sh[..., 5]
                + C2[2] * (2.0 * zz - xx - yy) * sh[..., 6]
                + C2[3] * xz * sh[..., 7]
                + C2[4] * (xx - yy) * sh[..., 8]
            )

            if deg > 2:
                # Band 3: cubic polynomials.
                result = (
                    result
                    + C3[0] * y * (3 * xx - yy) * sh[..., 9]
                    + C3[1] * xy * z * sh[..., 10]
                    + C3[2] * y * (4 * zz - xx - yy) * sh[..., 11]
                    + C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12]
                    + C3[4] * x * (4 * zz - xx - yy) * sh[..., 13]
                    + C3[5] * z * (xx - yy) * sh[..., 14]
                    + C3[6] * x * (xx - 3 * yy) * sh[..., 15]
                )

                if deg > 3:
                    # Band 4: quartic polynomials.
                    result = (
                        result
                        + C4[0] * xy * (xx - yy) * sh[..., 16]
                        + C4[1] * yz * (3 * xx - yy) * sh[..., 17]
                        + C4[2] * xy * (7 * zz - 1) * sh[..., 18]
                        + C4[3] * yz * (7 * zz - 3) * sh[..., 19]
                        + C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20]
                        + C4[5] * xz * (7 * zz - 3) * sh[..., 21]
                        + C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22]
                        + C4[7] * xz * (xx - 3 * yy) * sh[..., 23]
                        + C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24]
                    )
    return result
def SH2RGB(sh):
    """Inverse of RGB2SH: map DC spherical-harmonic coefficients back to RGB."""
    scaled = sh * C0
    return scaled + 0.5
    def save(self, iteration):
        """Write the current Gaussians to
        ``<model_path>/point_cloud/iteration_<iteration>/point_cloud.ply``."""
        point_cloud_path = os.path.join(
            self.model_path, "point_cloud/iteration_{}".format(iteration)
        )
        self.gaussians.save_ply(os.path.join(point_cloud_path, "point_cloud.ply"))

    def getTrainCameras(self, scale=1.0):
        # Cameras are bucketed by resolution scale (populated in __init__).
        return self.train_cameras[scale]

    def getTestCameras(self, scale=1.0):
        # Same bucketing as the training cameras.
        return self.test_cameras[scale]
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact  george.drettakis@inria.fr
#

import torch
from torch import nn
import numpy as np
from physdreamer.gaussian_3d.utils.graphics_utils import (
    getWorld2View2,
    getProjectionMatrix,
)


class Camera(nn.Module):
    """A single training/eval camera that owns its ground-truth image.

    Stores the COLMAP-style extrinsics (R, T), the vertical/horizontal
    fields of view, the (alpha-composited) ground-truth image, and the
    precomputed transforms the CUDA rasterizer consumes:
    ``world_view_transform`` (w2c, transposed for glm), ``projection_matrix``,
    and their product ``full_proj_transform``.
    """

    def __init__(
        self,
        colmap_id,
        R,
        T,
        FoVx,
        FoVy,
        image,
        gt_alpha_mask,
        image_name,
        uid,
        trans=np.array([0.0, 0.0, 0.0]),
        scale=1.0,
        data_device="cuda",
    ):
        super(Camera, self).__init__()

        self.uid = uid
        self.colmap_id = colmap_id
        self.R = R
        self.T = T
        self.FoVx = FoVx
        self.FoVy = FoVy
        self.image_name = image_name

        # Fall back to the default CUDA device if the requested one is invalid.
        try:
            self.data_device = torch.device(data_device)
        except Exception as e:
            print(e)
            print(
                f"[Warning] Custom device {data_device} failed, fallback to default cuda device"
            )
            self.data_device = torch.device("cuda")

        # Image is expected as a [C, H, W] float tensor in [0, 1].
        self.original_image = image.clamp(0.0, 1.0).to(self.data_device)
        self.image_width = self.original_image.shape[2]
        self.image_height = self.original_image.shape[1]

        # Pre-multiply by the alpha mask (or by ones when absent, which is a
        # no-op but keeps the code path uniform).
        if gt_alpha_mask is not None:
            self.original_image *= gt_alpha_mask.to(self.data_device)
        else:
            self.original_image *= torch.ones(
                (1, self.image_height, self.image_width), device=self.data_device
            )

        self.zfar = 100.0
        self.znear = 0.01

        self.trans = trans
        self.scale = scale

        # World-to-camera, transposed (the CUDA code uses glm column-major
        # conventions). [4, 4]
        self.world_view_transform = (
            torch.tensor(getWorld2View2(R, T, trans, scale)).transpose(0, 1).cuda()
        )
        self.projection_matrix = (
            getProjectionMatrix(
                znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy
            )
            .transpose(0, 1)
            .cuda()
        )
        # points @ full_proj_transform => clip space. [4, 4]
        self.full_proj_transform = (
            self.world_view_transform.unsqueeze(0).bmm(
                self.projection_matrix.unsqueeze(0)
            )
        ).squeeze(0)
        # Camera position in world space: last row of the inverse of the
        # transposed w2c matrix.
        self.camera_center = self.world_view_transform.inverse()[3, :3]


class MiniCam:
    """A lightweight camera carrying only what the rasterizer needs.

    Unlike :class:`Camera`, it holds no ground-truth image and receives the
    view/projection transforms precomputed by the caller.
    """

    def __init__(
        self,
        width,
        height,
        fovy,
        fovx,
        znear,
        zfar,
        world_view_transform,
        full_proj_transform,
    ):
        self.image_width = width
        self.image_height = height
        self.FoVy = fovy
        self.FoVx = fovx
        self.znear = znear
        self.zfar = zfar
        self.world_view_transform = world_view_transform
        self.full_proj_transform = full_proj_transform
        # Camera center in world space, recovered from the inverse view matrix.
        view_inv = torch.inverse(self.world_view_transform)
        self.camera_center = view_inv[3][:3]


================================================
FILE: physdreamer/gaussian_3d/scene/colmap_loader.py
================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact  george.drettakis@inria.fr
#

import numpy as np
import collections
import struct

# Lightweight records mirroring COLMAP's sparse-reconstruction data model.
CameraModel = collections.namedtuple(
    "CameraModel", ["model_id", "model_name", "num_params"])
Camera = collections.namedtuple(
    "Camera", ["id", "model", "width", "height", "params"])
BaseImage = collections.namedtuple(
    "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"])
Point3D = collections.namedtuple(
    "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"])

# All camera models COLMAP can emit; only PINHOLE / SIMPLE_PINHOLE are
# actually consumed downstream, but the ids are needed to parse cameras.bin.
CAMERA_MODELS = {
    CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3),
    CameraModel(model_id=1, model_name="PINHOLE", num_params=4),
    CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4),
    CameraModel(model_id=3, model_name="RADIAL", num_params=5),
    CameraModel(model_id=4, model_name="OPENCV", num_params=8),
    CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8),
    CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12),
    CameraModel(model_id=7, model_name="FOV", num_params=5),
    CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4),
    CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5),
    CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12)
}
# Lookup tables by numeric id (binary files) and by name (text files).
CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model)
                         for camera_model in CAMERA_MODELS])
CAMERA_MODEL_NAMES = dict([(camera_model.model_name, camera_model)
                           for camera_model in CAMERA_MODELS])


def qvec2rotmat(qvec):
    """Convert a quaternion [w, x, y, z] to a 3x3 rotation matrix."""
    return np.array([
        [1 - 2 * qvec[2]**2 - 2 * qvec[3]**2,
         2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3],
         2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]],
        [2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3],
         1 - 2 * qvec[1]**2 - 2 * qvec[3]**2,
         2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]],
        [2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2],
         2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1],
         1 - 2 * qvec[1]**2 - 2 * qvec[2]**2]])


def rotmat2qvec(R):
    """Convert a 3x3 rotation matrix to a quaternion [w, x, y, z].

    Uses the eigenvector of the symmetric 4x4 matrix K associated with its
    largest eigenvalue (Shepperd/Markley method); the sign is normalized so
    that w >= 0.
    """
    Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat
    K = np.array([
        [Rxx - Ryy - Rzz, 0, 0, 0],
        [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0],
        [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0],
        [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0
    eigvals, eigvecs = np.linalg.eigh(K)
    qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)]
    if qvec[0] < 0:
        qvec *= -1
    return qvec


class Image(BaseImage):
    """COLMAP image record with a convenience rotation-matrix accessor."""

    def qvec2rotmat(self):
        return qvec2rotmat(self.qvec)


def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"):
    """Read and unpack the next bytes from a binary file.
    :param fid:
    :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc.
    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
    :param endian_character: Any of {@, =, <, >, !}
    :return: Tuple of read and unpacked values.
    """
    data = fid.read(num_bytes)
    return struct.unpack(endian_character + format_char_sequence, data)


def read_points3D_text(path):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::ReadPoints3DText(const std::string& path)
        void Reconstruction::WritePoints3DText(const std::string& path)
    """
    xyzs = None
    rgbs = None
    errors = None
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            # Skip blank lines and '#' comment lines.
            if len(line) > 0 and line[0] != "#":
                elems = line.split()
                # Per line: POINT3D_ID X Y Z R G B ERROR TRACK[...]
                xyz = np.array(tuple(map(float, elems[1:4])))
                rgb = np.array(tuple(map(int, elems[4:7])))
                error = np.array(float(elems[7]))
                if xyzs is None:
                    xyzs = xyz[None, ...]
                    rgbs = rgb[None, ...]
                    errors = error[None, ...]
                else:
                    # NOTE: np.append per point is O(n^2) overall; acceptable
                    # for typical sparse-model sizes.
                    xyzs = np.append(xyzs, xyz[None, ...], axis=0)
                    rgbs = np.append(rgbs, rgb[None, ...], axis=0)
                    errors = np.append(errors, error[None, ...], axis=0)
    return xyzs, rgbs, errors


def read_points3D_binary(path_to_model_file):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::ReadPoints3DBinary(const std::string& path)
        void Reconstruction::WritePoints3DBinary(const std::string& path)
    """
    with open(path_to_model_file, "rb") as fid:
        num_points = read_next_bytes(fid, 8, "Q")[0]

        xyzs = np.empty((num_points, 3))
        rgbs = np.empty((num_points, 3))
        errors = np.empty((num_points, 1))

        for p_id in range(num_points):
            # Fixed-size header: id (Q) + xyz (ddd) + rgb (BBB) + error (d).
            binary_point_line_properties = read_next_bytes(
                fid, num_bytes=43, format_char_sequence="QdddBBBd")
            xyz = np.array(binary_point_line_properties[1:4])
            rgb = np.array(binary_point_line_properties[4:7])
            error = np.array(binary_point_line_properties[7])
            # Variable-length track: (image_id, point2D_idx) pairs, read and
            # discarded here.
            track_length = read_next_bytes(
                fid, num_bytes=8, format_char_sequence="Q")[0]
            track_elems = read_next_bytes(
                fid, num_bytes=8*track_length,
                format_char_sequence="ii"*track_length)
            xyzs[p_id] = xyz
            rgbs[p_id] = rgb
            errors[p_id] = error
    return xyzs, rgbs, errors


def read_intrinsics_text(path):
    """
    Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py
    """
    cameras = {}
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                elems = line.split()
                camera_id = int(elems[0])
                model = elems[1]
                assert model == "PINHOLE", "While the loader support other types, the rest of the code assumes PINHOLE"
                width = int(elems[2])
                height = int(elems[3])
                params = np.array(tuple(map(float, elems[4:])))
                cameras[camera_id] = Camera(id=camera_id, model=model,
                                            width=width, height=height,
                                            params=params)
    return cameras


def read_extrinsics_binary(path_to_model_file):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::ReadImagesBinary(const std::string& path)
        void Reconstruction::WriteImagesBinary(const std::string& path)
    """
    images = {}
    with open(path_to_model_file, "rb") as fid:
        num_reg_images = read_next_bytes(fid, 8, "Q")[0]
        for _ in range(num_reg_images):
            # id (i) + qvec (dddd) + tvec (ddd) + camera_id (i).
            binary_image_properties = read_next_bytes(
                fid, num_bytes=64, format_char_sequence="idddddddi")
            image_id = binary_image_properties[0]
            qvec = np.array(binary_image_properties[1:5])
            tvec = np.array(binary_image_properties[5:8])
            camera_id = binary_image_properties[8]
            image_name = ""
            current_char = read_next_bytes(fid, 1, "c")[0]
            while current_char != b"\x00":   # look for the ASCII 0 entry
                image_name += current_char.decode("utf-8")
                current_char = read_next_bytes(fid, 1, "c")[0]
            num_points2D = read_next_bytes(fid, num_bytes=8,
                                           format_char_sequence="Q")[0]
            # Each 2D observation is (x, y, point3D_id) = (d, d, q) = 24 bytes.
            x_y_id_s = read_next_bytes(fid, num_bytes=24*num_points2D,
                                       format_char_sequence="ddq"*num_points2D)
            xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])),
                                   tuple(map(float, x_y_id_s[1::3]))])
            point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3])))
            images[image_id] = Image(
                id=image_id, qvec=qvec, tvec=tvec,
                camera_id=camera_id, name=image_name,
                xys=xys, point3D_ids=point3D_ids)
    return images


def read_intrinsics_binary(path_to_model_file):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::WriteCamerasBinary(const std::string& path)
        void Reconstruction::ReadCamerasBinary(const std::string& path)
    """
    cameras = {}
    with open(path_to_model_file, "rb") as fid:
        num_cameras = read_next_bytes(fid, 8, "Q")[0]
        for _ in range(num_cameras):
            camera_properties = read_next_bytes(
                fid, num_bytes=24, format_char_sequence="iiQQ")
            camera_id = camera_properties[0]
            model_id = camera_properties[1]
            model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name
            width = camera_properties[2]
            height = camera_properties[3]
            # The number of trailing doubles depends on the camera model.
            num_params = CAMERA_MODEL_IDS[model_id].num_params
            params = read_next_bytes(fid, num_bytes=8*num_params,
                                     format_char_sequence="d"*num_params)
            cameras[camera_id] = Camera(id=camera_id,
                                        model=model_name,
                                        width=width,
                                        height=height,
                                        params=np.array(params))
        assert len(cameras) == num_cameras
    return cameras


def read_extrinsics_text(path):
    """
    Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py
    """
    images = {}
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                elems = line.split()
                image_id = int(elems[0])
                qvec = np.array(tuple(map(float, elems[1:5])))
                tvec = np.array(tuple(map(float, elems[5:8])))
                camera_id = int(elems[8])
                image_name = elems[9]
                # The next line holds the 2D observations:
                # x, y, point3D_id triplets.
                elems = fid.readline().split()
                xys = np.column_stack([tuple(map(float, elems[0::3])),
                                       tuple(map(float, elems[1::3]))])
                point3D_ids = np.array(tuple(map(int, elems[2::3])))
                images[image_id] = Image(
                    id=image_id, qvec=qvec, tvec=tvec,
                    camera_id=camera_id, name=image_name,
                    xys=xys, point3D_ids=point3D_ids)
    return images


def read_colmap_bin_array(path):
    """
    Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_dense.py

    :param path: path to the colmap binary file.
    :return: nd array with the floating point values in the value
    """
    with open(path, "rb") as fid:
        # Header: "width&height&channels&" followed by raw float32 data.
        width, height, channels = np.genfromtxt(fid, delimiter="&", max_rows=1,
                                                usecols=(0, 1, 2), dtype=int)
        fid.seek(0)
        num_delimiter = 0
        byte = fid.read(1)
        # Re-scan to position the file pointer just past the third '&'.
        while True:
            if byte == b"&":
                num_delimiter += 1
                if num_delimiter >= 3:
                    break
            byte = fid.read(1)
        array = np.fromfile(fid, np.float32)
    # Data is stored column-major; transpose back to (H, W, C).
    array = array.reshape((width, height, channels), order="F")
    return np.transpose(array, (1, 0, 2)).squeeze()


================================================
FILE: physdreamer/gaussian_3d/scene/dataset_readers.py
================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact  george.drettakis@inria.fr
#

import os
import sys
from PIL import Image
from typing import NamedTuple
from physdreamer.gaussian_3d.scene.colmap_loader import (
    read_extrinsics_text,
    read_intrinsics_text,
    qvec2rotmat,
    read_extrinsics_binary,
    read_intrinsics_binary,
    read_points3D_binary,
    read_points3D_text,
)
from physdreamer.gaussian_3d.utils.graphics_utils import (
    getWorld2View2,
    focal2fov,
    fov2focal,
)
import numpy as np
import math
import json
from pathlib import Path
from plyfile import PlyData, PlyElement
from physdreamer.gaussian_3d.utils.sh_utils import SH2RGB
from physdreamer.gaussian_3d.scene.gaussian_model import BasicPointCloud
import torch
import torch.nn as nn
from physdreamer.gaussian_3d.utils.graphics_utils import (
    getWorld2View2,
    getProjectionMatrix,
)


class CameraInfo(NamedTuple):
    """Per-camera metadata and ground-truth image loaded from disk."""

    uid: int
    R: np.ndarray
    T: np.ndarray
    FovY: np.ndarray
    FovX: np.ndarray
    image: np.ndarray
    image_path: str
    image_name: str
    width: int
    height: int


class SceneInfo(NamedTuple):
    """Everything a Scene needs: point cloud, cameras, normalization, ply path."""

    point_cloud: BasicPointCloud
    train_cameras: list
    test_cameras: list
    nerf_normalization: dict
    ply_path: str


def getNerfppNorm(cam_info):
    """Compute the NeRF++-style scene normalization from camera centers.

    Returns a dict with ``translate`` (negative centroid of all camera
    centers) and ``radius`` (1.1x the max distance of any camera from that
    centroid), used to scale learning rates and densification thresholds.
    """

    def get_center_and_diag(cam_centers):
        cam_centers = np.hstack(cam_centers)
        avg_cam_center = np.mean(cam_centers, axis=1, keepdims=True)
        center = avg_cam_center
        dist = np.linalg.norm(cam_centers - center, axis=0, keepdims=True)
        diagonal = np.max(dist)
        return center.flatten(), diagonal

    cam_centers = []
    for cam in cam_info:
        # Camera center = translation column of the camera-to-world matrix.
        W2C = getWorld2View2(cam.R, cam.T)
        C2W = np.linalg.inv(W2C)
        cam_centers.append(C2W[:3, 3:4])

    center, diagonal = get_center_and_diag(cam_centers)
    radius = diagonal * 1.1
    translate = -center
    return {"translate": translate, "radius": radius}


def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder):
    """Build a CameraInfo per COLMAP image, loading images from images_folder.

    Only PINHOLE and SIMPLE_PINHOLE intrinsics are supported; anything else
    asserts, since the downstream rasterizer assumes an undistorted pinhole
    model.
    """
    cam_infos = []
    for idx, key in enumerate(cam_extrinsics):
        sys.stdout.write("\r")
        # the exact output you're looking for:
        sys.stdout.write("Reading camera {}/{}".format(idx + 1, len(cam_extrinsics)))
        sys.stdout.flush()

        extr = cam_extrinsics[key]
        intr = cam_intrinsics[extr.camera_id]
        height = intr.height
        width = intr.width

        uid = intr.id
        # R is stored transposed due to 'glm' conventions in the CUDA code.
        R = np.transpose(qvec2rotmat(extr.qvec))
        T = np.array(extr.tvec)

        if intr.model == "SIMPLE_PINHOLE":
            # One shared focal length for both axes.
            focal_length_x = intr.params[0]
            FovY = focal2fov(focal_length_x, height)
            FovX = focal2fov(focal_length_x, width)
        elif intr.model == "PINHOLE":
            focal_length_x = intr.params[0]
            focal_length_y = intr.params[1]
            FovY = focal2fov(focal_length_y, height)
            FovX = focal2fov(focal_length_x, width)
        else:
            assert (
                False
            ), "Colmap camera model not handled: only undistorted datasets (PINHOLE or SIMPLE_PINHOLE cameras) supported!"

        image_path = os.path.join(images_folder, os.path.basename(extr.name))
        image_name = os.path.basename(image_path).split(".")[0]
        image = Image.open(image_path)

        cam_info = CameraInfo(
            uid=uid,
            R=R,
            T=T,
            FovY=FovY,
            FovX=FovX,
            image=image,
            image_path=image_path,
            image_name=image_name,
            width=width,
            height=height,
        )
        cam_infos.append(cam_info)
    sys.stdout.write("\n")
    return cam_infos


def fetchPly(path):
    """Load a PLY file into a BasicPointCloud (positions, colors in [0,1], normals)."""
    plydata = PlyData.read(path)
    vertices = plydata["vertex"]
    positions = np.vstack([vertices["x"], vertices["y"], vertices["z"]]).T
    colors = np.vstack([vertices["red"], vertices["green"], vertices["blue"]]).T / 255.0
    normals = np.vstack([vertices["nx"], vertices["ny"], vertices["nz"]]).T
    return BasicPointCloud(points=positions, colors=colors, normals=normals)


def storePly(path, xyz, rgb):
    """Write xyz positions and uint8 rgb colors to a PLY file (normals zeroed)."""
    # Define the dtype for the structured array
    dtype = [
        ("x", "f4"),
        ("y", "f4"),
        ("z", "f4"),
        ("nx", "f4"),
        ("ny", "f4"),
        ("nz", "f4"),
        ("red", "u1"),
        ("green", "u1"),
        ("blue", "u1"),
    ]

    normals = np.zeros_like(xyz)

    elements = np.empty(xyz.shape[0], dtype=dtype)
    attributes = np.concatenate((xyz, normals, rgb), axis=1)
    elements[:] = list(map(tuple, attributes))

    # Create the PlyData object and write to file
    vertex_element = PlyElement.describe(elements, "vertex")
    ply_data = PlyData([vertex_element])
    ply_data.write(path)


def readColmapSceneInfo(path, images, eval, llffhold=8):
    """Load a COLMAP reconstruction under `path` into a SceneInfo.

    Prefers the binary sparse model (images.bin / cameras.bin / points3D.bin)
    and falls back to the text variants. When `eval` is set, every
    `llffhold`-th camera is held out for testing (LLFF convention).
    """
    # FIX: narrowed bare `except:` clauses to `except Exception:` so that
    # KeyboardInterrupt / SystemExit are not silently swallowed.
    try:
        cameras_extrinsic_file = os.path.join(path, "sparse/0", "images.bin")
        cameras_intrinsic_file = os.path.join(path, "sparse/0", "cameras.bin")
        cam_extrinsics = read_extrinsics_binary(cameras_extrinsic_file)
        cam_intrinsics = read_intrinsics_binary(cameras_intrinsic_file)
    except Exception:
        cameras_extrinsic_file = os.path.join(path, "sparse/0", "images.txt")
        cameras_intrinsic_file = os.path.join(path, "sparse/0", "cameras.txt")
        cam_extrinsics = read_extrinsics_text(cameras_extrinsic_file)
        cam_intrinsics = read_intrinsics_text(cameras_intrinsic_file)

    reading_dir = "images" if images is None else images
    cam_infos_unsorted = readColmapCameras(
        cam_extrinsics=cam_extrinsics,
        cam_intrinsics=cam_intrinsics,
        images_folder=os.path.join(path, reading_dir),
    )
    # Sort by image name for a deterministic train/test split.
    cam_infos = sorted(cam_infos_unsorted.copy(), key=lambda x: x.image_name)

    if eval:
        train_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold != 0]
        test_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold == 0]
    else:
        train_cam_infos = cam_infos
        test_cam_infos = []

    nerf_normalization = getNerfppNorm(train_cam_infos)

    ply_path = os.path.join(path, "sparse/0/points3D.ply")
    bin_path = os.path.join(path, "sparse/0/points3D.bin")
    txt_path = os.path.join(path, "sparse/0/points3D.txt")
    if not os.path.exists(ply_path):
        print(
            "Converting point3d.bin to .ply, will happen only the first time you open the scene."
        )
        try:
            xyz, rgb, _ = read_points3D_binary(bin_path)
        except Exception:
            xyz, rgb, _ = read_points3D_text(txt_path)
        storePly(ply_path, xyz, rgb)
    try:
        pcd = fetchPly(ply_path)
    except Exception:
        pcd = None

    scene_info = SceneInfo(
        point_cloud=pcd,
        train_cameras=train_cam_infos,
        test_cameras=test_cam_infos,
        nerf_normalization=nerf_normalization,
        ply_path=ply_path,
    )
    return scene_info


def readCamerasFromTransforms(path, transformsfile, white_background, extension=".png"):
    """Load Blender/NeRF-synthetic cameras from a transforms_*.json file.

    Composites each RGBA image over a white or black background and converts
    the OpenGL/Blender camera convention to COLMAP's.
    """
    cam_infos = []

    with open(os.path.join(path, transformsfile)) as json_file:
        contents = json.load(json_file)
        # camera_angle_x is the horizontal field of view
        # frames.file_path is the image name
        # frame.transform_matrix is the camera-to-world transform
        fovx = contents["camera_angle_x"]

        frames = contents["frames"]
        for idx, frame in enumerate(frames):
            cam_name = os.path.join(path, frame["file_path"] + extension)

            # NeRF 'transform_matrix' is a camera-to-world transform
            c2w = np.array(frame["transform_matrix"])
            # change from OpenGL/Blender camera axes (Y up, Z back) to COLMAP (Y down, Z forward)
            c2w[:3, 1:3] *= -1

            # get the world-to-camera transform and set R, T
            w2c = np.linalg.inv(c2w)
            R = np.transpose(
                w2c[:3, :3]
            )  # R is stored transposed due to 'glm' in CUDA code
            T = w2c[:3, 3]

            image_path = os.path.join(path, cam_name)
            image_name = Path(cam_name).stem
            image = Image.open(image_path)

            im_data = np.array(image.convert("RGBA"))

            bg = np.array([1, 1, 1]) if white_background else np.array([0, 0, 0])

            # Alpha-composite over the chosen background.
            norm_data = im_data / 255.0
            arr = norm_data[:, :, :3] * norm_data[:, :, 3:4] + bg * (
                1 - norm_data[:, :, 3:4]
            )
            # FIX: use np.uint8 (unsigned) rather than np.byte (signed int8).
            # Values in [0, 255] overflow int8; the raw bytes happen to be
            # identical, but uint8 is the dtype PIL expects for "RGB".
            image = Image.fromarray(np.array(arr * 255.0, dtype=np.uint8), "RGB")

            fovy = focal2fov(fov2focal(fovx, image.size[0]), image.size[1])
            FovY = fovy
            FovX = fovx

            cam_infos.append(
                CameraInfo(
                    uid=idx,
                    R=R,
                    T=T,
                    FovY=FovY,
                    FovX=FovX,
                    image=image,
                    image_path=image_path,
                    image_name=image_name,
                    width=image.size[0],
                    height=image.size[1],
                )
            )

    return cam_infos


def readNerfSyntheticInfo(path, white_background, eval, extension=".png"):
    """Load a NeRF-synthetic (Blender) scene into a SceneInfo.

    Since these datasets ship no COLMAP points, a random point cloud inside
    the canonical Blender bounds is generated (and cached as points3d.ply)
    the first time the scene is opened.
    """
    print("Reading Training Transforms")
    train_cam_infos = readCamerasFromTransforms(
        path, "transforms_train.json", white_background, extension
    )
    print("Reading Test Transforms")
    test_cam_infos = readCamerasFromTransforms(
        path, "transforms_test.json", white_background, extension
    )

    if not eval:
        train_cam_infos.extend(test_cam_infos)
        test_cam_infos = []

    nerf_normalization = getNerfppNorm(train_cam_infos)

    ply_path = os.path.join(path, "points3d.ply")
    if not os.path.exists(ply_path):
        # Since this data set has no colmap data, we start with random points
        num_pts = 100_000
        print(f"Generating random point cloud ({num_pts})...")

        # We create random points inside the bounds of the synthetic Blender scenes
        xyz = np.random.random((num_pts, 3)) * 2.6 - 1.3
        shs = np.random.random((num_pts, 3)) / 255.0
        pcd = BasicPointCloud(
            points=xyz, colors=SH2RGB(shs), normals=np.zeros((num_pts, 3))
        )

        storePly(ply_path, xyz, SH2RGB(shs) * 255)
    try:
        pcd = fetchPly(ply_path)
    except Exception:
        pcd = None

    scene_info = SceneInfo(
        point_cloud=pcd,
        train_cameras=train_cam_infos,
        test_cameras=test_cam_infos,
        nerf_normalization=nerf_normalization,
        ply_path=ply_path,
    )
    return scene_info


sceneLoadTypeCallbacks = {
    "Colmap": readColmapSceneInfo,
    "Blender": readNerfSyntheticInfo,
}


# below used for easy rendering
class NoImageCamera(nn.Module):
    """A camera without a ground-truth image, for rendering-only pipelines.

    Holds the same view/projection transforms as Camera, plus a 2x2
    cam-plane-to-pixel matrix for projecting points manually.
    """

    def __init__(
        self,
        colmap_id,
        R,
        T,
        FoVx,
        FoVy,
        width,
        height,
        uid,
        trans=np.array([0.0, 0.0, 0.0]),
        scale=1.0,
        data_device="cuda",
        img_path=None,  # not needed
    ):
        super(NoImageCamera, self).__init__()

        self.uid = uid
        self.colmap_id = colmap_id
        self.R = R
        self.T = T
        self.FoVx = FoVx
        self.FoVy = FoVy
        self.img_path = img_path

        try:
            self.data_device = torch.device(data_device)
        except Exception as e:
            print(e)
            print(
                f"[Warning] Custom device {data_device} failed, fallback to default cuda device"
            )
            self.data_device = torch.device("cuda")

        self.image_width = width
        self.image_height = height

        self.zfar = 100.0
        self.znear = 0.01
        self.trans = trans
        self.scale = scale

        # world to camera, then transpose. # [4, 4] # w2c.transpose
        self.world_view_transform = (
            torch.tensor(getWorld2View2(R, T, trans, scale)).transpose(0, 1).cuda()
        )
        # [4, 4]
        self.projection_matrix = (
            getProjectionMatrix(
                znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy
            )
            .transpose(0, 1)
            .cuda()
        )
        # # [4, 4]. points @ full_proj_transform => screen space.
        self.full_proj_transform = (
            self.world_view_transform.unsqueeze(0).bmm(
                self.projection_matrix.unsqueeze(0)
            )
        ).squeeze(0)
        self.camera_center = self.world_view_transform.inverse()[3, :3]

        # [2, 2].
        # (w2c @ p) / depth => cam_plane
        # (p_in_cam / depth)[:2] @ cam_plane_2_img => [pixel_x, pixel_y] cam_plane => img_plane
        self.cam_plane_2_img = torch.tensor(
            [
                [0.5 * width / math.tan(self.FoVx / 2.0), 0.0],
                [0.0, 0.5 * height / math.tan(self.FoVy / 2.0)],
            ]
        ).cuda()


def fast_read_cameras_from_transform_file(file_path, width=1080, height=720):
    """Build NoImageCamera objects from a transforms json, without loading images.

    Assumes all frames share the resolution given by `width`/`height`.
    """
    cam_infos = []
    dir_name = os.path.dirname(file_path)
    with open(file_path) as json_file:
        contents = json.load(json_file)
        # camera_angle_x is the horizontal field of view
        # frames.file_path is the image name
        # frame.transform_matrix is the camera-to-world transform
        fovx = contents["camera_angle_x"]

        frames = contents["frames"]
        for idx, frame in enumerate(frames):
            # NeRF 'transform_matrix' is a camera-to-world transform
            c2w = np.array(frame["transform_matrix"])
            # change from OpenGL/Blender camera axes (Y up, Z back) to COLMAP (Y down, Z forward)
            c2w[:3, 1:3] *= -1

            # get the world-to-camera transform and set R, T
            w2c = np.linalg.inv(c2w)
            R = np.transpose(
                w2c[:3, :3]
            )  # R is stored transposed due to 'glm' in CUDA code
            T = w2c[:3, 3]

            fovy = focal2fov(fov2focal(fovx, width), height)
            FovY = fovy
            FovX = fovx

            img_path = os.path.join(dir_name, frame["file_path"] + ".png")

            cam_ = NoImageCamera(
                colmap_id=idx,
                R=R,
                T=T,
                FoVx=FovX,
                FoVy=FovY,
                width=width,
                height=height,
                # FIX: was `uid=id`, which passed the Python builtin `id`
                # function as the camera uid; use the frame index instead.
                uid=idx,
                data_device="cuda",
                img_path=img_path,
            )
            cam_infos.append(cam_)

    return cam_infos
================================================ FILE: physdreamer/gaussian_3d/scene/gaussian_model.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. # # For inquiries contact george.drettakis@inria.fr # import torch import numpy as np from physdreamer.gaussian_3d.utils.general_utils import ( inverse_sigmoid, get_expon_lr_func, build_rotation, ) from torch import nn import os from physdreamer.gaussian_3d.utils.system_utils import mkdir_p from plyfile import PlyData, PlyElement from physdreamer.gaussian_3d.utils.sh_utils import RGB2SH from simple_knn._C import distCUDA2 from physdreamer.gaussian_3d.utils.graphics_utils import BasicPointCloud from physdreamer.gaussian_3d.utils.general_utils import ( strip_symmetric, build_scaling_rotation, ) from physdreamer.gaussian_3d.utils.rigid_body_utils import ( get_rigid_transform, matrix_to_quaternion, quaternion_multiply, ) class GaussianModel: def setup_functions(self): def build_covariance_from_scaling_rotation(scaling, scaling_modifier, rotation): L = build_scaling_rotation(scaling_modifier * scaling, rotation) actual_covariance = L @ L.transpose(1, 2) symm = strip_symmetric(actual_covariance) return symm self.scaling_activation = torch.exp self.scaling_inverse_activation = torch.log self.covariance_activation = build_covariance_from_scaling_rotation self.opacity_activation = torch.sigmoid self.inverse_opacity_activation = inverse_sigmoid self.rotation_activation = torch.nn.functional.normalize def __init__(self, sh_degree: int = 3): self.active_sh_degree = 0 self.max_sh_degree = sh_degree self._xyz = torch.empty(0) self._features_dc = torch.empty(0) self._features_rest = torch.empty(0) self._scaling = torch.empty(0) self._rotation = torch.empty(0) self._opacity = torch.empty(0) self.max_radii2D 
= torch.empty(0) self.xyz_gradient_accum = torch.empty(0) self.denom = torch.empty(0) self.optimizer = None self.percent_dense = 0 self.spatial_lr_scale = 0 self.setup_functions() self.matched_inds = None def capture(self): if self.optimizer is None: optim_state = None else: optim_state = self.optimizer.state_dict() return ( self.active_sh_degree, self._xyz, self._features_dc, self._features_rest, self._scaling, self._rotation, self._opacity, self.max_radii2D, self.xyz_gradient_accum, self.denom, optim_state, self.spatial_lr_scale, ) def restore(self, model_args, training_args): ( self.active_sh_degree, self._xyz, self._features_dc, self._features_rest, self._scaling, self._rotation, self._opacity, self.max_radii2D, xyz_gradient_accum, denom, opt_dict, self.spatial_lr_scale, ) = model_args if training_args is not None: self.training_setup(training_args) self.xyz_gradient_accum = xyz_gradient_accum self.denom = denom if opt_dict is not None: self.optimizer.load_state_dict(opt_dict) def capture_training_args( self, ): pass @property def get_scaling(self): return self.scaling_activation(self._scaling) @property def get_rotation(self): return self.rotation_activation(self._rotation) @property def get_xyz(self): return self._xyz @property def get_features(self): features_dc = self._features_dc features_rest = self._features_rest return torch.cat((features_dc, features_rest), dim=1) @property def get_opacity(self): return self.opacity_activation(self._opacity) def get_covariance(self, scaling_modifier=1): return self.covariance_activation( self.get_scaling, scaling_modifier, self._rotation ) def oneupSHdegree(self): if self.active_sh_degree < self.max_sh_degree: self.active_sh_degree += 1 def create_from_pcd(self, pcd: BasicPointCloud, spatial_lr_scale: float): self.spatial_lr_scale = spatial_lr_scale fused_point_cloud = torch.tensor(np.asarray(pcd.points)).float().cuda() fused_color = RGB2SH(torch.tensor(np.asarray(pcd.colors)).float().cuda()) features = ( 
torch.zeros((fused_color.shape[0], 3, (self.max_sh_degree + 1) ** 2)) .float() .cuda() ) features[:, :3, 0] = fused_color # typo here? features[:, 3:, 1:] = 0.0 print("Number of points at initialisation : ", fused_point_cloud.shape[0]) dist2 = torch.clamp_min( distCUDA2(torch.from_numpy(np.asarray(pcd.points)).float().cuda()), 0.0000001, ) scales = torch.log(torch.sqrt(dist2))[..., None].repeat(1, 3) rots = torch.zeros((fused_point_cloud.shape[0], 4), device="cuda") rots[:, 0] = 1 opacities = inverse_sigmoid( 0.1 * torch.ones( (fused_point_cloud.shape[0], 1), dtype=torch.float, device="cuda" ) ) self._xyz = nn.Parameter(fused_point_cloud.requires_grad_(True)) self._features_dc = nn.Parameter( features[:, :, 0:1].transpose(1, 2).contiguous().requires_grad_(True) ) self._features_rest = nn.Parameter( features[:, :, 1:].transpose(1, 2).contiguous().requires_grad_(True) ) self._scaling = nn.Parameter(scales.requires_grad_(True)) self._rotation = nn.Parameter(rots.requires_grad_(True)) self._opacity = nn.Parameter(opacities.requires_grad_(True)) self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda") def training_setup(self, training_args): self.percent_dense = training_args.percent_dense self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") l = [ { "params": [self._xyz], "lr": training_args.position_lr_init * self.spatial_lr_scale, "name": "xyz", }, { "params": [self._features_dc], "lr": training_args.feature_lr, "name": "f_dc", }, { "params": [self._features_rest], "lr": training_args.feature_lr / 20.0, "name": "f_rest", }, { "params": [self._opacity], "lr": training_args.opacity_lr, "name": "opacity", }, { "params": [self._scaling], "lr": training_args.scaling_lr, "name": "scaling", }, { "params": [self._rotation], "lr": training_args.rotation_lr, "name": "rotation", }, ] self.optimizer = torch.optim.Adam(l, lr=0.0, eps=1e-15) self.xyz_scheduler_args = 
get_expon_lr_func( lr_init=training_args.position_lr_init * self.spatial_lr_scale, lr_final=training_args.position_lr_final * self.spatial_lr_scale, lr_delay_mult=training_args.position_lr_delay_mult, max_steps=training_args.position_lr_max_steps, ) def update_learning_rate(self, iteration): """Learning rate scheduling per step""" for param_group in self.optimizer.param_groups: if param_group["name"] == "xyz": lr = self.xyz_scheduler_args(iteration) param_group["lr"] = lr return lr def construct_list_of_attributes(self): l = ["x", "y", "z", "nx", "ny", "nz"] # All channels except the 3 DC for i in range(self._features_dc.shape[1] * self._features_dc.shape[2]): l.append("f_dc_{}".format(i)) for i in range(self._features_rest.shape[1] * self._features_rest.shape[2]): l.append("f_rest_{}".format(i)) l.append("opacity") for i in range(self._scaling.shape[1]): l.append("scale_{}".format(i)) for i in range(self._rotation.shape[1]): l.append("rot_{}".format(i)) return l def save_ply(self, path): mkdir_p(os.path.dirname(path)) xyz = self._xyz.detach().cpu().numpy() normals = np.zeros_like(xyz) f_dc = ( self._features_dc.detach() .transpose(1, 2) .flatten(start_dim=1) .contiguous() .cpu() .numpy() ) f_rest = ( self._features_rest.detach() .transpose(1, 2) .flatten(start_dim=1) .contiguous() .cpu() .numpy() ) opacities = self._opacity.detach().cpu().numpy() scale = self._scaling.detach().cpu().numpy() rotation = self._rotation.detach().cpu().numpy() dtype_full = [ (attribute, "f4") for attribute in self.construct_list_of_attributes() ] elements = np.empty(xyz.shape[0], dtype=dtype_full) attributes = np.concatenate( (xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1 ) elements[:] = list(map(tuple, attributes)) el = PlyElement.describe(elements, "vertex") PlyData([el]).write(path) def reset_opacity(self): opacities_new = inverse_sigmoid( torch.min(self.get_opacity, torch.ones_like(self.get_opacity) * 0.01) ) optimizable_tensors = 
self.replace_tensor_to_optimizer(opacities_new, "opacity") self._opacity = optimizable_tensors["opacity"] def load_ply(self, path): plydata = PlyData.read(path) xyz = np.stack( ( np.asarray(plydata.elements[0]["x"]), np.asarray(plydata.elements[0]["y"]), np.asarray(plydata.elements[0]["z"]), ), axis=1, ) opacities = np.asarray(plydata.elements[0]["opacity"])[..., np.newaxis] features_dc = np.zeros((xyz.shape[0], 3, 1)) features_dc[:, 0, 0] = np.asarray(plydata.elements[0]["f_dc_0"]) features_dc[:, 1, 0] = np.asarray(plydata.elements[0]["f_dc_1"]) features_dc[:, 2, 0] = np.asarray(plydata.elements[0]["f_dc_2"]) extra_f_names = [ p.name for p in plydata.elements[0].properties if p.name.startswith("f_rest_") ] extra_f_names = sorted(extra_f_names, key=lambda x: int(x.split("_")[-1])) assert len(extra_f_names) == 3 * (self.max_sh_degree + 1) ** 2 - 3 features_extra = np.zeros((xyz.shape[0], len(extra_f_names))) for idx, attr_name in enumerate(extra_f_names): features_extra[:, idx] = np.asarray(plydata.elements[0][attr_name]) # Reshape (P,F*SH_coeffs) to (P, F, SH_coeffs except DC) features_extra = features_extra.reshape( (features_extra.shape[0], 3, (self.max_sh_degree + 1) ** 2 - 1) ) scale_names = [ p.name for p in plydata.elements[0].properties if p.name.startswith("scale_") ] scale_names = sorted(scale_names, key=lambda x: int(x.split("_")[-1])) scales = np.zeros((xyz.shape[0], len(scale_names))) for idx, attr_name in enumerate(scale_names): scales[:, idx] = np.asarray(plydata.elements[0][attr_name]) rot_names = [ p.name for p in plydata.elements[0].properties if p.name.startswith("rot") ] rot_names = sorted(rot_names, key=lambda x: int(x.split("_")[-1])) rots = np.zeros((xyz.shape[0], len(rot_names))) for idx, attr_name in enumerate(rot_names): rots[:, idx] = np.asarray(plydata.elements[0][attr_name]) self._xyz = nn.Parameter( torch.tensor(xyz, dtype=torch.float, device="cuda").requires_grad_(True) ) self._features_dc = nn.Parameter( torch.tensor(features_dc, 
dtype=torch.float, device="cuda") .transpose(1, 2) .contiguous() .requires_grad_(True) ) self._features_rest = nn.Parameter( torch.tensor(features_extra, dtype=torch.float, device="cuda") .transpose(1, 2) .contiguous() .requires_grad_(True) ) self._opacity = nn.Parameter( torch.tensor(opacities, dtype=torch.float, device="cuda").requires_grad_( True ) ) self._scaling = nn.Parameter( torch.tensor(scales, dtype=torch.float, device="cuda").requires_grad_(True) ) self._rotation = nn.Parameter( torch.tensor(rots, dtype=torch.float, device="cuda").requires_grad_(True) ) self.active_sh_degree = self.max_sh_degree def replace_tensor_to_optimizer(self, tensor, name): optimizable_tensors = {} for group in self.optimizer.param_groups: if group["name"] == name: stored_state = self.optimizer.state.get(group["params"][0], None) stored_state["exp_avg"] = torch.zeros_like(tensor) stored_state["exp_avg_sq"] = torch.zeros_like(tensor) del self.optimizer.state[group["params"][0]] group["params"][0] = nn.Parameter(tensor.requires_grad_(True)) self.optimizer.state[group["params"][0]] = stored_state optimizable_tensors[group["name"]] = group["params"][0] return optimizable_tensors def _prune_optimizer(self, mask): optimizable_tensors = {} for group in self.optimizer.param_groups: stored_state = self.optimizer.state.get(group["params"][0], None) if stored_state is not None: stored_state["exp_avg"] = stored_state["exp_avg"][mask] stored_state["exp_avg_sq"] = stored_state["exp_avg_sq"][mask] del self.optimizer.state[group["params"][0]] group["params"][0] = nn.Parameter( (group["params"][0][mask].requires_grad_(True)) ) self.optimizer.state[group["params"][0]] = stored_state optimizable_tensors[group["name"]] = group["params"][0] else: group["params"][0] = nn.Parameter( group["params"][0][mask].requires_grad_(True) ) optimizable_tensors[group["name"]] = group["params"][0] return optimizable_tensors def prune_points(self, mask): valid_points_mask = ~mask optimizable_tensors = 
self._prune_optimizer(valid_points_mask) self._xyz = optimizable_tensors["xyz"] self._features_dc = optimizable_tensors["f_dc"] self._features_rest = optimizable_tensors["f_rest"] self._opacity = optimizable_tensors["opacity"] self._scaling = optimizable_tensors["scaling"] self._rotation = optimizable_tensors["rotation"] self.xyz_gradient_accum = self.xyz_gradient_accum[valid_points_mask] self.denom = self.denom[valid_points_mask] self.max_radii2D = self.max_radii2D[valid_points_mask] def cat_tensors_to_optimizer(self, tensors_dict): optimizable_tensors = {} for group in self.optimizer.param_groups: assert len(group["params"]) == 1 extension_tensor = tensors_dict[group["name"]] stored_state = self.optimizer.state.get(group["params"][0], None) if stored_state is not None: stored_state["exp_avg"] = torch.cat( (stored_state["exp_avg"], torch.zeros_like(extension_tensor)), dim=0 ) stored_state["exp_avg_sq"] = torch.cat( (stored_state["exp_avg_sq"], torch.zeros_like(extension_tensor)), dim=0, ) del self.optimizer.state[group["params"][0]] group["params"][0] = nn.Parameter( torch.cat( (group["params"][0], extension_tensor), dim=0 ).requires_grad_(True) ) self.optimizer.state[group["params"][0]] = stored_state optimizable_tensors[group["name"]] = group["params"][0] else: group["params"][0] = nn.Parameter( torch.cat( (group["params"][0], extension_tensor), dim=0 ).requires_grad_(True) ) optimizable_tensors[group["name"]] = group["params"][0] return optimizable_tensors def densification_postfix( self, new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation, ): d = { "xyz": new_xyz, "f_dc": new_features_dc, "f_rest": new_features_rest, "opacity": new_opacities, "scaling": new_scaling, "rotation": new_rotation, } optimizable_tensors = self.cat_tensors_to_optimizer(d) self._xyz = optimizable_tensors["xyz"] self._features_dc = optimizable_tensors["f_dc"] self._features_rest = optimizable_tensors["f_rest"] self._opacity = 
optimizable_tensors["opacity"] self._scaling = optimizable_tensors["scaling"] self._rotation = optimizable_tensors["rotation"] self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda") def densify_and_split(self, grads, grad_threshold, scene_extent, N=2): n_init_points = self.get_xyz.shape[0] # Extract points that satisfy the gradient condition padded_grad = torch.zeros((n_init_points), device="cuda") padded_grad[: grads.shape[0]] = grads.squeeze() selected_pts_mask = torch.where(padded_grad >= grad_threshold, True, False) selected_pts_mask = torch.logical_and( selected_pts_mask, torch.max(self.get_scaling, dim=1).values > self.percent_dense * scene_extent, ) stds = self.get_scaling[selected_pts_mask].repeat(N, 1) means = torch.zeros((stds.size(0), 3), device="cuda") samples = torch.normal(mean=means, std=stds) rots = build_rotation(self._rotation[selected_pts_mask]).repeat(N, 1, 1) new_xyz = torch.bmm(rots, samples.unsqueeze(-1)).squeeze(-1) + self.get_xyz[ selected_pts_mask ].repeat(N, 1) new_scaling = self.scaling_inverse_activation( self.get_scaling[selected_pts_mask].repeat(N, 1) / (0.8 * N) ) new_rotation = self._rotation[selected_pts_mask].repeat(N, 1) new_features_dc = self._features_dc[selected_pts_mask].repeat(N, 1, 1) new_features_rest = self._features_rest[selected_pts_mask].repeat(N, 1, 1) new_opacity = self._opacity[selected_pts_mask].repeat(N, 1) self.densification_postfix( new_xyz, new_features_dc, new_features_rest, new_opacity, new_scaling, new_rotation, ) prune_filter = torch.cat( ( selected_pts_mask, torch.zeros(N * selected_pts_mask.sum(), device="cuda", dtype=bool), ) ) self.prune_points(prune_filter) def densify_and_clone(self, grads, grad_threshold, scene_extent): # Extract points that satisfy the gradient condition selected_pts_mask = torch.where( torch.norm(grads, dim=-1) >= 
# (class continuation; the head of this line belongs to densify_and_clone)
    def densify_and_prune(self, max_grad, min_opacity, extent, max_screen_size):
        """One densification step: clone small / split large high-gradient
        Gaussians, then prune nearly-transparent or oversized ones."""
        grads = self.xyz_gradient_accum / self.denom
        grads[grads.isnan()] = 0.0

        self.densify_and_clone(grads, max_grad, extent)
        self.densify_and_split(grads, max_grad, extent)

        prune_mask = (self.get_opacity < min_opacity).squeeze()
        if max_screen_size:
            # Also drop points that grew too large on screen or in world space.
            big_points_vs = self.max_radii2D > max_screen_size
            big_points_ws = self.get_scaling.max(dim=1).values > 0.1 * extent
            prune_mask = torch.logical_or(
                torch.logical_or(prune_mask, big_points_vs), big_points_ws
            )
        self.prune_points(prune_mask)

        torch.cuda.empty_cache()

    def add_densification_stats(self, viewspace_point_tensor, update_filter):
        """Accumulate screen-space positional gradient norms for the points in
        `update_filter`; consumed later by densify_and_prune."""
        self.xyz_gradient_accum[update_filter] += torch.norm(
            viewspace_point_tensor.grad[update_filter, :2], dim=-1, keepdim=True
        )
        self.denom[update_filter] += 1

    def apply_discrete_offset_filds(self, origin_points, offsets):
        """
        Args:
            origin_points: (N_r, 3) driving points.
            offsets: (N_r, 3) per-driving-point displacements.

        Returns:
            A new GaussianModel whose xyz are displaced by the offset of the
            nearest driving point.
        """
        # origin_points and self._xyz are generally not index-aligned, so each
        # Gaussian is matched to its nearest driving point by distance.

        # [N_r, num_points]
        dist = torch.cdist(origin_points, self._xyz)
        # idx[j] = index of the origin point closest to Gaussian j  ([num_points]).
        _, idx = torch.min(dist, dim=0)

        # apply offsets
        new_xyz = self._xyz + offsets[idx]

        if self.optimizer is None:
            optim_state = None
        else:
            optim_state = self.optimizer.state_dict()
        new_model_args = (
            self.active_sh_degree,
            new_xyz,
            self._features_dc,
            self._features_rest,
            self._scaling,
            self._rotation,
            self._opacity,
            self.max_radii2D,
            self.xyz_gradient_accum,
            self.denom,
            optim_state,
            self.spatial_lr_scale,
        )
        ret_gaussian = GaussianModel(self.max_sh_degree)
        ret_gaussian.restore(new_model_args, None)
        return ret_gaussian
self.optimizer is None: optim_state = None else: optim_state = self.optimizer.state_dict() new_model_args = ( self.active_sh_degree, new_xyz, self._features_dc, self._features_rest, self._scaling, self._rotation, self._opacity, self.max_radii2D, self.xyz_gradient_accum, self.denom, optim_state, self.spatial_lr_scale, ) ret_gaussian = GaussianModel(self.max_sh_degree) ret_gaussian.restore(new_model_args, None) return ret_gaussian def apply_discrete_offset_filds_with_R(self, origin_points, offsets, topk=6): """ Args: origin_points: (N_r, 3) offsets: (N_r, 3) """ # since origin points and self._xyz might not be matched, we need to first # compute the distance between origin points and self._xyz # then find the nearest point in self._xyz for each origin point if self.matched_inds is None: # compute the distance between origin points and self._xyz # [N_r, num_points] dist = torch.cdist(origin_points, self._xyz) * -1.0 # find the nearest point in self._xyz for each origin point # idxs: [topk, num_points] print(dist.shape, topk, dist[0]) _, idxs = torch.topk(dist, topk, dim=0) self.matched_inds = idxs else: idxs = self.matched_inds # [topk, num_points, 3] => [num_points, topk, 3] matched_topk_offsets = offsets[idxs].transpose(0, 1) source_points = origin_points[idxs].transpose(0, 1) # [num_points, 3, 3/1] R, t = get_rigid_transform(source_points, source_points + matched_topk_offsets) # new_xyz = R @ self._xyz.unsqueeze(dim=-1) + t # new_xyz = new_xyz.squeeze(dim=-1) avg_offsets = matched_topk_offsets.mean(dim=1) new_xyz = self._xyz + avg_offsets # offset directly new_rotation = quaternion_multiply(matrix_to_quaternion(R), self._rotation) if self.optimizer is None: optim_state = None else: optim_state = self.optimizer.state_dict() new_model_args = ( self.active_sh_degree, new_xyz, self._features_dc, self._features_rest, self._scaling, new_rotation, self._opacity, self.max_radii2D, self.xyz_gradient_accum, self.denom, optim_state, self.spatial_lr_scale, ) ret_gaussian = 
# (class continuation; the head of this line belongs to apply_discrete_offset_filds_with_R)
    def apply_se3_fields(
        self,
        se3_model,
        timestamp: float,
        freeze_mask=None,
    ):
        """
        Args:
            se3_model: SE3Model; maps (x, y, z, t) to per-point (R, t).
            timestamp: float. in range [0, 1]
            freeze_mask: [N] bool; points marked True keep their pose.

        Returns:
            A new GaussianModel with translated xyz and rotated quaternions.
        """
        inp_time = torch.ones_like(self._xyz[:, 0:1]) * timestamp
        inp = torch.cat([self._xyz, inp_time], dim=-1)

        if freeze_mask is not None:
            # Only query the field for the moving subset of points.
            moving_mask = torch.logical_not(freeze_mask)
            inp = inp[moving_mask, ...]

        # [bs, 3, 3]. [bs, 3]
        R, t = se3_model(inp)
        # print("abs t mean", torch.abs(t).mean(dim=0))

        # NOTE: positions take the translation only; R updates orientation.
        # new_xyz = (R @ self._xyz.unsqueeze(dim=-1)).squeeze(dim=-1) + t
        if freeze_mask is None:
            new_xyz = self._xyz + t
            new_rotation = quaternion_multiply(matrix_to_quaternion(R), self._rotation)
        else:
            new_xyz = self._xyz.clone()
            new_xyz[moving_mask, ...] += t

            new_rotation = self._rotation.clone()
            new_rotation[moving_mask, ...] = quaternion_multiply(
                matrix_to_quaternion(R), self._rotation[moving_mask, ...]
            )

        if self.optimizer is None:
            optim_state = None
        else:
            optim_state = self.optimizer.state_dict()
        new_model_args = (
            self.active_sh_degree,
            new_xyz,
            self._features_dc,
            self._features_rest,
            self._scaling,
            new_rotation,
            self._opacity,
            self.max_radii2D,
            self.xyz_gradient_accum,
            self.denom,
            optim_state,
            self.spatial_lr_scale,
        )
        ret_gaussian = GaussianModel(self.max_sh_degree)
        ret_gaussian.restore(new_model_args, None)
        return ret_gaussian

    def apply_offset_fields(self, offset_field, timestamp: float):
        """
        Args:
            offset_field: maps (x, y, z, t) to a per-point displacement.
            timestamp: float. in range [0, 1]

        Returns:
            A new GaussianModel with displaced xyz (rotation unchanged).
        """
        inp_time = torch.ones_like(self._xyz[:, 0:1]) * timestamp
        inp = torch.cat([self._xyz, inp_time], dim=-1)

        # [bs, 3]
        offsets = offset_field(inp)
        # print("abs t mean", torch.abs(t).mean(dim=0))

        new_xyz = self._xyz + offsets

        if self.optimizer is None:
            optim_state = None
        else:
            optim_state = self.optimizer.state_dict()
        new_model_args = (
            self.active_sh_degree,
            new_xyz,
            self._features_dc,
            self._features_rest,
            self._scaling,
            self._rotation,
            self._opacity,
            self.max_radii2D,
            self.xyz_gradient_accum,
            self.denom,
            optim_state,
            self.spatial_lr_scale,
        )
        ret_gaussian = GaussianModel(self.max_sh_degree)
        ret_gaussian.restore(new_model_args, None)
        return ret_gaussian
# (class continuation; the head of this line belongs to apply_offset_fields)
    def apply_offset_fields_with_R(self, offset_field, timestamp: float, eps=1e-2):
        """
        Args:
            offset_field: maps (x, y, z, t) to a displacement.
            timestamp: float. in range [0, 1]
            eps: half-extent of the probe stencil around each point.

        Returns:
            A new GaussianModel with displaced xyz and a rotation estimated
            from the local deformation of a 5-point probe stencil.
        """
        # [5, 3] probe pattern: the point itself + 4 tetrahedral corners.
        inp_perterb = (
            torch.tensor(
                [
                    [0.0, 0.0, 0.0],  # add this will coplanar?
                    [+eps, -eps, -eps],
                    [-eps, -eps, +eps],
                    [-eps, +eps, -eps],
                    [+eps, +eps, +eps],
                ],
            )
            .to(self._xyz.device)
            .float()
        )
        # => [N, 5, 3] probe positions per Gaussian
        source_points = self._xyz.unsqueeze(dim=1) + inp_perterb.unsqueeze(dim=0)
        num_points = source_points.shape[0]
        inpx = source_points.flatten(end_dim=1)
        inp_time = torch.ones_like(inpx[:, 0:1]) * timestamp
        inp = torch.cat([inpx, inp_time], dim=-1)

        sampled_offsets = offset_field(inp).reshape((num_points, -1, 3))

        # Best-fit rigid transform of each probe stencil under the field.
        R, t = get_rigid_transform(source_points, source_points + sampled_offsets)

        # new_xyz = R @ self._xyz.unsqueeze(dim=-1) + t
        # new_xyz = new_xyz.squeeze(dim=-1)
        avg_offsets = sampled_offsets.mean(dim=1)
        new_xyz = self._xyz + avg_offsets  # offset directly
        new_rotation = quaternion_multiply(matrix_to_quaternion(R), self._rotation)

        if self.optimizer is None:
            optim_state = None
        else:
            optim_state = self.optimizer.state_dict()
        new_model_args = (
            self.active_sh_degree,
            new_xyz,
            self._features_dc,
            self._features_rest,
            self._scaling,
            new_rotation,
            self._opacity,
            self.max_radii2D,
            self.xyz_gradient_accum,
            self.denom,
            optim_state,
            self.spatial_lr_scale,
        )
        ret_gaussian = GaussianModel(self.max_sh_degree)
        ret_gaussian.restore(new_model_args, None)
        return ret_gaussian

    def init_from_mesh(
        self,
        mesh_path: str,
        num_gaussians: int = 10000,
    ):
        """Initialise the Gaussian parameters by sampling `num_gaussians`
        colored points uniformly on a triangle mesh."""
        import point_cloud_utils as pcu

        mesh = pcu.load_triangle_mesh(mesh_path)
        v, f = mesh.v, mesh.f
        v_n = pcu.estimate_mesh_normals(v, f)
        vert_colors = mesh.vertex_data.colors

        fid, bc = pcu.sample_mesh_random(v, f, num_gaussians)

        # Interpolate the vertex positions and normals using the returned barycentric coordinates
        # to get sample positions and normals
        rand_positions = pcu.interpolate_barycentric_coords(f, fid, bc, v)
        rand_normals = pcu.interpolate_barycentric_coords(f, fid, bc, v_n)
        rand_colors = pcu.interpolate_barycentric_coords(f, fid, bc, vert_colors)[:, :3]

        # copy original pointcloud init functions
        fused_point_cloud = torch.tensor(np.asarray(rand_positions)).float().cuda()
        fused_color = RGB2SH(torch.tensor(np.asarray(rand_colors)).float().cuda())
        features = (
            torch.zeros((fused_color.shape[0], 3, (self.max_sh_degree + 1) ** 2))
            .float()
            .cuda()
        )
        features[:, :3, 0] = fused_color  # typo here?
        # NOTE(review): dim 1 has size 3, so the slice below selects nothing.
        features[:, 3:, 1:] = 0.0

        print("Number of points at initialisation : ", fused_point_cloud.shape[0])

        # Initial scale from (clamped) mean squared distance to neighbours.
        dist2 = torch.clamp_min(
            distCUDA2(torch.from_numpy(np.asarray(rand_positions)).float().cuda()),
            0.0000001,
        )
        scales = torch.log(torch.sqrt(dist2))[..., None].repeat(1, 3)
        rots = torch.zeros((fused_point_cloud.shape[0], 4), device="cuda")
        rots[:, 0] = 1  # identity quaternion (w first)

        opacities = inverse_sigmoid(
            0.1
            * torch.ones(
                (fused_point_cloud.shape[0], 1), dtype=torch.float, device="cuda"
            )
        )

        self._xyz = nn.Parameter(fused_point_cloud.requires_grad_(True))
        self._features_dc = nn.Parameter(
            features[:, :, 0:1].transpose(1, 2).contiguous().requires_grad_(True)
        )
        self._features_rest = nn.Parameter(
            features[:, :, 1:].transpose(1, 2).contiguous().requires_grad_(True)
        )
        self._scaling = nn.Parameter(scales.requires_grad_(True))
        self._rotation = nn.Parameter(rots.requires_grad_(True))
        self._opacity = nn.Parameter(opacities.requires_grad_(True))
        self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")

    def detach_grad(
        self,
    ):
        """Freeze all Gaussian parameters (no gradients flow into them)."""
        self._xyz.requires_grad = False
        self._features_dc.requires_grad = False
        self._features_rest.requires_grad = False
        self._scaling.requires_grad = False
        self._rotation.requires_grad = False
        self._opacity.requires_grad = False
): self._xyz.requires_grad = False self._features_dc.requires_grad = False self._features_rest.requires_grad = False self._scaling.requires_grad = False self._rotation.requires_grad = False self._opacity.requires_grad = False def apply_mask(self, mask): new_xyz = self._xyz[mask] if self.xyz_gradient_accum.shape == self._xyz.shape: new_xyz_gradient_accum = self.xyz_gradient_accum[mask] new_denom = self.denom[mask] else: new_xyz_gradient_accum = self.xyz_gradient_accum new_denom = self.denom new_model_args = ( self.active_sh_degree, new_xyz, self._features_dc[mask], self._features_rest[mask], self._scaling[mask], self._rotation[mask], self._opacity[mask], self.max_radii2D, new_xyz_gradient_accum, new_denom, None, self.spatial_lr_scale, ) ret_gaussian = GaussianModel(self.max_sh_degree) ret_gaussian.restore(new_model_args, None) return ret_gaussian @torch.no_grad() def extract_fields(self, resolution=128, num_blocks=16, relax_ratio=1.5): # resolution: resolution of field block_size = 2 / num_blocks assert resolution % block_size == 0 split_size = resolution // num_blocks opacities = self.get_opacity # pre-filter low opacity gaussians to save computation mask = (opacities > 0.005).squeeze(1) opacities = opacities[mask] xyzs = self.get_xyz[mask] stds = self.get_scaling[mask] # normalize to ~ [-1, 1] mn, mx = xyzs.amin(0), xyzs.amax(0) self.center = (mn + mx) / 2 self.scale = 1.0 / (mx - mn).amax().item() print("gaussian center, scale", self.center, self.scale) xyzs = (xyzs - self.center) * self.scale stds = stds * self.scale covs = self.covariance_activation(stds, 1, self._rotation[mask]) # tile device = opacities.device occ = torch.zeros([resolution] * 3, dtype=torch.float32, device=device) X = torch.linspace(-1, 1, resolution).split(split_size) Y = torch.linspace(-1, 1, resolution).split(split_size) Z = torch.linspace(-1, 1, resolution).split(split_size) # loop blocks (assume max size of gaussian is small than relax_ratio * block_size !!!) 
# (class continuation; the head of this line belongs to extract_fields)
    def extract_mesh(self, path, density_thresh=1, resolution=128, decimate_target=1e5):
        """Run marching cubes on the occupancy grid from extract_fields and
        return a cleaned / decimated Mesh.

        NOTE(review): `path` is only used to create its parent directory here;
        the mesh is returned, not written — confirm callers save it themselves.
        """
        os.makedirs(os.path.dirname(path), exist_ok=True)

        from physdreamer.gaussian_3d.scene.mesh import Mesh
        from physdreamer.gaussian_3d.scene.mesh_utils import decimate_mesh, clean_mesh

        occ = self.extract_fields(resolution).detach().cpu().numpy()
        print(occ.shape, occ.min(), occ.max(), occ.mean(), "occ stats")
        print(np.percentile(occ, [0, 1, 5, 10, 50, 90, 95, 99, 100]), "occ percentiles")

        import mcubes

        vertices, triangles = mcubes.marching_cubes(occ, density_thresh)
        # grid indices -> normalised [-1, 1] coordinates
        vertices = vertices / (resolution - 1.0) * 2 - 1

        # transform back to the original space
        vertices = vertices / self.scale + self.center.detach().cpu().numpy()

        vertices, triangles = clean_mesh(
            vertices, triangles, remesh=True, remesh_size=0.015
        )
        if decimate_target > 0 and triangles.shape[0] > decimate_target:
            vertices, triangles = decimate_mesh(vertices, triangles, decimate_target)

        v = torch.from_numpy(vertices.astype(np.float32)).contiguous().cuda()
        f = torch.from_numpy(triangles.astype(np.int32)).contiguous().cuda()

        print(
            f"[INFO] marching cubes result: {v.shape} ({v.min().item()}-{v.max().item()}), {f.shape}"
        )

        mesh = Mesh(v=v, f=f, device="cuda")

        return mesh
# (module-level helper; the head of this line belongs to extract_mesh above)
def gaussian_3d_coeff(xyzs, covs):
    """Evaluate unnormalised 3D Gaussian weights exp(-0.5 * d^T Sigma^-1 d).

    Args:
        xyzs: [N, 3] offsets from each Gaussian centre.
        covs: [N, 6] packed symmetric covariances (xx, xy, xz, yy, yz, zz).

    Returns:
        [N] weights in (0, 1]; degenerate covariances yield ~0.
    """
    x, y, z = xyzs.unbind(-1)
    a, b, c, d, e, f = covs.unbind(-1)

    # Closed-form inverse of the symmetric 3x3 covariance; the tiny epsilon
    # keeps the determinant away from zero.
    det = a * d * f + 2 * e * c * b - e**2 * a - c**2 * d - b**2 * f
    inv_det = 1 / (det + 1e-24)

    inv_a = (d * f - e**2) * inv_det
    inv_b = (e * c - b * f) * inv_det
    inv_c = (e * b - c * d) * inv_det
    inv_d = (a * f - c**2) * inv_det
    inv_e = (b * c - e * a) * inv_det
    inv_f = (a * d - b**2) * inv_det

    exponent = -0.5 * (x * x * inv_a + y * y * inv_d + z * z * inv_f)
    exponent = exponent - x * y * inv_b - x * z * inv_c - y * z * inv_e

    # A positive exponent can only arise from a degenerate / indefinite
    # covariance; force those weights to (effectively) zero.
    exponent = torch.where(
        exponent > 0, torch.full_like(exponent, -1e10), exponent
    )
    return torch.exp(exponent)
================================================
FILE: physdreamer/gaussian_3d/scene/mesh.py
================================================
import os
import cv2
import torch
import trimesh
import numpy as np


def dot(x, y):
    # Row-wise dot product, keeping the last dim: [..., C] -> [..., 1].
    return torch.sum(x * y, -1, keepdim=True)


def length(x, eps=1e-20):
    # Row-wise Euclidean norm, clamped away from zero for stability.
    return torch.sqrt(torch.clamp(dot(x, x), min=eps))


def safe_normalize(x, eps=1e-20):
    # Normalise rows to unit length without dividing by zero.
    return x / length(x, eps)


class Mesh:
    """Triangle-mesh container: vertices/faces with optional normals, uvs,
    a single albedo texture, and per-vertex colors."""

    def __init__(
        self,
        v=None,
        f=None,
        vn=None,
        fn=None,
        vt=None,
        ft=None,
        albedo=None,
        vc=None,  # vertex color
        device=None,
    ):
        self.device = device
        self.v = v    # [N, 3] vertex positions
        self.vn = vn  # vertex normals
        self.vt = vt  # vertex uv coordinates
        self.f = f    # [M, 3] faces
        self.fn = fn  # face indices into vn
        self.ft = ft  # face indices into vt
        # only support a single albedo
        self.albedo = albedo
        # support vertex color is no albedo
        self.vc = vc

        self.ori_center = 0
        self.ori_scale = 1

    @classmethod
    def load(
        cls,
        path=None,
        resize=True,
        renormal=True,
        retex=False,
        front_dir="+z",
        **kwargs,
    ):
        """Factory: build a Mesh from kwargs, an .obj, or any trimesh-supported
        format; optionally normalise size/normals/uvs and rotate `front_dir`
        to +z (axis letter plus optional 1/2/3 for extra 90-degree turns)."""
        # assume init with kwargs
        if path is None:
            mesh = cls(**kwargs)
        # obj supports face uv
        elif path.endswith(".obj"):
            mesh = cls.load_obj(path, **kwargs)
        # trimesh only supports vertex uv, but can load more formats
        else:
            mesh = cls.load_trimesh(path, **kwargs)

        print(f"[Mesh loading] v: {mesh.v.shape}, f: {mesh.f.shape}")
        # auto-normalize
        if resize:
            mesh.auto_size()
        # auto-fix normal
        if renormal or mesh.vn is None:
            mesh.auto_normal()
        print(f"[Mesh loading] vn: {mesh.vn.shape}, fn: {mesh.fn.shape}")
        # auto-fix texcoords
        if retex or (mesh.albedo is not None and mesh.vt is None):
            mesh.auto_uv(cache_path=path)
            print(f"[Mesh loading] vt: {mesh.vt.shape}, ft: {mesh.ft.shape}")

        # rotate front dir to +z
        if front_dir != "+z":
            # axis switch
            if "-z" in front_dir:
                T = torch.tensor(
                    [[1, 0, 0], [0, 1, 0], [0, 0, -1]],
                    device=mesh.device,
                    dtype=torch.float32,
                )
            elif "+x" in front_dir:
                T = torch.tensor(
                    [[0, 0, 1], [0, 1, 0], [1, 0, 0]],
                    device=mesh.device,
                    dtype=torch.float32,
                )
            elif "-x" in front_dir:
                T = torch.tensor(
                    [[0, 0, -1], [0, 1, 0], [1, 0, 0]],
                    device=mesh.device,
                    dtype=torch.float32,
                )
            elif "+y" in front_dir:
                T = torch.tensor(
                    [[1, 0, 0], [0, 0, 1], [0, 1, 0]],
                    device=mesh.device,
                    dtype=torch.float32,
                )
            elif "-y" in front_dir:
                T = torch.tensor(
                    [[1, 0, 0], [0, 0, -1], [0, 1, 0]],
                    device=mesh.device,
                    dtype=torch.float32,
                )
            else:
                T = torch.tensor(
                    [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
                    device=mesh.device,
                    dtype=torch.float32,
                )
            # rotation (how many 90 degrees)
            if "1" in front_dir:
                T @= torch.tensor(
                    [[0, -1, 0], [1, 0, 0], [0, 0, 1]],
                    device=mesh.device,
                    dtype=torch.float32,
                )
            elif "2" in front_dir:
                T @= torch.tensor(
                    [[1, 0, 0], [0, -1, 0], [0, 0, 1]],
                    device=mesh.device,
                    dtype=torch.float32,
                )
            elif "3" in front_dir:
                T @= torch.tensor(
                    [[0, 1, 0], [-1, 0, 0], [0, 0, 1]],
                    device=mesh.device,
                    dtype=torch.float32,
                )
            mesh.v @= T
            mesh.vn @= T

        return mesh
    # load from obj file
    @classmethod
    def load_obj(cls, path, albedo_path=None, device=None):
        """Parse a Wavefront .obj (plus optional .mtl / albedo texture) into a
        Mesh. Supports face uv, vertex color (6-column `v` lines), and falls
        back to a flat grey albedo when no texture is found."""
        assert os.path.splitext(path)[-1] == ".obj"

        mesh = cls()

        # device
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        mesh.device = device

        # load obj
        with open(path, "r") as f:
            lines = f.readlines()

        def parse_f_v(fv):
            # pass in a vertex term of a face, return {v, vt, vn} (-1 if not provided)
            # supported forms:
            # f v1 v2 v3
            # f v1/vt1 v2/vt2 v3/vt3
            # f v1/vt1/vn1 v2/vt2/vn2 v3/vt3/vn3
            # f v1//vn1 v2//vn2 v3//vn3
            xs = [int(x) - 1 if x != "" else -1 for x in fv.split("/")]
            xs.extend([-1] * (3 - len(xs)))
            return xs[0], xs[1], xs[2]

        # NOTE: we ignore usemtl, and assume the mesh ONLY uses one material (first in mtl)
        vertices, texcoords, normals = [], [], []
        faces, tfaces, nfaces = [], [], []
        mtl_path = None

        for line in lines:
            split_line = line.split()
            # empty line
            if len(split_line) == 0:
                continue
            prefix = split_line[0].lower()
            # mtllib
            if prefix == "mtllib":
                mtl_path = split_line[1]
            # usemtl
            elif prefix == "usemtl":
                pass  # ignored
            # v/vn/vt
            elif prefix == "v":
                vertices.append([float(v) for v in split_line[1:]])
            elif prefix == "vn":
                normals.append([float(v) for v in split_line[1:]])
            elif prefix == "vt":
                val = [float(v) for v in split_line[1:]]
                # flip v so uv origin matches image convention
                texcoords.append([val[0], 1.0 - val[1]])
            elif prefix == "f":
                vs = split_line[1:]
                nv = len(vs)
                v0, t0, n0 = parse_f_v(vs[0])
                for i in range(nv - 2):  # triangulate (assume vertices are ordered)
                    v1, t1, n1 = parse_f_v(vs[i + 1])
                    v2, t2, n2 = parse_f_v(vs[i + 2])
                    faces.append([v0, v1, v2])
                    tfaces.append([t0, t1, t2])
                    nfaces.append([n0, n1, n2])

        mesh.v = torch.tensor(vertices, dtype=torch.float32, device=device)
        mesh.vt = (
            torch.tensor(texcoords, dtype=torch.float32, device=device)
            if len(texcoords) > 0
            else None
        )
        mesh.vn = (
            torch.tensor(normals, dtype=torch.float32, device=device)
            if len(normals) > 0
            else None
        )
        mesh.f = torch.tensor(faces, dtype=torch.int32, device=device)
        mesh.ft = (
            torch.tensor(tfaces, dtype=torch.int32, device=device)
            if len(texcoords) > 0
            else None
        )
        mesh.fn = (
            torch.tensor(nfaces, dtype=torch.int32, device=device)
            if len(normals) > 0
            else None
        )

        # see if there is vertex color
        use_vertex_color = False
        if mesh.v.shape[1] == 6:
            use_vertex_color = True
            mesh.vc = mesh.v[:, 3:]
            mesh.v = mesh.v[:, :3]
            print(f"[load_obj] use vertex color: {mesh.vc.shape}")

        # try to load texture image
        if not use_vertex_color:
            # try to retrieve mtl file
            mtl_path_candidates = []
            if mtl_path is not None:
                mtl_path_candidates.append(mtl_path)
                mtl_path_candidates.append(
                    os.path.join(os.path.dirname(path), mtl_path)
                )
            mtl_path_candidates.append(path.replace(".obj", ".mtl"))

            mtl_path = None
            for candidate in mtl_path_candidates:
                if os.path.exists(candidate):
                    mtl_path = candidate
                    break

            # if albedo_path is not provided, try retrieve it from mtl
            if mtl_path is not None and albedo_path is None:
                with open(mtl_path, "r") as f:
                    lines = f.readlines()
                for line in lines:
                    split_line = line.split()
                    # empty line
                    if len(split_line) == 0:
                        continue
                    prefix = split_line[0]
                    # NOTE: simply use the first map_Kd as albedo!
                    if "map_Kd" in prefix:
                        albedo_path = os.path.join(os.path.dirname(path), split_line[1])
                        print(f"[load_obj] use texture from: {albedo_path}")
                        break

            # still not found albedo_path, or the path doesn't exist
            if albedo_path is None or not os.path.exists(albedo_path):
                # init an empty texture
                print(f"[load_obj] init empty albedo!")
                # albedo = np.random.rand(1024, 1024, 3).astype(np.float32)
                albedo = np.ones((1024, 1024, 3), dtype=np.float32) * np.array(
                    [0.5, 0.5, 0.5]
                )  # default color
            else:
                albedo = cv2.imread(albedo_path, cv2.IMREAD_UNCHANGED)
                albedo = cv2.cvtColor(albedo, cv2.COLOR_BGR2RGB)
                albedo = albedo.astype(np.float32) / 255
                print(f"[load_obj] load texture: {albedo.shape}")

            # import matplotlib.pyplot as plt
            # plt.imshow(albedo)
            # plt.show()

            mesh.albedo = torch.tensor(albedo, dtype=torch.float32, device=device)

        return mesh
if "map_Kd" in prefix: albedo_path = os.path.join(os.path.dirname(path), split_line[1]) print(f"[load_obj] use texture from: {albedo_path}") break # still not found albedo_path, or the path doesn't exist if albedo_path is None or not os.path.exists(albedo_path): # init an empty texture print(f"[load_obj] init empty albedo!") # albedo = np.random.rand(1024, 1024, 3).astype(np.float32) albedo = np.ones((1024, 1024, 3), dtype=np.float32) * np.array( [0.5, 0.5, 0.5] ) # default color else: albedo = cv2.imread(albedo_path, cv2.IMREAD_UNCHANGED) albedo = cv2.cvtColor(albedo, cv2.COLOR_BGR2RGB) albedo = albedo.astype(np.float32) / 255 print(f"[load_obj] load texture: {albedo.shape}") # import matplotlib.pyplot as plt # plt.imshow(albedo) # plt.show() mesh.albedo = torch.tensor(albedo, dtype=torch.float32, device=device) return mesh @classmethod def load_trimesh(cls, path, device=None): mesh = cls() # device if device is None: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") mesh.device = device # use trimesh to load ply/glb, assume only has one single RootMesh... 
_data = trimesh.load(path) if isinstance(_data, trimesh.Scene): if len(_data.geometry) == 1: _mesh = list(_data.geometry.values())[0] else: # manual concat, will lose texture _concat = [] for g in _data.geometry.values(): if isinstance(g, trimesh.Trimesh): _concat.append(g) _mesh = trimesh.util.concatenate(_concat) else: _mesh = _data if _mesh.visual.kind == "vertex": vertex_colors = _mesh.visual.vertex_colors vertex_colors = np.array(vertex_colors[..., :3]).astype(np.float32) / 255 mesh.vc = torch.tensor(vertex_colors, dtype=torch.float32, device=device) print(f"[load_trimesh] use vertex color: {mesh.vc.shape}") elif _mesh.visual.kind == "texture": _material = _mesh.visual.material if isinstance(_material, trimesh.visual.material.PBRMaterial): texture = np.array(_material.baseColorTexture).astype(np.float32) / 255 elif isinstance(_material, trimesh.visual.material.SimpleMaterial): texture = ( np.array(_material.to_pbr().baseColorTexture).astype(np.float32) / 255 ) else: raise NotImplementedError( f"material type {type(_material)} not supported!" ) mesh.albedo = torch.tensor(texture, dtype=torch.float32, device=device) print(f"[load_trimesh] load texture: {texture.shape}") else: texture = np.ones((1024, 1024, 3), dtype=np.float32) * np.array( [0.5, 0.5, 0.5] ) mesh.albedo = torch.tensor(texture, dtype=torch.float32, device=device) print(f"[load_trimesh] failed to load texture.") vertices = _mesh.vertices try: texcoords = _mesh.visual.uv texcoords[:, 1] = 1 - texcoords[:, 1] except Exception as e: texcoords = None try: normals = _mesh.vertex_normals except Exception as e: normals = None # trimesh only support vertex uv... 
    # aabb
    def aabb(self):
        """Axis-aligned bounding box: (min_xyz, max_xyz) of the vertices."""
        return torch.min(self.v, dim=0).values, torch.max(self.v, dim=0).values

    # unit size
    @torch.no_grad()
    def auto_size(self):
        """Recentre and rescale vertices so the longest side spans ~1.2 units;
        stores the original center/scale for undoing the transform."""
        vmin, vmax = self.aabb()
        self.ori_center = (vmax + vmin) / 2
        self.ori_scale = 1.2 / torch.max(vmax - vmin).item()
        self.v = (self.v - self.ori_center) * self.ori_scale

    def auto_normal(self):
        """Recompute vertex normals by splatting (area-weighted) face normals."""
        i0, i1, i2 = self.f[:, 0].long(), self.f[:, 1].long(), self.f[:, 2].long()
        v0, v1, v2 = self.v[i0, :], self.v[i1, :], self.v[i2, :]

        face_normals = torch.cross(v1 - v0, v2 - v0)

        # Splat face normals to vertices
        vn = torch.zeros_like(self.v)
        vn.scatter_add_(0, i0[:, None].repeat(1, 3), face_normals)
        vn.scatter_add_(0, i1[:, None].repeat(1, 3), face_normals)
        vn.scatter_add_(0, i2[:, None].repeat(1, 3), face_normals)

        # Normalize, replace zero (degenerated) normals with some default value
        vn = torch.where(
            dot(vn, vn) > 1e-20,
            vn,
            torch.tensor([0.0, 0.0, 1.0], dtype=torch.float32, device=vn.device),
        )
        vn = safe_normalize(vn)

        self.vn = vn
        self.fn = self.f

    def auto_uv(self, cache_path=None, vmap=True):
        """Unwrap uv coordinates with xatlas, caching the atlas next to the
        source mesh as `<name>_uv.npz`."""
        # try to load cache
        if cache_path is not None:
            cache_path = os.path.splitext(cache_path)[0] + "_uv.npz"
        if cache_path is not None and os.path.exists(cache_path):
            data = np.load(cache_path)
            vt_np, ft_np, vmapping = data["vt"], data["ft"], data["vmapping"]
        else:
            import xatlas

            v_np = self.v.detach().cpu().numpy()
            f_np = self.f.detach().int().cpu().numpy()
            atlas = xatlas.Atlas()
            atlas.add_mesh(v_np, f_np)
            chart_options = xatlas.ChartOptions()
            # chart_options.max_iterations = 4
            atlas.generate(chart_options=chart_options)
            vmapping, ft_np, vt_np = atlas[0]  # [N], [M, 3], [N, 2]

            # save to cache
            if cache_path is not None:
                np.savez(cache_path, vt=vt_np, ft=ft_np, vmapping=vmapping)

        vt = torch.from_numpy(vt_np.astype(np.float32)).to(self.device)
        ft = torch.from_numpy(ft_np.astype(np.int32)).to(self.device)
        self.vt = vt
        self.ft = ft

        if vmap:
            # remap v/f to vt/ft, so each v correspond to a unique vt. (necessary for gltf)
            vmapping = (
                torch.from_numpy(vmapping.astype(np.int64)).long().to(self.device)
            )
            self.align_v_to_vt(vmapping)
    def align_v_to_vt(self, vmapping=None):
        """Remap v/f (and vn/fn) so each vertex owns a unique uv (needed for gltf)."""
        # remap v/f and vn/vn to vt/ft.
        if vmapping is None:
            ft = self.ft.view(-1).long()
            f = self.f.view(-1).long()
            vmapping = torch.zeros(
                self.vt.shape[0], dtype=torch.long, device=self.device
            )
            vmapping[ft] = f  # scatter, randomly choose one if index is not unique

        self.v = self.v[vmapping]
        self.f = self.ft
        # assume fn == f
        if self.vn is not None:
            self.vn = self.vn[vmapping]
            self.fn = self.ft

    def to(self, device):
        """Move tensor attributes to `device` and return self.

        NOTE(review): `vc` (vertex color) is not in the list below, so it
        stays on its old device — confirm whether that is intended.
        """
        self.device = device
        for name in ["v", "f", "vn", "fn", "vt", "ft", "albedo"]:
            tensor = getattr(self, name)
            if tensor is not None:
                setattr(self, name, tensor.to(device))
        return self

    def write(self, path):
        """Dispatch on extension: .ply (geometry only), .obj, .glb/.gltf."""
        if path.endswith(".ply"):
            self.write_ply(path)
        elif path.endswith(".obj"):
            self.write_obj(path)
        elif path.endswith(".glb") or path.endswith(".gltf"):
            self.write_glb(path)
        else:
            raise NotImplementedError(f"format {path} not supported!")

    # write to ply file (only geom)
    def write_ply(self, path):
        v_np = self.v.detach().cpu().numpy()
        f_np = self.f.detach().cpu().numpy()

        _mesh = trimesh.Trimesh(vertices=v_np, faces=f_np)
        _mesh.export(path)

    # write to gltf/glb file (geom + texture)
    def write_glb(self, path):
        """Serialise positions + uvs + embedded PNG albedo into one glb buffer.
        Assumes f == fn == ft after align_v_to_vt."""
        assert (
            self.vn is not None and self.vt is not None
        )  # should be improved to support export without texture...

        # assert self.v.shape[0] == self.vn.shape[0] and self.v.shape[0] == self.vt.shape[0]
        if self.v.shape[0] != self.vt.shape[0]:
            self.align_v_to_vt()

        # assume f == fn == ft
        import pygltflib

        f_np = self.f.detach().cpu().numpy().astype(np.uint32)
        v_np = self.v.detach().cpu().numpy().astype(np.float32)
        # vn_np = self.vn.detach().cpu().numpy().astype(np.float32)
        vt_np = self.vt.detach().cpu().numpy().astype(np.float32)

        albedo = self.albedo.detach().cpu().numpy()
        albedo = (albedo * 255).astype(np.uint8)
        albedo = cv2.cvtColor(albedo, cv2.COLOR_RGB2BGR)

        f_np_blob = f_np.flatten().tobytes()
        v_np_blob = v_np.tobytes()
        # vn_np_blob = vn_np.tobytes()
        vt_np_blob = vt_np.tobytes()
        albedo_blob = cv2.imencode(".png", albedo)[1].tobytes()

        gltf = pygltflib.GLTF2(
            scene=0,
            scenes=[pygltflib.Scene(nodes=[0])],
            nodes=[pygltflib.Node(mesh=0)],
            meshes=[
                pygltflib.Mesh(
                    primitives=[
                        pygltflib.Primitive(
                            # indices to accessors (0 is triangles)
                            attributes=pygltflib.Attributes(
                                POSITION=1,
                                TEXCOORD_0=2,
                            ),
                            indices=0,
                            material=0,
                        )
                    ]
                )
            ],
            materials=[
                pygltflib.Material(
                    pbrMetallicRoughness=pygltflib.PbrMetallicRoughness(
                        baseColorTexture=pygltflib.TextureInfo(index=0, texCoord=0),
                        metallicFactor=0.0,
                        roughnessFactor=1.0,
                    ),
                    alphaCutoff=0,
                    doubleSided=True,
                )
            ],
            textures=[
                pygltflib.Texture(sampler=0, source=0),
            ],
            samplers=[
                pygltflib.Sampler(
                    magFilter=pygltflib.LINEAR,
                    minFilter=pygltflib.LINEAR_MIPMAP_LINEAR,
                    wrapS=pygltflib.REPEAT,
                    wrapT=pygltflib.REPEAT,
                ),
            ],
            images=[
                # use embedded (buffer) image
                pygltflib.Image(bufferView=3, mimeType="image/png"),
            ],
            buffers=[
                pygltflib.Buffer(
                    byteLength=len(f_np_blob)
                    + len(v_np_blob)
                    + len(vt_np_blob)
                    + len(albedo_blob)
                )
            ],
            # buffer view (based on dtype)
            bufferViews=[
                # triangles; as flatten (element) array
                pygltflib.BufferView(
                    buffer=0,
                    byteLength=len(f_np_blob),
                    target=pygltflib.ELEMENT_ARRAY_BUFFER,  # GL_ELEMENT_ARRAY_BUFFER (34963)
                ),
                # positions; as vec3 array
                pygltflib.BufferView(
                    buffer=0,
                    byteOffset=len(f_np_blob),
                    byteLength=len(v_np_blob),
                    byteStride=12,  # vec3
                    target=pygltflib.ARRAY_BUFFER,  # GL_ARRAY_BUFFER (34962)
                ),
                # texcoords; as vec2 array
                pygltflib.BufferView(
                    buffer=0,
                    byteOffset=len(f_np_blob) + len(v_np_blob),
                    byteLength=len(vt_np_blob),
                    byteStride=8,  # vec2
                    target=pygltflib.ARRAY_BUFFER,
                ),
                # texture; as none target
                pygltflib.BufferView(
                    buffer=0,
                    byteOffset=len(f_np_blob) + len(v_np_blob) + len(vt_np_blob),
                    byteLength=len(albedo_blob),
                ),
            ],
            accessors=[
                # 0 = triangles
                pygltflib.Accessor(
                    bufferView=0,
                    componentType=pygltflib.UNSIGNED_INT,  # GL_UNSIGNED_INT (5125)
                    count=f_np.size,
                    type=pygltflib.SCALAR,
                    max=[int(f_np.max())],
                    min=[int(f_np.min())],
                ),
                # 1 = positions
                pygltflib.Accessor(
                    bufferView=1,
                    componentType=pygltflib.FLOAT,  # GL_FLOAT (5126)
                    count=len(v_np),
                    type=pygltflib.VEC3,
                    max=v_np.max(axis=0).tolist(),
                    min=v_np.min(axis=0).tolist(),
                ),
                # 2 = texcoords
                pygltflib.Accessor(
                    bufferView=2,
                    componentType=pygltflib.FLOAT,
                    count=len(vt_np),
                    type=pygltflib.VEC2,
                    max=vt_np.max(axis=0).tolist(),
                    min=vt_np.min(axis=0).tolist(),
                ),
            ],
        )

        # set actual data
        gltf.set_binary_blob(f_np_blob + v_np_blob + vt_np_blob + albedo_blob)

        # glb = b"".join(gltf.save_to_bytes())
        gltf.save(path)
    # write to obj file (geom + texture)
    def write_obj(self, path):
        """Write an .obj plus a default .mtl and (if present) the albedo PNG.
        Face indices are written 1-based per the obj spec."""
        mtl_path = path.replace(".obj", ".mtl")
        albedo_path = path.replace(".obj", "_albedo.png")

        v_np = self.v.detach().cpu().numpy()
        vt_np = self.vt.detach().cpu().numpy() if self.vt is not None else None
        vn_np = self.vn.detach().cpu().numpy() if self.vn is not None else None
        f_np = self.f.detach().cpu().numpy()
        ft_np = self.ft.detach().cpu().numpy() if self.ft is not None else None
        fn_np = self.fn.detach().cpu().numpy() if self.fn is not None else None

        with open(path, "w") as fp:
            fp.write(f"mtllib {os.path.basename(mtl_path)} \n")

            for v in v_np:
                fp.write(f"v {v[0]} {v[1]} {v[2]} \n")

            if vt_np is not None:
                # flip v back to obj's bottom-left uv origin
                for v in vt_np:
                    fp.write(f"vt {v[0]} {1 - v[1]} \n")

            if vn_np is not None:
                for v in vn_np:
                    fp.write(f"vn {v[0]} {v[1]} {v[2]} \n")

            fp.write(f"usemtl defaultMat \n")
            for i in range(len(f_np)):
                fp.write(
                    f'f {f_np[i, 0] + 1}/{ft_np[i, 0] + 1 if ft_np is not None else ""}/{fn_np[i, 0] + 1 if fn_np is not None else ""} \
{f_np[i, 1] + 1}/{ft_np[i, 1] + 1 if ft_np is not None else ""}/{fn_np[i, 1] + 1 if fn_np is not None else ""} \
{f_np[i, 2] + 1}/{ft_np[i, 2] + 1 if ft_np is not None else ""}/{fn_np[i, 2] + 1 if fn_np is not None else ""} \n'
                )

        with open(mtl_path, "w") as fp:
            fp.write(f"newmtl defaultMat \n")
            fp.write(f"Ka 1 1 1 \n")
            fp.write(f"Kd 1 1 1 \n")
            fp.write(f"Ks 0 0 0 \n")
            fp.write(f"Tr 1 \n")
            fp.write(f"illum 1 \n")
            fp.write(f"Ns 0 \n")
            fp.write(f"map_Kd {os.path.basename(albedo_path)} \n")

        if not (False or self.albedo is None):
            albedo = self.albedo.detach().cpu().numpy()
            albedo = (albedo * 255).astype(np.uint8)
            cv2.imwrite(albedo_path, cv2.cvtColor(albedo, cv2.COLOR_RGB2BGR))
\n") fp.write(f"usemtl defaultMat \n") for i in range(len(f_np)): fp.write( f'f {f_np[i, 0] + 1}/{ft_np[i, 0] + 1 if ft_np is not None else ""}/{fn_np[i, 0] + 1 if fn_np is not None else ""} \ {f_np[i, 1] + 1}/{ft_np[i, 1] + 1 if ft_np is not None else ""}/{fn_np[i, 1] + 1 if fn_np is not None else ""} \ {f_np[i, 2] + 1}/{ft_np[i, 2] + 1 if ft_np is not None else ""}/{fn_np[i, 2] + 1 if fn_np is not None else ""} \n' ) with open(mtl_path, "w") as fp: fp.write(f"newmtl defaultMat \n") fp.write(f"Ka 1 1 1 \n") fp.write(f"Kd 1 1 1 \n") fp.write(f"Ks 0 0 0 \n") fp.write(f"Tr 1 \n") fp.write(f"illum 1 \n") fp.write(f"Ns 0 \n") fp.write(f"map_Kd {os.path.basename(albedo_path)} \n") if not (False or self.albedo is None): albedo = self.albedo.detach().cpu().numpy() albedo = (albedo * 255).astype(np.uint8) cv2.imwrite(albedo_path, cv2.cvtColor(albedo, cv2.COLOR_RGB2BGR)) ================================================ FILE: physdreamer/gaussian_3d/scene/mesh_utils.py ================================================ import numpy as np import pymeshlab as pml def poisson_mesh_reconstruction(points, normals=None): # points/normals: [N, 3] np.ndarray import open3d as o3d pcd = o3d.geometry.PointCloud() pcd.points = o3d.utility.Vector3dVector(points) # outlier removal pcd, ind = pcd.remove_statistical_outlier(nb_neighbors=20, std_ratio=10) # normals if normals is None: pcd.estimate_normals() else: pcd.normals = o3d.utility.Vector3dVector(normals[ind]) # visualize o3d.visualization.draw_geometries([pcd], point_show_normal=False) mesh, densities = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson( pcd, depth=9 ) vertices_to_remove = densities < np.quantile(densities, 0.1) mesh.remove_vertices_by_mask(vertices_to_remove) # visualize o3d.visualization.draw_geometries([mesh]) vertices = np.asarray(mesh.vertices) triangles = np.asarray(mesh.triangles) print( f"[INFO] poisson mesh reconstruction: {points.shape} --> {vertices.shape} / {triangles.shape}" ) return vertices, 
triangles


def decimate_mesh(
    verts, faces, target, backend="pymeshlab", remesh=False, optimalplacement=True
):
    """Decimate a triangle mesh down to roughly ``target`` faces.

    Args:
        verts: [V, 3] array of vertex positions.
        faces: [F, 3] array of triangle indices.
        target: desired face count after decimation.
        backend: "pyfqmr" or "pymeshlab".
        remesh: if True, run isotropic explicit remeshing afterwards (pymeshlab path only).
        optimalplacement: default is True, but for flat mesh must turn False
            to prevent spike artifect.

    Returns:
        (verts, faces) of the decimated mesh as numpy arrays.
    """
    _ori_vert_shape = verts.shape
    _ori_face_shape = faces.shape

    if backend == "pyfqmr":
        import pyfqmr

        solver = pyfqmr.Simplify()
        solver.setMesh(verts, faces)
        solver.simplify_mesh(target_count=target, preserve_border=False, verbose=False)
        verts, faces, normals = solver.getMesh()
    else:
        m = pml.Mesh(verts, faces)
        ms = pml.MeshSet()
        ms.add_mesh(m, "mesh")  # will copy!

        # filters
        # ms.meshing_decimation_clustering(threshold=pml.PercentageValue(1))
        ms.meshing_decimation_quadric_edge_collapse(
            targetfacenum=int(target), optimalplacement=optimalplacement
        )

        if remesh:
            # ms.apply_coord_taubin_smoothing()
            ms.meshing_isotropic_explicit_remeshing(
                iterations=3, targetlen=pml.PercentageValue(1)
            )

        # extract mesh
        m = ms.current_mesh()
        verts = m.vertex_matrix()
        faces = m.face_matrix()

    print(
        f"[INFO] mesh decimation: {_ori_vert_shape} --> {verts.shape}, {_ori_face_shape} --> {faces.shape}"
    )

    return verts, faces


def clean_mesh(
    verts,
    faces,
    v_pct=1,
    min_f=64,
    min_d=20,
    repair=True,
    remesh=True,
    remesh_size=0.01,
):
    """Clean a triangle mesh with a fixed sequence of pymeshlab filters.

    Args:
        verts: [N, 3] vertex positions.
        faces: [N, 3] triangle indices.
        v_pct: merge-close-vertices threshold as a percentage of the bbox diagonal (<=0 disables).
        min_f: drop connected components with fewer faces than this (<=0 disables).
        min_d: drop components whose diameter is below this percentage of the bbox diagonal (<=0 disables).
        repair: if True, repair non-manifold edges and vertices.
        remesh: if True, run isotropic explicit remeshing.
        remesh_size: absolute target edge length for remeshing.

    Returns:
        (verts, faces) of the cleaned mesh as numpy arrays.
    """
    _ori_vert_shape = verts.shape
    _ori_face_shape = faces.shape

    m = pml.Mesh(verts, faces)
    ms = pml.MeshSet()
    ms.add_mesh(m, "mesh")  # will copy!

    # filters
    ms.meshing_remove_unreferenced_vertices()  # verts not refed by any faces

    if v_pct > 0:
        ms.meshing_merge_close_vertices(
            threshold=pml.PercentageValue(v_pct)
        )  # 1/10000 of bounding box diagonal

    ms.meshing_remove_duplicate_faces()  # faces defined by the same verts
    ms.meshing_remove_null_faces()  # faces with area == 0

    if min_d > 0:
        ms.meshing_remove_connected_component_by_diameter(
            mincomponentdiag=pml.PercentageValue(min_d)
        )

    if min_f > 0:
        ms.meshing_remove_connected_component_by_face_number(mincomponentsize=min_f)

    if repair:
        # ms.meshing_remove_t_vertices(method=0, threshold=40, repeat=True)
        ms.meshing_repair_non_manifold_edges(method=0)
        ms.meshing_repair_non_manifold_vertices(vertdispratio=0)

    if remesh:
        # ms.apply_coord_taubin_smoothing()
        ms.meshing_isotropic_explicit_remeshing(
            iterations=3, targetlen=pml.PureValue(remesh_size)
        )

    # extract mesh
    m = ms.current_mesh()
    verts = m.vertex_matrix()
    faces = m.face_matrix()

    print(
        f"[INFO] mesh cleaning: {_ori_vert_shape} --> {verts.shape}, {_ori_face_shape} --> {faces.shape}"
    )

    return verts, faces


================================================
FILE: physdreamer/gaussian_3d/utils/camera_utils.py
================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact  george.drettakis@inria.fr
#

from physdreamer.gaussian_3d.scene.cameras import Camera
import numpy as np
from physdreamer.gaussian_3d.utils.general_utils import PILtoTorch
from physdreamer.gaussian_3d.utils.graphics_utils import fov2focal
import torch

# Module-level flag so the large-image warning below prints only once.
WARNED = False


def loadCam(args, id, cam_info, resolution_scale):
    """Build a Camera from a CameraInfo record, resizing its image first.

    Args:
        args: options namespace; reads ``args.resolution`` and ``args.data_device``.
        id: integer assigned as the resulting Camera's ``uid``.
        cam_info: record with ``image`` (PIL), ``R``, ``T``, ``FovX``, ``FovY``,
            ``uid`` and ``image_name`` fields.
        resolution_scale: extra downscale factor on top of ``args.resolution``.

    Returns:
        Camera holding the resized RGB image (and alpha mask if detected).
    """
    orig_w, orig_h = cam_info.image.size

    if args.resolution in [1, 2, 4, 8]:
        # Fixed integer downsampling factors.
        resolution = round(orig_w / (resolution_scale * args.resolution)), round(
            orig_h / (resolution_scale * args.resolution)
        )
    else:  # should be a type that converts to float
        if args.resolution == -1:
            # Auto mode: cap very wide images at 1600 px width.
            if orig_w > 1600:
                global WARNED
                if not WARNED:
                    print(
                        "[ INFO ] Encountered quite large input images (>1.6K pixels width), rescaling to 1.6K.\n "
                        "If this is not desired, please explicitly specify '--resolution/-r' as 1"
                    )
                    WARNED = True
                global_down = orig_w / 1600
            else:
                global_down = 1
        else:
            # Explicit target width in pixels.
            global_down = orig_w / args.resolution

        scale = float(global_down) * float(resolution_scale)
        resolution = (int(orig_w / scale), int(orig_h / scale))

    resized_image_rgb = PILtoTorch(cam_info.image, resolution)

    gt_image = resized_image_rgb[:3, ...]
    loaded_mask = None

    # NOTE(review): this tests shape[1] (height) rather than shape[0] (channels);
    # after PILtoTorch an RGBA image has 4 in dim 0. Inherited from upstream 3DGS —
    # confirm whether shape[0] was intended.
    if resized_image_rgb.shape[1] == 4:
        loaded_mask = resized_image_rgb[3:4, ...]

    return Camera(
        colmap_id=cam_info.uid,
        R=cam_info.R,
        T=cam_info.T,
        FoVx=cam_info.FovX,
        FoVy=cam_info.FovY,
        image=gt_image,
        gt_alpha_mask=loaded_mask,
        image_name=cam_info.image_name,
        uid=id,
        data_device=args.data_device,
    )


def cameraList_from_camInfos(cam_infos, resolution_scale, args):
    """Convert a list of CameraInfo records into Camera objects."""
    camera_list = []

    for id, c in enumerate(cam_infos):
        camera_list.append(loadCam(args, id, c, resolution_scale))

    return camera_list


def camera_to_JSON(id, camera: Camera):
    """Serialize a Camera into a JSON-friendly dict (pose + pixel intrinsics)."""
    Rt = np.zeros((4, 4))
    # camera.R is stored transposed (glm convention), so Rt is world-to-view.
    Rt[:3, :3] = camera.R.transpose()
    Rt[:3, 3] = camera.T
    Rt[3, 3] = 1.0

    # NOTE(review): despite the name, inv(world-to-view) is camera-to-world;
    # its translation column is the camera position used below.
    W2C = np.linalg.inv(Rt)
    pos = W2C[:3, 3]
    rot = W2C[:3, :3]
    serializable_array_2d = [x.tolist() for x in rot]
    camera_entry = {
        "id": id,
        "img_name": camera.image_name,
        "width": camera.width,
        "height": camera.height,
        "position": pos.tolist(),
        "rotation": serializable_array_2d,
        "fy": fov2focal(camera.FovY, camera.height),
        "fx": fov2focal(camera.FovX, camera.width),
    }
    return camera_entry


def look_at(from_point, to_point, up_vector=(0, 1, 0)):
    """
    Compute the look-at matrix for a camera.

    :param from_point: The position of the camera.
    :param to_point: The point the camera is looking at.
    :param up_vector: The up direction of the camera.
    :return: (R, T): 3x3 world-to-camera rotation and 3x1 translation.
    """
    # minus z for opengl. z for colmap
    forward = np.array(to_point) - np.array(from_point)
    forward = forward / (np.linalg.norm(forward) + 1e-5)

    # x-axis
    # Right direction is the cross product of the forward vector and the up vector
    right = np.cross(up_vector, forward)
    right = right / (np.linalg.norm(right) + 1e-5)

    # y axis
    # True up direction is the cross product of the right vector and the forward vector
    true_up = np.cross(forward, right)
    true_up = true_up / (np.linalg.norm(true_up) + 1e-5)

    # camera to world
    rotation = np.array(
        [
            [right[0], true_up[0], forward[0]],
            [right[1], true_up[1], forward[1]],
            [right[2], true_up[2], forward[2]],
        ]
    )

    # Construct the translation matrix
    translation = np.array(
        [
            [-from_point[0]],
            [-from_point[1]],
            [-from_point[2]],
        ]
    )

    # Combine the rotation and translation to get the look-at matrix
    # world-to-camera: R^T and T = R^T @ (-from_point)
    T = 1.0 * rotation.transpose() @ translation
    return rotation.transpose(), T


def create_cameras_around_sphere(
    radius=6,
    elevation=0,
    fovx=35,
    resolutions=(720, 1080),
    num_cams=60,
    center=(0, 0, 0),
):
    """
    Create cameras around a sphere.

    :param radius: The radius of the circle on which cameras are placed.
    :param elevation: The elevation angle in degrees.
    :param fovx: The horizontal field of view of the cameras.
    :param resolutions: The resolution of the cameras (presumably (H, W) — confirm with callers).
    :param num_cams: The number of cameras.
    :param center: The center of the sphere.
    :return: A list of Camera objects (world2camera transformations).
    """
    extrinsics = []

    # Convert elevation to radians
    elevation_rad = np.radians(elevation)

    # Height of the camera ring above the sphere center (z is "up" here).
    z = radius * np.sin(elevation_rad)

    # Compute the radius of the circle at the given elevation
    circle_radius = radius * np.cos(elevation_rad)

    for i in range(num_cams):
        # Compute the angle for the current camera
        angle = 2 * np.pi * i / num_cams

        # Position on the ring, offset by the sphere center.
        x = circle_radius * np.cos(angle) + center[0]
        y = circle_radius * np.sin(angle) + center[1]

        # Create the look-at matrix for the camera
        R, T = look_at((x, y, z + center[2]), center)
        extrinsics.append([R, T.squeeze(axis=-1)])

    cam_list = []
    dummy_image = torch.tensor(
        np.zeros((3, resolutions[0], resolutions[1]), dtype=np.uint8)
    )
    for i in range(num_cams):
        R, T = extrinsics[i]
        # R is stored transposed due to 'glm' in CUDA code
        R = R.transpose()
        cam = Camera(
            colmap_id=i,
            R=R,
            T=T,
            FoVx=fovx,
            # NOTE(review): vertical FoV obtained by linearly scaling the angle
            # with the aspect ratio — exact only for small angles; confirm
            # the (H, W) ordering of `resolutions`.
            FoVy=fovx * resolutions[1] / resolutions[0],
            image_name="",
            uid=i,
            data_device="cuda",
            image=dummy_image,
            gt_alpha_mask=None,
        )
        cam_list.append(cam)

    return cam_list


================================================
FILE: physdreamer/gaussian_3d/utils/general_utils.py
================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact  george.drettakis@inria.fr
#

import torch
import sys
from datetime import datetime
import numpy as np
import random


def inverse_sigmoid(x):
    """Inverse of the sigmoid: log(x / (1 - x)) for x in (0, 1)."""
    return torch.log(x/(1-x))


def PILtoTorch(pil_image, resolution):
    """Resize a PIL image to ``resolution`` and return a [C, H, W] float tensor in [0, 1]."""
    resized_image_PIL = pil_image.resize(resolution)
    resized_image = torch.from_numpy(np.array(resized_image_PIL)) / 255.0
    if len(resized_image.shape) == 3:
        return resized_image.permute(2, 0, 1)
    else:
        # Grayscale input: add a trailing channel axis first, then move it to front.
        return resized_image.unsqueeze(dim=-1).permute(2, 0, 1)


def get_expon_lr_func(
    lr_init, lr_final, lr_delay_steps=0, lr_delay_mult=1.0, max_steps=1000000
):
    """
    Copied from Plenoxels

    Continuous learning rate decay function. Adapted from JaxNeRF
    The returned rate is lr_init when step=0 and lr_final when step=max_steps, and
    is log-linearly interpolated elsewhere (equivalent to exponential decay).
    If lr_delay_steps>0 then the learning rate will be scaled by some smooth
    function of lr_delay_mult, such that the initial learning rate is
    lr_init*lr_delay_mult at the beginning of optimization but will be eased back
    to the normal learning rate when steps>lr_delay_steps.
    :param conf: config subtree 'lr' or similar
    :param max_steps: int, the number of steps during optimization.
    :return HoF which takes step as input
    """

    def helper(step):
        if step < 0 or (lr_init == 0.0 and lr_final == 0.0):
            # Disable this parameter
            return 0.0
        if lr_delay_steps > 0:
            # A kind of reverse cosine decay.
            delay_rate = lr_delay_mult + (1 - lr_delay_mult) * np.sin(
                0.5 * np.pi * np.clip(step / lr_delay_steps, 0, 1)
            )
        else:
            delay_rate = 1.0
        t = np.clip(step / max_steps, 0, 1)
        # Log-linear interpolation between lr_init and lr_final.
        log_lerp = np.exp(np.log(lr_init) * (1 - t) + np.log(lr_final) * t)
        return delay_rate * log_lerp

    return helper


def strip_lowerdiag(L):
    """Pack the upper triangle of [N, 3, 3] symmetric matrices into [N, 6] (output on cuda)."""
    uncertainty = torch.zeros((L.shape[0], 6), dtype=torch.float, device="cuda")

    uncertainty[:, 0] = L[:, 0, 0]
    uncertainty[:, 1] = L[:, 0, 1]
    uncertainty[:, 2] = L[:, 0, 2]
    uncertainty[:, 3] = L[:, 1, 1]
    uncertainty[:, 4] = L[:, 1, 2]
    uncertainty[:, 5] = L[:, 2, 2]
    return uncertainty


def strip_symmetric(sym):
    """Alias for strip_lowerdiag."""
    return strip_lowerdiag(sym)


def build_rotation(r):
    """Convert [N, 4] quaternions (w, x, y, z; need not be normalized) to [N, 3, 3] rotations.

    NOTE(review): output is hard-coded to device 'cuda' regardless of input device.
    """
    norm = torch.sqrt(r[:,0]*r[:,0] + r[:,1]*r[:,1] + r[:,2]*r[:,2] + r[:,3]*r[:,3])

    q = r / norm[:, None]

    R = torch.zeros((q.size(0), 3, 3), device='cuda')

    r = q[:, 0]
    x = q[:, 1]
    y = q[:, 2]
    z = q[:, 3]

    R[:, 0, 0] = 1 - 2 * (y*y + z*z)
    R[:, 0, 1] = 2 * (x*y - r*z)
    R[:, 0, 2] = 2 * (x*z + r*y)
    R[:, 1, 0] = 2 * (x*y + r*z)
    R[:, 1, 1] = 1 - 2 * (x*x + z*z)
    R[:, 1, 2] = 2 * (y*z - r*x)
    R[:, 2, 0] = 2 * (x*z - r*y)
    R[:, 2, 1] = 2 * (y*z + r*x)
    R[:, 2, 2] = 1 - 2 * (x*x + y*y)
    return R


def build_scaling_rotation(s, r):
    """Build L = R @ diag(s) from [N, 3] scales and [N, 4] quaternions (covariance factor)."""
    L = torch.zeros((s.shape[0], 3, 3), dtype=torch.float, device="cuda")
    R = build_rotation(r)

    L[:,0,0] = s[:,0]
    L[:,1,1] = s[:,1]
    L[:,2,2] = s[:,2]

    L = R @ L
    return L


def safe_state(silent):
    """Seed all RNGs, select cuda:0, and wrap stdout to timestamp (or mute) each line."""
    old_f = sys.stdout

    class F:
        def __init__(self, silent):
            self.silent = silent

        def write(self, x):
            if not self.silent:
                if x.endswith("\n"):
                    # Append a [dd/mm HH:MM:SS] timestamp to each completed line.
                    old_f.write(x.replace("\n", " [{}]\n".format(str(datetime.now().strftime("%d/%m %H:%M:%S")))))
                else:
                    old_f.write(x)

        def flush(self):
            old_f.flush()

    sys.stdout = F(silent)

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)
    # NOTE(review): assumes a CUDA device is present; crashes on CPU-only hosts.
    torch.cuda.set_device(torch.device("cuda:0"))


================================================
FILE: physdreamer/gaussian_3d/utils/graphics_utils.py
================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group,
https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. # # For inquiries contact george.drettakis@inria.fr # import torch import math import numpy as np from typing import NamedTuple class BasicPointCloud(NamedTuple): points : np.array colors : np.array normals : np.array def geom_transform_points(points, transf_matrix): P, _ = points.shape ones = torch.ones(P, 1, dtype=points.dtype, device=points.device) points_hom = torch.cat([points, ones], dim=1) points_out = torch.matmul(points_hom, transf_matrix.unsqueeze(0)) denom = points_out[..., 3:] + 0.0000001 return (points_out[..., :3] / denom).squeeze(dim=0) def getWorld2View(R, t): Rt = np.zeros((4, 4)) Rt[:3, :3] = R.transpose() Rt[:3, 3] = t Rt[3, 3] = 1.0 return np.float32(Rt) def getWorld2View2(R, t, translate=np.array([.0, .0, .0]), scale=1.0): Rt = np.zeros((4, 4)) Rt[:3, :3] = R.transpose() Rt[:3, 3] = t Rt[3, 3] = 1.0 C2W = np.linalg.inv(Rt) cam_center = C2W[:3, 3] cam_center = (cam_center + translate) * scale C2W[:3, 3] = cam_center Rt = np.linalg.inv(C2W) return np.float32(Rt) def getProjectionMatrix(znear, zfar, fovX, fovY): tanHalfFovY = math.tan((fovY / 2)) tanHalfFovX = math.tan((fovX / 2)) top = tanHalfFovY * znear bottom = -top right = tanHalfFovX * znear left = -right P = torch.zeros(4, 4) z_sign = 1.0 P[0, 0] = 2.0 * znear / (right - left) P[1, 1] = 2.0 * znear / (top - bottom) P[0, 2] = (right + left) / (right - left) P[1, 2] = (top + bottom) / (top - bottom) P[3, 2] = z_sign P[2, 2] = z_sign * zfar / (zfar - znear) P[2, 3] = -(zfar * znear) / (zfar - znear) return P def fov2focal(fov, pixels): return pixels / (2 * math.tan(fov / 2)) def focal2fov(focal, pixels): return 2*math.atan(pixels/(2*focal)) ================================================ FILE: physdreamer/gaussian_3d/utils/image_utils.py ================================================ # # Copyright (C) 2023, Inria # 
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact  george.drettakis@inria.fr
#

import torch


def mse(img1, img2):
    """Per-image mean squared error over a [B, ...] batch; returns [B, 1]."""
    return (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True)


def psnr(img1, img2):
    """Per-image PSNR in dB, assuming pixel values in [0, 1]; returns [B, 1]."""
    mse = (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True)
    return 20 * torch.log10(1.0 / torch.sqrt(mse))


================================================
FILE: physdreamer/gaussian_3d/utils/loss_utils.py
================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact  george.drettakis@inria.fr
#

import torch
import torch.nn.functional as F
from torch.autograd import Variable
from math import exp


def l1_loss(network_output, gt):
    """Mean absolute error."""
    return torch.abs((network_output - gt)).mean()


def l2_loss(network_output, gt):
    """Mean squared error."""
    return ((network_output - gt) ** 2).mean()


def gaussian(window_size, sigma):
    """1D Gaussian kernel of length ``window_size``, normalized to sum to 1."""
    gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)])
    return gauss / gauss.sum()


def create_window(window_size, channel):
    """Separable 2D Gaussian window expanded to one filter per channel (for grouped conv)."""
    _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
    _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
    window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
    return window


def ssim(img1, img2, window_size=11, size_average=True):
    """Structural similarity between two image batches [B, C, H, W] in [0, 1]."""
    channel = img1.size(-3)
    window = create_window(window_size, channel)

    if img1.is_cuda:
        window = window.cuda(img1.get_device())
    window = window.type_as(img1)

    return _ssim(img1, img2, window, window_size, channel, size_average)


def _ssim(img1, img2, window, window_size, channel, size_average=True):
    # Local means via grouped Gaussian convolution.
    mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
    mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    # Local (co)variances: E[x^2] - E[x]^2 etc.
    sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
    sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
    sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2

    # Standard SSIM stability constants for unit dynamic range.
    C1 = 0.01 ** 2
    C2 = 0.03 ** 2

    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

    if size_average:
        return ssim_map.mean()
    else:
        return ssim_map.mean(1).mean(1).mean(1)


================================================
FILE: physdreamer/gaussian_3d/utils/rigid_body_utils.py
================================================
import torch
import torch.nn.functional as F


def get_rigid_transform(A, B):
    """
    Estimate the rigid body transformation between two sets of 3D points.
    A and B are Nx3 matrices where each row is a 3D point.
    Returns a rotation matrix R and translation vector t.

    Args:
        A, B: [batch, N, 3] matrix of 3D points
    Outputs:
        R, t: [batch, 3, 3/1]
            target = R @ source (source shape [3, 1]) + t
    """
    assert A.shape == B.shape, "Input matrices must have the same shape"
    assert A.shape[-1] == 3, "Input matrices must have 3 columns (x, y, z coordinates)"

    # Compute centroids. [..., 1, 3]
    centroid_A = torch.mean(A, dim=-2, keepdim=True)
    centroid_B = torch.mean(B, dim=-2, keepdim=True)

    # Center the point sets
    A_centered = A - centroid_A
    B_centered = B - centroid_B

    # Compute the cross-covariance matrix. [..., 3, 3]
    H = A_centered.transpose(-2, -1) @ B_centered

    # Compute the Singular Value Decomposition. Along last two dimensions
    U, S, Vt = torch.linalg.svd(H)

    # Compute the rotation matrix
    R = Vt.transpose(-2, -1) @ U.transpose(-2, -1)

    # Ensure a right-handed coordinate system:
    # flip_mask is -1 where det(R) < 0, else +1.
    flip_mask = (torch.det(R) < 0) * -2.0 + 1.0
    # Vt[:, 2, :] *= flip_mask[..., None]
    # [N] => [N, 3]: flip only the last row of Vt where needed.
    pad_flip_mask = torch.stack(
        [torch.ones_like(flip_mask), torch.ones_like(flip_mask), flip_mask], dim=-1
    )
    Vt = Vt * pad_flip_mask[..., None]

    # Recompute the rotation matrix with the corrected Vt.
    R = Vt.transpose(-2, -1) @ U.transpose(-2, -1)
    # print(R.shape, centroid_A.shape, centroid_B.shape, flip_mask.shape)

    # Compute the translation
    t = centroid_B - (R @ centroid_A.transpose(-2, -1)).transpose(-2, -1)
    t = t.transpose(-2, -1)

    return R, t


def _test_rigid_transform():
    # Example usage:
    A = torch.tensor([[1, 2, 3], [4, 5, 6], [9, 8, 10], [10, -5, 1]]) * 1.0

    R_synthesized = torch.tensor([[1, 0, 0], [0, -1, 0], [0, 0, -1]]) * 1.0
    # init a random rotation matrix:
    B = (R_synthesized @ A.T).T + 2.0  # Just an example offset

    R, t = get_rigid_transform(A[None, ...], B[None, ...])
    print("Rotation matrix R:")
    print(R)
    print("\nTranslation vector t:")
    print(t)


def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
    """
    Returns torch.sqrt(torch.max(0, x))
    but with a zero subgradient where x is 0.
    """
    ret = torch.zeros_like(x)
    positive_mask = x > 0
    ret[positive_mask] = torch.sqrt(x[positive_mask])
    return ret


def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
    """
    from pytorch3d. Based on trace_method like:
        https://github.com/KieranWynn/pyquaternion/blob/master/pyquaternion/quaternion.py#L205
    Convert rotations given as rotation matrices to quaternions.

    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).

    Returns:
        quaternions with real part first, as tensor of shape (..., 4).
    """
    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")

    batch_dim = matrix.shape[:-2]
    m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
        matrix.reshape(batch_dim + (9,)), dim=-1
    )

    # |w|, |x|, |y|, |z| up to sign, from the diagonal trace combinations.
    q_abs = _sqrt_positive_part(
        torch.stack(
            [
                1.0 + m00 + m11 + m22,
                1.0 + m00 - m11 - m22,
                1.0 - m00 + m11 - m22,
                1.0 - m00 - m11 + m22,
            ],
            dim=-1,
        )
    )

    # we produce the desired quaternion multiplied by each of r, i, j, k
    quat_by_rijk = torch.stack(
        [
            torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
            torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
            torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
            torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
        ],
        dim=-2,
    )

    # We floor here at 0.1 but the exact level is not important; if q_abs is small,
    # the candidate won't be picked.
    flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
    quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))

    # if not for numerical problems, quat_candidates[i] should be same (up to a sign),
    # forall i; we pick the best-conditioned one (with the largest denominator)
    return quat_candidates[
        F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :
    ].reshape(batch_dim + (4,))


def quternion_to_matrix(r):
    """Convert [N, 4] quaternions (w, x, y, z) to [N, 3, 3] rotation matrices.

    NOTE(review): the 'quternion' typo in the name is kept for caller
    compatibility, and the output is hard-coded to device 'cuda'.
    """
    norm = torch.sqrt(
        r[:, 0] * r[:, 0] + r[:, 1] * r[:, 1] + r[:, 2] * r[:, 2] + r[:, 3] * r[:, 3]
    )
    q = r / norm[:, None]
    R = torch.zeros((q.size(0), 3, 3), device="cuda")
    r = q[:, 0]
    x = q[:, 1]
    y = q[:, 2]
    z = q[:, 3]

    R[:, 0, 0] = 1 - 2 * (y * y + z * z)
    R[:, 0, 1] = 2 * (x * y - r * z)
    R[:, 0, 2] = 2 * (x * z + r * y)
    R[:, 1, 0] = 2 * (x * y + r * z)
    R[:, 1, 1] = 1 - 2 * (x * x + z * z)
    R[:, 1, 2] = 2 * (y * z - r * x)
    R[:, 2, 0] = 2 * (x * z - r * y)
    R[:, 2, 1] = 2 * (y * z + r * x)
    R[:, 2, 2] = 1 - 2 * (x * x + y * y)
    return R


def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
    """
    from Pytorch3d
    Convert a unit quaternion to a standard form: one in which the real
    part is non negative.

    Args:
        quaternions: Quaternions with real part first,
            as tensor of shape (..., 4).

    Returns:
        Standardized quaternions as tensor of shape (..., 4).
    """
    return torch.where(quaternions[..., 0:1] < 0, -quaternions, quaternions)


def quaternion_multiply(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """
    From pytorch3d
    Multiply two quaternions.
    Usual torch rules for broadcasting apply.

    Args:
        a: Quaternions as tensor of shape (..., 4), real part first.
        b: Quaternions as tensor of shape (..., 4), real part first.

    Returns:
        The product of a and b, a tensor of quaternions shape (..., 4),
        standardized to a non-negative real part.
    """
    aw, ax, ay, az = torch.unbind(a, -1)
    bw, bx, by, bz = torch.unbind(b, -1)
    ow = aw * bw - ax * bx - ay * by - az * bz
    ox = aw * bx + ax * bw + ay * bz - az * by
    oy = aw * by - ax * bz + ay * bw + az * bx
    oz = aw * bz + ax * by - ay * bx + az * bw
    ret = torch.stack((ow, ox, oy, oz), -1)
    ret = standardize_quaternion(ret)
    return ret


def _test_matrix_to_quaternion():
    # init a random batch of quaternion
    r = torch.randn((10, 4)).cuda()
    norm = torch.sqrt(
        r[:, 0] * r[:, 0] + r[:, 1] * r[:, 1] + r[:, 2] * r[:, 2] + r[:, 3] * r[:, 3]
    )
    q = r / norm[:, None]
    q = standardize_quaternion(q)

    R = quternion_to_matrix(q)
    I_rec = R @ R.transpose(-2, -1)
    I_rec_error = torch.abs(I_rec - torch.eye(3, device="cuda")[None, ...]).max()

    q_recovered = matrix_to_quaternion(R)
    norm_ = torch.linalg.norm(q_recovered, dim=-1)
    q_recovered = q_recovered / norm_[..., None]
    q_recovered = standardize_quaternion(q_recovered)
    print(q_recovered.shape, q.shape, R.shape)

    rec = (q - q_recovered).abs().max()
    print("rotation to I error:", I_rec_error, "quant rec error: ", rec)


def _test_matrix_to_quaternion_2():
    R = (
        torch.tensor(
            [[[1, 0, 0], [0, -1, 0], [0, 0, -1]], [[1, 0, 0], [0, 0, 1], [0, -1, 0]]]
        )
        * 1.0
    )
    q_rec = matrix_to_quaternion(R.transpose(-2, -1))
    R_rec = quternion_to_matrix(q_rec)
    print(R_rec)


if __name__ == "__main__":
    # _test_rigid_transform()
    _test_matrix_to_quaternion()
    _test_matrix_to_quaternion_2()


================================================
FILE: physdreamer/gaussian_3d/utils/sh_utils.py
================================================
# Copyright 2021 The PlenOctree Authors.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2.
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import torch

# Hard-coded real spherical-harmonic basis coefficients, degrees 0 through 4.
C0 = 0.28209479177387814
C1 = 0.4886025119029199
C2 = [
    1.0925484305920792,
    -1.0925484305920792,
    0.31539156525252005,
    -1.0925484305920792,
    0.5462742152960396
]
C3 = [
    -0.5900435899266435,
    2.890611442640554,
    -0.4570457994644658,
    0.3731763325901154,
    -0.4570457994644658,
    1.445305721320277,
    -0.5900435899266435
]
C4 = [
    2.5033429417967046,
    -1.7701307697799304,
    0.9461746957575601,
    -0.6690465435572892,
    0.10578554691520431,
    -0.6690465435572892,
    0.47308734787878004,
    -1.7701307697799304,
    0.6258357354491761,
]


def eval_sh(deg, sh, dirs):
    """
    Evaluate spherical harmonics at unit directions
    using hardcoded SH polynomials.
    Works with torch/np/jnp.
    ... Can be 0 or more batch dimensions.
    Args:
        deg: int SH deg. Currently, 0-4 supported (docstring previously said 0-3;
            the assert and the code below handle degree 4).
        sh: jnp.ndarray SH coeffs [..., C, (deg + 1) ** 2]
        dirs: jnp.ndarray unit directions [..., 3]
    Returns:
        [..., C]
    """
    assert deg <= 4 and deg >= 0
    coeff = (deg + 1) ** 2
    assert sh.shape[-1] >= coeff

    # Degree 0: constant term.
    result = C0 * sh[..., 0]
    if deg > 0:
        x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
        result = (result -
                  C1 * y * sh[..., 1] +
                  C1 * z * sh[..., 2] -
                  C1 * x * sh[..., 3])

        if deg > 1:
            xx, yy, zz = x * x, y * y, z * z
            xy, yz, xz = x * y, y * z, x * z
            result = (result +
                      C2[0] * xy * sh[..., 4] +
                      C2[1] * yz * sh[..., 5] +
                      C2[2] * (2.0 * zz - xx - yy) * sh[..., 6] +
                      C2[3] * xz * sh[..., 7] +
                      C2[4] * (xx - yy) * sh[..., 8])

            if deg > 2:
                result = (result +
                          C3[0] * y * (3 * xx - yy) * sh[..., 9] +
                          C3[1] * xy * z * sh[..., 10] +
                          C3[2] * y * (4 * zz - xx - yy) * sh[..., 11] +
                          C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12] +
                          C3[4] * x * (4 * zz - xx - yy) * sh[..., 13] +
                          C3[5] * z * (xx - yy) * sh[..., 14] +
                          C3[6] * x * (xx - 3 * yy) * sh[..., 15])

                if deg > 3:
                    result = (result +
                              C4[0] * xy * (xx - yy) * sh[..., 16] +
                              C4[1] * yz * (3 * xx - yy) * sh[..., 17] +
                              C4[2] * xy * (7 * zz - 1) * sh[..., 18] +
                              C4[3] * yz * (7 * zz - 3) * sh[..., 19] +
                              C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20] +
                              C4[5] * xz * (7 * zz - 3) * sh[..., 21] +
                              C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22] +
                              C4[7] * xz * (xx - 3 * yy) * sh[..., 23] +
                              C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24])
    return result


def RGB2SH(rgb):
    """Convert [0, 1] RGB to the degree-0 SH (DC) coefficient."""
    return (rgb - 0.5) / C0


def SH2RGB(sh):
    """Inverse of RGB2SH: degree-0 SH coefficient back to [0, 1] RGB."""
    return sh * C0 + 0.5


================================================
FILE: physdreamer/gaussian_3d/utils/system_utils.py
================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
# # For inquiries contact george.drettakis@inria.fr # from errno import EEXIST from os import makedirs, path import os def mkdir_p(folder_path): # Creates a directory. equivalent to using mkdir -p on the command line try: makedirs(folder_path) except OSError as exc: # Python >2.5 if exc.errno == EEXIST and path.isdir(folder_path): pass else: raise def searchForMaxIteration(folder): saved_iters = [int(fname.split("_")[-1]) for fname in os.listdir(folder)] return max(saved_iters) ================================================ FILE: physdreamer/losses/smoothness_loss.py ================================================ import torch from typing import Tuple def compute_plane_tv(t: torch.Tensor, only_w: bool = False) -> float: """Computes total variance across a plane. From nerf-studio Args: t: Plane tensor only_w: Whether to only compute total variance across w dimension Returns: Total variance """ _, h, w = t.shape w_tv = torch.square(t[..., :, 1:] - t[..., :, : w - 1]).mean() if only_w: return w_tv h_tv = torch.square(t[..., 1:, :] - t[..., : h - 1, :]).mean() return h_tv + w_tv def compute_plane_smoothness(t: torch.Tensor) -> float: """Computes smoothness across the temporal axis of a plane From nerf-studio Args: t: Plane tensor Returns: Time smoothness """ _, h, _ = t.shape # Convolve with a second derivative filter, in the time dimension which is dimension 2 first_difference = t[..., 1:, :] - t[..., : h - 1, :] # [c, h-1, w] second_difference = ( first_difference[..., 1:, :] - first_difference[..., : h - 2, :] ) # [c, h-2, w] # Take the L2 norm of the result return torch.square(second_difference).mean() ================================================ FILE: physdreamer/operators/dct.py ================================================ """ Code from https://github.com/zh217/torch-dct/blob/master/torch_dct/_dct.py """ import numpy as np import torch import torch.nn as nn import torch.fft def dct1_rfft_impl(x): return torch.view_as_real(torch.fft.rfft(x, dim=1)) def 
def dct_fft_impl(v):
    """Complex FFT along dim 1, viewed as interleaved (real, imag) pairs."""
    return torch.view_as_real(torch.fft.fft(v, dim=1))


def idct_irfft_impl(V):
    """Inverse real FFT along dim 1 of a (real, imag)-pair tensor."""
    return torch.fft.irfft(torch.view_as_complex(V), n=V.shape[1], dim=1)


def dct(x, norm=None):
    """
    Discrete Cosine Transform, Type II (a.k.a. the DCT)

    For the meaning of the parameter `norm`, see:
    https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html

    if norm is None:
                  N-1
        y[k] = 2* sum x[n]*cos(pi*k*(2n+1)/(2*N)), 0 <= k < N.
                  n=0

    :param x: the input signal
    :param norm: the normalization, None or 'ortho'
    :return: the DCT-II of the signal over the last dimension
    """
    x_shape = x.shape
    N = x_shape[-1]
    x = x.contiguous().view(-1, N)

    # Reorder: even-indexed samples followed by reversed odd-indexed samples.
    v = torch.cat([x[:, ::2], x[:, 1::2].flip([1])], dim=1)

    Vc = dct_fft_impl(v)

    # Twiddle factors e^{-i*pi*k/(2N)} split into real and imaginary parts.
    k = -torch.arange(N, dtype=x.dtype, device=x.device)[None, :] * np.pi / (2 * N)
    W_r = torch.cos(k)
    W_i = torch.sin(k)

    V = Vc[:, :, 0] * W_r - Vc[:, :, 1] * W_i

    if norm == "ortho":
        V[:, 0] /= np.sqrt(N) * 2
        V[:, 1:] /= np.sqrt(N / 2) * 2

    V = 2 * V.view(*x_shape)
    return V


def idct(X, norm=None):
    """
    The inverse to DCT-II, which is a scaled Discrete Cosine Transform, Type III

    Our definition of idct is that idct(dct(x)) == x

    For the meaning of the parameter `norm`, see:
    https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html

    :param X: the input signal
    :param norm: the normalization, None or 'ortho'
    :return: the inverse DCT-II of the signal over the last dimension
    """
    x_shape = X.shape
    N = x_shape[-1]

    X_v = X.contiguous().view(-1, x_shape[-1]) / 2

    if norm == "ortho":
        # Undo the orthonormal scaling applied in dct().
        X_v[:, 0] *= np.sqrt(N) * 2
        X_v[:, 1:] *= np.sqrt(N / 2) * 2

    # Twiddle factors e^{i*pi*k/(2N)}.
    k = (
        torch.arange(x_shape[-1], dtype=X.dtype, device=X.device)[None, :]
        * np.pi
        / (2 * N)
    )
    W_r = torch.cos(k)
    W_i = torch.sin(k)

    V_t_r = X_v
    V_t_i = torch.cat([X_v[:, :1] * 0, -X_v.flip([1])[:, :-1]], dim=1)

    V_r = V_t_r * W_r - V_t_i * W_i
    V_i = V_t_r * W_i + V_t_i * W_r

    V = torch.cat([V_r.unsqueeze(2), V_i.unsqueeze(2)], dim=2)

    v = idct_irfft_impl(V)
    # Undo the even/odd reordering performed in dct().
    x = v.new_zeros(v.shape)
    x[:, ::2] += v[:, : N - (N // 2)]
    x[:, 1::2] += v.flip([1])[:, : N // 2]

    return x.view(*x_shape)


def dct_3d(x, norm=None):
    """
    3-dimentional Discrete Cosine Transform, Type II (a.k.a. the DCT)

    For the meaning of the parameter `norm`, see:
    https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html

    :param x: the input signal
    :param norm: the normalization, None or 'ortho'
    :return: the DCT-II of the signal over the last 3 dimensions
    """
    # Apply the 1D DCT separably over the last three dimensions.
    X1 = dct(x, norm=norm)
    X2 = dct(X1.transpose(-1, -2), norm=norm)
    X3 = dct(X2.transpose(-1, -3), norm=norm)
    return X3.transpose(-1, -3).transpose(-1, -2)


def idct_3d(X, norm=None):
    """
    The inverse to 3D DCT-II, which is a scaled Discrete Cosine Transform, Type III

    Our definition of idct is that idct_3d(dct_3d(x)) == x

    For the meaning of the parameter `norm`, see:
    https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html

    :param X: the input signal
    :param norm: the normalization, None or 'ortho'
    :return: the DCT-II of the signal over the last 3 dimensions
    """
    x1 = idct(X, norm=norm)
    x2 = idct(x1.transpose(-1, -2), norm=norm)
    x3 = idct(x2.transpose(-1, -3), norm=norm)
    return x3.transpose(-1, -3).transpose(-1, -2)


def code_test_dct3d():
    # init a tensor of shape [100, 20, 3]
    x = torch.rand(100, 20, 3)

    dct_coef = dct_3d(x, norm="ortho")
    print("inp signal shape: ", x.shape, " dct coef shape: ", dct_coef.shape)

    x_recon = idct_3d(dct_coef, norm="ortho")
    print("inp signal shape: ", x.shape, " recon signal shape: ", x_recon.shape)
    print("max error: ", torch.max(torch.abs(x - x_recon)))

    dct_coef[:, 0, :] = 0
    x_recon = idct_3d(dct_coef, norm="ortho")
    print("max error after removing first order: ", torch.max(torch.abs(x - x_recon)))


if __name__ == "__main__":
    code_test_dct3d()


================================================
FILE: physdreamer/operators/np_operators.py
================================================
import torch
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
def feature_map_to_rgb_pca(feature_map):
    """Project a feature map to an RGB image via 3-component PCA.

    Args:
        feature_map: (C, H, W) feature map (torch.Tensor or np.ndarray).

    Outputs:
        rgb_image: (H, W, 3) np.ndarray, min-max normalized to roughly [0, 1].
    """
    # Work on a detached numpy copy regardless of the input type.
    if isinstance(feature_map, torch.Tensor):
        feature_map = feature_map.detach().cpu().numpy()

    height, width = feature_map.shape[1:]

    # Collapse the spatial dims: one row per pixel -> [H*W, C].
    pixels = feature_map.reshape(feature_map.shape[0], -1).T

    # Reduce the channel dimension to 3 principal components.
    projected = PCA(n_components=3).fit_transform(pixels)
    rgb_image = projected.reshape(height, width, 3)

    # Min-max normalize; the 1e-3 keeps a constant map from dividing by zero.
    lo = rgb_image.min()
    hi = rgb_image.max()
    rgb_image = (rgb_image - lo) / (hi - lo + 1e-3)
    return rgb_image


# ================================================
# FILE: physdreamer/operators/rotation.py
# ================================================
from typing import Optional

import torch
import torch.nn.functional as F


def rotation_6d_to_matrix(d6: torch.Tensor) -> torch.Tensor:
    """
    Converts 6D rotation representation by Zhou et al. [1] to rotation matrix
    using Gram--Schmidt orthogonalization per Section B of [1].

    Args:
        d6: 6D rotation representation, of size (*, 6)

    Returns:
        batch of rotation matrices of size (*, 3, 3)

    [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
    On the Continuity of Rotation Representations in Neural Networks.
    CVPR 2019. http://arxiv.org/abs/1812.07035
    """
    raw_x, raw_y = d6[..., :3], d6[..., 3:]
    # First basis vector: the normalized first triple.
    col_x = F.normalize(raw_x, dim=-1)
    # Second: subtract the component along col_x, then normalize (Gram-Schmidt).
    col_y = raw_y - (col_x * raw_y).sum(-1, keepdim=True) * col_x
    col_y = F.normalize(col_y, dim=-1)
    # Third: the cross product completes a right-handed orthonormal frame.
    col_z = torch.cross(col_x, col_y, dim=-1)
    return torch.stack((col_x, col_y, col_z), dim=-2)
Args: matrix: batch of rotation matrices of size (*, 3, 3) Returns: 6D rotation representation, of size (*, 6) [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H. On the Continuity of Rotation Representations in Neural Networks. IEEE Conference on Computer Vision and Pattern Recognition, 2019. Retrieved from http://arxiv.org/abs/1812.07035 """ batch_dim = matrix.size()[:-2] return matrix[..., :2, :].clone().reshape(batch_dim + (6,)) def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor: """ Convert rotations given as quaternions to rotation matrices. Args: quaternions: quaternions with real part first, as tensor of shape (..., 4). Returns: Rotation matrices as tensor of shape (..., 3, 3). """ r, i, j, k = torch.unbind(quaternions, -1) # pyre-fixme[58]: `/` is not supported for operand types `float` and `Tensor`. two_s = 2.0 / (quaternions * quaternions).sum(-1) o = torch.stack( ( 1 - two_s * (j * j + k * k), two_s * (i * j - k * r), two_s * (i * k + j * r), two_s * (i * j + k * r), 1 - two_s * (i * i + k * k), two_s * (j * k - i * r), two_s * (i * k - j * r), two_s * (j * k + i * r), 1 - two_s * (i * i + j * j), ), -1, ) return o.reshape(quaternions.shape[:-1] + (3, 3)) def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor: """ Returns torch.sqrt(torch.max(0, x)) but with a zero subgradient where x is 0. """ ret = torch.zeros_like(x) positive_mask = x > 0 ret[positive_mask] = torch.sqrt(x[positive_mask]) return ret def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor: """ Convert rotations given as rotation matrices to quaternions. Args: matrix: Rotation matrices as tensor of shape (..., 3, 3). Returns: quaternions with real part first, as tensor of shape (..., 4). 
""" if matrix.size(-1) != 3 or matrix.size(-2) != 3: raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.") batch_dim = matrix.shape[:-2] m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind( matrix.reshape(batch_dim + (9,)), dim=-1 ) q_abs = _sqrt_positive_part( torch.stack( [ 1.0 + m00 + m11 + m22, 1.0 + m00 - m11 - m22, 1.0 - m00 + m11 - m22, 1.0 - m00 - m11 + m22, ], dim=-1, ) ) # we produce the desired quaternion multiplied by each of r, i, j, k quat_by_rijk = torch.stack( [ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and # `int`. torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1), # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and # `int`. torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1), # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and # `int`. torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1), # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and # `int`. torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1), ], dim=-2, ) # We floor here at 0.1 but the exact level is not important; if q_abs is small, # the candidate won't be picked. 
flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device) quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr)) # if not for numerical problems, quat_candidates[i] should be same (up to a sign), # forall i; we pick the best-conditioned one (with the largest denominator) return quat_candidates[ F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, : ].reshape(batch_dim + (4,)) ================================================ FILE: physdreamer/utils/camera_utils.py ================================================ import numpy as np def normalize(x: np.ndarray) -> np.ndarray: """Normalization helper function.""" return x / np.linalg.norm(x) def viewmatrix(lookdir: np.ndarray, up: np.ndarray, position: np.ndarray) -> np.ndarray: """Construct lookat view matrix.""" vec2 = normalize(lookdir) vec0 = normalize(np.cross(up, vec2)) vec1 = normalize(np.cross(vec2, vec0)) m = np.stack([vec0, vec1, vec2, position], axis=1) return m def generate_spiral_path( pose: np.ndarray, radius: float, lookat_pt: np.ndarray = np.array([0, 0, 0]), up: np.ndarray = np.array([0, 0, 1]), n_frames: int = 60, n_rots: int = 1, y_scale: float = 1.0, ) -> np.ndarray: """Calculates a forward facing spiral path for rendering.""" x_axis = pose[:3, 0] y_axis = pose[:3, 1] campos = pose[:3, 3] render_poses = [] for theta in np.linspace(0.0, 2 * np.pi * n_rots, n_frames, endpoint=False): t = (np.cos(theta) * x_axis + y_scale * np.sin(theta) * y_axis) * radius position = campos + t z_axis = position - lookat_pt new_pose = np.eye(4) new_pose[:3] = viewmatrix(z_axis, up, position) render_poses.append(new_pose) render_poses = np.stack(render_poses, axis=0) return render_poses ================================================ FILE: physdreamer/utils/colmap_utils.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. 
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact  george.drettakis@inria.fr
#

import numpy as np
import collections
import struct

# Lightweight records mirroring COLMAP's C++ reconstruction types.
CameraModel = collections.namedtuple(
    "CameraModel", ["model_id", "model_name", "num_params"])
Camera = collections.namedtuple(
    "Camera", ["id", "model", "width", "height", "params"])
BaseImage = collections.namedtuple(
    "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"])
Point3D = collections.namedtuple(
    "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"])

# Every camera model COLMAP can emit, with its intrinsic parameter count.
CAMERA_MODELS = {
    CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3),
    CameraModel(model_id=1, model_name="PINHOLE", num_params=4),
    CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4),
    CameraModel(model_id=3, model_name="RADIAL", num_params=5),
    CameraModel(model_id=4, model_name="OPENCV", num_params=8),
    CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8),
    CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12),
    CameraModel(model_id=7, model_name="FOV", num_params=5),
    CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4),
    CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5),
    CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12),
}
# Index the models by id and by name for O(1) lookup while parsing.
CAMERA_MODEL_IDS = {model.model_id: model for model in CAMERA_MODELS}
CAMERA_MODEL_NAMES = {model.model_name: model for model in CAMERA_MODELS}


def qvec2rotmat(qvec):
    """Quaternion (w, x, y, z) -> 3x3 rotation matrix."""
    w, x, y, z = qvec[0], qvec[1], qvec[2], qvec[3]
    return np.array([
        [1 - 2 * y ** 2 - 2 * z ** 2,
         2 * x * y - 2 * w * z,
         2 * z * x + 2 * w * y],
        [2 * x * y + 2 * w * z,
         1 - 2 * x ** 2 - 2 * z ** 2,
         2 * y * z - 2 * w * x],
        [2 * z * x - 2 * w * y,
         2 * y * z + 2 * w * x,
         1 - 2 * x ** 2 - 2 * y ** 2]])
def rotmat2qvec(R):
    """3x3 rotation matrix -> quaternion (w, x, y, z) with w >= 0."""
    Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat
    # Symmetric 4x4 matrix whose dominant eigenvector is the quaternion
    # (eigenvalue method); only the lower triangle is filled because
    # np.linalg.eigh reads just that triangle.
    K = np.array([
        [Rxx - Ryy - Rzz, 0, 0, 0],
        [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0],
        [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0],
        [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0
    eigvals, eigvecs = np.linalg.eigh(K)
    # Reorder from (x, y, z, w) to (w, x, y, z).
    qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)]
    if qvec[0] < 0:
        qvec *= -1
    return qvec


class Image(BaseImage):
    # Convenience accessor: rotation matrix of this image's pose.
    def qvec2rotmat(self):
        return qvec2rotmat(self.qvec)


def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"):
    """Read and unpack the next bytes from a binary file.
    :param fid:
    :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc.
    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
    :param endian_character: Any of {@, =, <, >, !}
    :return: Tuple of read and unpacked values.
    """
    data = fid.read(num_bytes)
    return struct.unpack(endian_character + format_char_sequence, data)


def read_points3D_text(path):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::ReadPoints3DText(const std::string& path)
        void Reconstruction::WritePoints3DText(const std::string& path)
    """
    xyzs = None
    rgbs = None
    errors = None
    # First pass: count data lines so the arrays can be pre-allocated.
    num_points = 0
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                num_points += 1

    xyzs = np.empty((num_points, 3))
    rgbs = np.empty((num_points, 3))
    errors = np.empty((num_points, 1))
    # Second pass: parse XYZ, RGB and reprojection error for each point.
    count = 0
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                elems = line.split()
                xyz = np.array(tuple(map(float, elems[1:4])))
                rgb = np.array(tuple(map(int, elems[4:7])))
                error = np.array(float(elems[7]))
                xyzs[count] = xyz
                rgbs[count] = rgb
                errors[count] = error
                count += 1

    return xyzs, rgbs, errors
def read_points3D_binary(path_to_model_file):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::ReadPoints3DBinary(const std::string& path)
        void Reconstruction::WritePoints3DBinary(const std::string& path)
    """
    with open(path_to_model_file, "rb") as fid:
        num_points = read_next_bytes(fid, 8, "Q")[0]

        xyzs = np.empty((num_points, 3))
        rgbs = np.empty((num_points, 3))
        errors = np.empty((num_points, 1))

        for p_id in range(num_points):
            # 43 bytes = uint64 id + 3 doubles xyz + 3 uint8 rgb + double error.
            binary_point_line_properties = read_next_bytes(
                fid, num_bytes=43, format_char_sequence="QdddBBBd")
            xyz = np.array(binary_point_line_properties[1:4])
            rgb = np.array(binary_point_line_properties[4:7])
            error = np.array(binary_point_line_properties[7])
            # Skip the track (image_id, point2D_idx pairs) — not returned here.
            track_length = read_next_bytes(
                fid, num_bytes=8, format_char_sequence="Q")[0]
            track_elems = read_next_bytes(
                fid, num_bytes=8*track_length,
                format_char_sequence="ii"*track_length)
            xyzs[p_id] = xyz
            rgbs[p_id] = rgb
            errors[p_id] = error
    return xyzs, rgbs, errors


def read_intrinsics_text(path):
    """
    Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py
    """
    cameras = {}
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                elems = line.split()
                camera_id = int(elems[0])
                model = elems[1]
                assert model == "PINHOLE", "While the loader support other types, the rest of the code assumes PINHOLE"
                width = int(elems[2])
                height = int(elems[3])
                params = np.array(tuple(map(float, elems[4:])))
                cameras[camera_id] = Camera(id=camera_id, model=model,
                                            width=width, height=height,
                                            params=params)
    return cameras


def read_extrinsics_binary(path_to_model_file):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::ReadImagesBinary(const std::string& path)
        void Reconstruction::WriteImagesBinary(const std::string& path)
    """
    images = {}
    with open(path_to_model_file, "rb") as fid:
        num_reg_images = read_next_bytes(fid, 8, "Q")[0]
        for _ in range(num_reg_images):
            # 64 bytes = int32 id + 4 doubles qvec + 3 doubles tvec + int32 cam id.
            binary_image_properties = read_next_bytes(
                fid, num_bytes=64, format_char_sequence="idddddddi")
            image_id = binary_image_properties[0]
            qvec = np.array(binary_image_properties[1:5])
            tvec = np.array(binary_image_properties[5:8])
            camera_id = binary_image_properties[8]
            # Image name is NUL-terminated; read one byte at a time until 0x00.
            image_name = ""
            current_char = read_next_bytes(fid, 1, "c")[0]
            while current_char != b"\x00":   # look for the ASCII 0 entry
                image_name += current_char.decode("utf-8")
                current_char = read_next_bytes(fid, 1, "c")[0]
            num_points2D = read_next_bytes(fid, num_bytes=8,
                                           format_char_sequence="Q")[0]
            # Each 2-D point is (x: double, y: double, point3D_id: int64).
            x_y_id_s = read_next_bytes(fid, num_bytes=24*num_points2D,
                                       format_char_sequence="ddq"*num_points2D)
            xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])),
                                   tuple(map(float, x_y_id_s[1::3]))])
            point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3])))
            images[image_id] = Image(
                id=image_id, qvec=qvec, tvec=tvec,
                camera_id=camera_id, name=image_name,
                xys=xys, point3D_ids=point3D_ids)
    return images


def read_intrinsics_binary(path_to_model_file):
    """
    see: src/base/reconstruction.cc
        void Reconstruction::WriteCamerasBinary(const std::string& path)
        void Reconstruction::ReadCamerasBinary(const std::string& path)
    """
    cameras = {}
    with open(path_to_model_file, "rb") as fid:
        num_cameras = read_next_bytes(fid, 8, "Q")[0]
        for _ in range(num_cameras):
            # 24 bytes = int32 camera_id + int32 model_id + uint64 width + uint64 height.
            camera_properties = read_next_bytes(
                fid, num_bytes=24, format_char_sequence="iiQQ")
            camera_id = camera_properties[0]
            model_id = camera_properties[1]
            model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name
            width = camera_properties[2]
            height = camera_properties[3]
            # The parameter count is model-dependent (see CAMERA_MODELS).
            num_params = CAMERA_MODEL_IDS[model_id].num_params
            params = read_next_bytes(fid, num_bytes=8*num_params,
                                     format_char_sequence="d"*num_params)
            cameras[camera_id] = Camera(id=camera_id,
                                        model=model_name,
                                        width=width,
                                        height=height,
                                        params=np.array(params))
        assert len(cameras) == num_cameras
    return cameras


def read_extrinsics_text(path):
    """
    Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py
    """
    images = {}
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                elems = line.split()
                image_id = int(elems[0])
                qvec = np.array(tuple(map(float, elems[1:5])))
                tvec = np.array(tuple(map(float, elems[5:8])))
                camera_id = int(elems[8])
                image_name = elems[9]
                # The next line holds the (x, y, point3D_id) triples.
                elems = fid.readline().split()
                xys = np.column_stack([tuple(map(float, elems[0::3])),
                                       tuple(map(float, elems[1::3]))])
                point3D_ids = np.array(tuple(map(int, elems[2::3])))
                images[image_id] = Image(
                    id=image_id, qvec=qvec, tvec=tvec,
                    camera_id=camera_id, name=image_name,
                    xys=xys, point3D_ids=point3D_ids)
    return images


def read_colmap_bin_array(path):
    """
    Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_dense.py

    :param path: path to the colmap binary file.
    :return: nd array with the floating point values in the value
    """
    with open(path, "rb") as fid:
        # Header: "width&height&channels&" on the first line.
        width, height, channels = np.genfromtxt(fid, delimiter="&", max_rows=1,
                                                usecols=(0, 1, 2), dtype=int)
        # Re-scan byte-by-byte past the third '&' to position at the raw data.
        fid.seek(0)
        num_delimiter = 0
        byte = fid.read(1)
        while True:
            if byte == b"&":
                num_delimiter += 1
                if num_delimiter >= 3:
                    break
            byte = fid.read(1)
        array = np.fromfile(fid, np.float32)
    array = array.reshape((width, height, channels), order="F")
    return np.transpose(array, (1, 0, 2)).squeeze()


# ================================================
# FILE: physdreamer/utils/config.py
# ================================================
from omegaconf import OmegaConf


def load_config_with_merge(config_path: str):
    """Load an OmegaConf YAML; recursively merge its `_base` config if declared."""
    cfg = OmegaConf.load(config_path)

    path_ = cfg.get("_base", None)
    if path_ is not None:
        print(f"Merging base config from {path_}")
        # Base config first so the child overrides it.
        cfg = OmegaConf.merge(load_config_with_merge(path_), cfg)
    else:
        return cfg
    return cfg


def merge_without_none(base_cfg, override_cfg):
    # Overwrite base keys only with non-None override values; keys missing
    # from base are still added, even as None.
    for key, value in override_cfg.items():
        if value is not None:
            base_cfg[key] = value
        elif not (key in base_cfg):
            base_cfg[key] = None
    return base_cfg
recursively merge base config cfg = load_config_with_merge(config_path) # parse cli args, and merge them into cfg cli_conf = OmegaConf.from_cli(cli_args) arg_cfg = OmegaConf.create(vars(args)) # drop None in arg_cfg arg_cfg = OmegaConf.merge(arg_cfg, cli_conf) # cfg = OmegaConf.merge(cfg, arg_cfg, cli_conf) cfg = merge_without_none(cfg, arg_cfg) return cfg ================================================ FILE: physdreamer/utils/img_utils.py ================================================ import torch import torchvision import cv2 import numpy as np import torch.nn.functional as F from torch.autograd import Variable from math import exp def make_grid(imgs: torch.Tensor, scale=0.5): """ Args: imgs: [B, C, H, W] in [0, 1] Output: x row of images, and 3 x column of images which means 3 x ^ 2 <= B img_grid: np.ndarray, [H', W', C] """ B, C, H, W = imgs.shape num_row = int(np.sqrt(B / 3)) if num_row < 1: num_row = 1 num_col = int(np.ceil(B / num_row)) img_grid = torchvision.utils.make_grid(imgs, nrow=num_col, padding=0) img_grid = img_grid.permute(1, 2, 0).cpu().numpy() # resize by scale img_grid = cv2.resize(img_grid, None, fx=scale, fy=scale) return img_grid def compute_psnr(img1, img2, mask=None): """ Args: img1: [B, C, H, W] img2: [B, C, H, W] mask: [B, 1, H, W] or [1, 1, H, W] or None Outs: psnr: [B] """ # batch dim is preserved if mask is None: mse = (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True) else: if mask.shape[0] != img1.shape[0]: mask = mask.repeat(img1.shape[0], 1, 1, 1) if mask.shape[1] != img1.shape[1]: mask = mask.repeat(1, img1.shape[1], 1, 1) diff = ((img1 - img2)) ** 2 diff = diff * mask mse = diff.view(img1.shape[0], -1).sum(1, keepdim=True) / ( mask.view(img1.shape[0], -1).sum(1, keepdim=True) + 1e-8 ) return 20 * torch.log10(1.0 / torch.sqrt(mse)) def torch_rgb_to_gray(image): # image is [B, C, H, W] gray_image = ( 0.299 * image[:, 0, :, :] + 0.587 * image[:, 1, :, :] + 0.114 * image[:, 2, :, :] ) gray_image = 
def compute_gradient_loss(pred, gt, mask=None):
    """
    L1 difference between Sobel gradients of pred and gt (compared as grayscale).

    Args:
        pred: [B, C, H, W]
        gt: [B, C, H, W]
        mask: [B, 1, H, W] or None
    """
    assert pred.shape == gt.shape, "a and b must have the same shape"
    pred = torch_rgb_to_gray(pred)
    gt = torch_rgb_to_gray(gt)

    sobel_kernel_x = torch.tensor(
        [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=pred.dtype, device=pred.device
    )
    sobel_kernel_y = torch.tensor(
        [[-1, -2, -1], [0, 0, 0], [1, 2, 1]], dtype=pred.dtype, device=pred.device
    )

    # NOTE(review): the 1-channel gray image is replicated to 3 channels and the
    # three identical Sobel responses summed then divided by 3 — numerically the
    # same as one single-channel convolution; looks like leftover from RGB input.
    gradient_a_x = (
        torch.nn.functional.conv2d(
            pred.repeat(1, 3, 1, 1),
            sobel_kernel_x.unsqueeze(0).unsqueeze(0).repeat(1, 3, 1, 1),
            padding=1,
        )
        / 3
    )
    gradient_a_y = (
        torch.nn.functional.conv2d(
            pred.repeat(1, 3, 1, 1),
            sobel_kernel_y.unsqueeze(0).unsqueeze(0).repeat(1, 3, 1, 1),
            padding=1,
        )
        / 3
    )
    gradient_b_x = (
        torch.nn.functional.conv2d(
            gt.repeat(1, 3, 1, 1),
            sobel_kernel_x.unsqueeze(0).unsqueeze(0).repeat(1, 3, 1, 1),
            padding=1,
        )
        / 3
    )
    gradient_b_y = (
        torch.nn.functional.conv2d(
            gt.repeat(1, 3, 1, 1),
            sobel_kernel_y.unsqueeze(0).unsqueeze(0).repeat(1, 3, 1, 1),
            padding=1,
        )
        / 3
    )

    pred_grad = torch.cat([gradient_a_x, gradient_a_y], dim=1)
    gt_grad = torch.cat([gradient_b_x, gradient_b_y], dim=1)

    if mask is None:
        gradient_difference = torch.abs(pred_grad - gt_grad).mean()
    else:
        # Mean over the 2 gradient channels, then a masked average over pixels.
        gradient_difference = torch.abs(pred_grad - gt_grad).mean(dim=1, keepdim=True)[
            mask
        ].sum() / (mask.sum() + 1e-8)

    return gradient_difference


def mark_image_with_red_squares(img):
    # Stamp a 4x4 red dot at every 16-pixel stride of img ([B, H, W, C], RGB).
    # NOTE(review): mark_color is created on CPU — assumes img is a CPU tensor.
    mark_color = torch.tensor([1.0, 0, 0], dtype=torch.float32)
    for x_offset in range(4):
        for y_offset in range(4):
            img[:, x_offset::16, y_offset::16, :] = mark_color
    return img


# below for compute batched SSIM
def gaussian(window_size, sigma):
    # 1-D Gaussian kernel of length window_size, normalized to sum to 1.
    gauss = torch.Tensor(
        [
            exp(-((x - window_size // 2) ** 2) / float(2 * sigma**2))
            for x in range(window_size)
        ]
    )
    return gauss / gauss.sum()


def create_window(window_size, channel):
    # Separable 2-D Gaussian window, shaped [channel, 1, ws, ws] for grouped conv2d.
    _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
    _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
    window = Variable(
        _2D_window.expand(channel, 1, window_size, window_size).contiguous()
    )
    return window


def compute_ssim(img1, img2, window_size=11, size_average=True):
    """SSIM between two [B, C, H, W] batches; scalar if size_average else [B]."""
    channel = img1.size(-3)
    window = create_window(window_size, channel)

    if img1.is_cuda:
        window = window.cuda(img1.get_device())
    window = window.type_as(img1)

    return _ssim(img1, img2, window, window_size, channel, size_average)


def _ssim(img1, img2, window, window_size, channel, size_average=True):
    # Local means via Gaussian-weighted grouped convolution.
    mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
    mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    # Local (co)variances by E[x^2] - E[x]^2.
    sigma1_sq = (
        F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
    )
    sigma2_sq = (
        F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
    )
    sigma12 = (
        F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel)
        - mu1_mu2
    )

    # Stabilizing constants for a [0, 1] dynamic range.
    C1 = 0.01**2
    C2 = 0.03**2

    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / (
        (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
    )

    if size_average:
        return ssim_map.mean()
    else:
        return ssim_map.mean(1).mean(1).mean(1)


# above for compute batched SSIM


def compute_low_res_psnr(img1, img2, scale_factor):
    """
    PSNR after bilinear downsampling both images by `scale_factor`.

    Args:
        img1: [B, C, H, W]
        img2: [B, C, H, W]
        scale_factor: int
    """
    img1 = F.interpolate(
        img1, scale_factor=1 / scale_factor, mode="bilinear", align_corners=False
    )
    img2 = F.interpolate(
        img2, scale_factor=1 / scale_factor, mode="bilinear", align_corners=False
    )
    return compute_psnr(img1, img2)


def compute_low_res_mse(img1, img2, scale_factor):
    """
    MSE after bilinear downsampling both images by `scale_factor`.

    Args:
        img1: [B, C, H, W]
        img2: [B, C, H, W]
        scale_factor: int
    """
    img1 = F.interpolate(
        img1, scale_factor=1 / scale_factor, mode="bilinear", align_corners=False
    )
    img2 = F.interpolate(
        img2, scale_factor=1 / scale_factor, mode="bilinear", align_corners=False
    )
    loss_mse = F.mse_loss(img1, img2, reduction="mean")
    return loss_mse


# ================================================
# FILE: physdreamer/utils/io_utils.py
# ================================================
import cv2
import imageio
import numpy as np
import mediapy
import os
import PIL


def read_video_cv2(video_path, rgb=True):
    """Read video using cv2, return [T, 3, H, W] array, fps"""
    # cv2 decodes frames as BGR; converted to RGB below unless rgb=False.
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    num_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    ret_list = []
    for i in range(num_frame):
        ret, frame = cap.read()
        if ret:
            if rgb:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = np.transpose(frame, [2, 0, 1])  # [3, H, W]
            ret_list.append(frame[np.newaxis, ...])
        else:
            # Stop early if the container reports more frames than it holds.
            break
    cap.release()
    ret_array = np.concatenate(ret_list, axis=0)  # [T, 3, H, W]
    return ret_array, fps


def save_video_cv2(video_path, img_list, fps):
    # Write [H, W, 3] frames (BGR order expected by cv2) as an mp4.
    if len(img_list) == 0:
        return
    h, w = img_list[0].shape[:2]
    fourcc = cv2.VideoWriter_fourcc(
        *"mp4v"
    )  # cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
    writer = cv2.VideoWriter(video_path, fourcc, fps, (w, h))
    for frame in img_list:
        writer.write(frame)
    writer.release()


def save_video_imageio(video_path, img_list, fps):
    """
    Img_list: [[H, W, 3]]
    """
    if len(img_list) == 0:
        return
    writer = imageio.get_writer(video_path, fps=fps)
    for frame in img_list:
        writer.append_data(frame)
    writer.close()


def save_gif_imageio(video_path, img_list, fps):
    """
    Img_list: [[H, W, 3]]
    """
    if len(img_list) == 0:
        return
    assert video_path.endswith(".gif")
    imageio.mimsave(video_path, img_list, format="GIF", fps=fps)


def save_video_mediapy(video_frames, output_video_path: str = None, fps: int = 14):
    # video_frames: [N, H, W, 3]; accepts PIL images and converts them.
    if isinstance(video_frames[0], PIL.Image.Image):
        video_frames = [np.array(frame) for frame in video_frames]
    os.makedirs(os.path.dirname(output_video_path), exist_ok=True)
    # qp=18 is a near-lossless H.264 quality setting.
    mediapy.write_video(output_video_path, video_frames, fps=fps, qp=18)
# ================================================
# FILE: physdreamer/utils/optimizer.py
# ================================================
import torch
from torch.optim.lr_scheduler import LambdaLR


def get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps, num_training_steps, last_epoch=-1
):
    """
    From diffusers.optimization
    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step: int):
        # Linear ramp 0 -> 1 during warmup, then linear decay 1 -> 0.
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(
            0.0,
            float(num_training_steps - current_step)
            / float(max(1, num_training_steps - num_warmup_steps)),
        )

    return LambdaLR(optimizer, lr_lambda, last_epoch)


# ================================================
# FILE: physdreamer/utils/print_utils.py
# ================================================
import torch.distributed as dist


def print_if_zero_rank(s):
    """Print `s` (prefixed with "### ") exactly once per job.

    Prints when not running under torch.distributed, or when this process is
    rank 0.

    BUGFIX: the original condition was
    `(not dist.is_initialized()) and (dist.is_initialized() and rank == 0)`,
    a contradiction that could never be true, so nothing was ever printed.
    """
    if (not dist.is_initialized()) or dist.get_rank() == 0:
        print("### " + s)


# ================================================
# FILE: physdreamer/utils/pytorch_mssim.py
# ================================================
import torch
import torch.nn.functional as F
from math import exp
import numpy as np

# Global compute device; create_window/create_window_3d build kernels on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def gaussian(window_size, sigma):
    """1-D Gaussian kernel of length `window_size`, normalized to sum to 1."""
    gauss = torch.Tensor([exp(-(x - window_size//2)**2/float(2*sigma**2)) for x in range(window_size)])
    return gauss/gauss.sum()


def create_window(window_size, channel=1):
    """Separable 2-D Gaussian window, [channel, 1, ws, ws], for grouped conv2d."""
    _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
    _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0).to(device)
    window = _2D_window.expand(channel, 1, window_size, window_size).contiguous()
    return window


def create_window_3d(window_size, channel=1):
    """3-D Gaussian window, [1, channel, ws, ws, ws], for volumetric SSIM (conv3d)."""
    _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
    _2D_window = _1D_window.mm(_1D_window.t())
    _3D_window = _2D_window.unsqueeze(2) @ (_1D_window.t())
    window = _3D_window.expand(1, channel, window_size, window_size, window_size).contiguous().to(device)
    return window
def ssim(img1, img2, window_size=11, window=None, size_average=True, full=False, val_range=None):
    """2-D SSIM with replicate padding; returns the mean map (and cs if full)."""
    # Value range can be different from 255. Other common ranges are 1 (sigmoid) and 2 (tanh).
    if val_range is None:
        # Heuristically infer the dynamic range L from img1's values.
        if torch.max(img1) > 128:
            max_val = 255
        else:
            max_val = 1

        if torch.min(img1) < -0.5:
            min_val = -1
        else:
            min_val = 0
        L = max_val - min_val
    else:
        L = val_range

    padd = 0
    (_, channel, height, width) = img1.size()
    if window is None:
        real_size = min(window_size, height, width)
        window = create_window(real_size, channel=channel).to(img1.device)

    # NOTE(review): the replicate pad is hard-coded to 5 (half of an 11-tap
    # window); other window_size values would pad inconsistently — confirm.
    mu1 = F.conv2d(F.pad(img1, (5, 5, 5, 5), mode='replicate'), window, padding=padd, groups=channel)
    mu2 = F.conv2d(F.pad(img2, (5, 5, 5, 5), mode='replicate'), window, padding=padd, groups=channel)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    # Local (co)variances by E[x^2] - E[x]^2.
    sigma1_sq = F.conv2d(F.pad(img1 * img1, (5, 5, 5, 5), 'replicate'), window, padding=padd, groups=channel) - mu1_sq
    sigma2_sq = F.conv2d(F.pad(img2 * img2, (5, 5, 5, 5), 'replicate'), window, padding=padd, groups=channel) - mu2_sq
    sigma12 = F.conv2d(F.pad(img1 * img2, (5, 5, 5, 5), 'replicate'), window, padding=padd, groups=channel) - mu1_mu2

    C1 = (0.01 * L) ** 2
    C2 = (0.03 * L) ** 2

    v1 = 2.0 * sigma12 + C2
    v2 = sigma1_sq + sigma2_sq + C2
    cs = torch.mean(v1 / v2)  # contrast sensitivity

    ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2)

    if size_average:
        ret = ssim_map.mean()
    else:
        ret = ssim_map.mean(1).mean(1).mean(1)

    if full:
        return ret, cs
    return ret


def ssim_matlab(img1, img2, window_size=11, window=None, size_average=True, full=False, val_range=None):
    """
    Args:
        img1, img2: (N, C, H, W)
    """
    # Value range can be different from 255. Other common ranges are 1 (sigmoid) and 2 (tanh).
    if val_range is None:
        if torch.max(img1) > 128:
            max_val = 255
        else:
            max_val = 1

        if torch.min(img1) < -0.5:
            min_val = -1
        else:
            min_val = 0
        L = max_val - min_val
    else:
        L = val_range

    padd = 0
    (_, _, height, width) = img1.size()
    if window is None:
        real_size = min(window_size, height, width)
        window = create_window_3d(real_size, channel=1).to(img1.device)
        # Channel is set to 1 since we consider color images as volumetric images

    img1 = img1.unsqueeze(1)
    img2 = img2.unsqueeze(1)

    # Same pad-5 assumption as ssim(); conv3d treats channels as a volume axis.
    mu1 = F.conv3d(F.pad(img1, (5, 5, 5, 5, 5, 5), mode='replicate'), window, padding=padd, groups=1)
    mu2 = F.conv3d(F.pad(img2, (5, 5, 5, 5, 5, 5), mode='replicate'), window, padding=padd, groups=1)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    sigma1_sq = F.conv3d(F.pad(img1 * img1, (5, 5, 5, 5, 5, 5), 'replicate'), window, padding=padd, groups=1) - mu1_sq
    sigma2_sq = F.conv3d(F.pad(img2 * img2, (5, 5, 5, 5, 5, 5), 'replicate'), window, padding=padd, groups=1) - mu2_sq
    sigma12 = F.conv3d(F.pad(img1 * img2, (5, 5, 5, 5, 5, 5), 'replicate'), window, padding=padd, groups=1) - mu1_mu2

    C1 = (0.01 * L) ** 2
    C2 = (0.03 * L) ** 2

    v1 = 2.0 * sigma12 + C2
    v2 = sigma1_sq + sigma2_sq + C2
    cs = torch.mean(v1 / v2)  # contrast sensitivity

    ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2)

    if size_average:
        ret = ssim_map.mean()
    else:
        ret = ssim_map.mean(1).mean(1).mean(1)

    if full:
        return ret, cs
    return ret


def msssim(img1, img2, window_size=11, size_average=True, val_range=None, normalize=False):
    # Multi-scale SSIM: 5 levels, each level's input is a 2x average pool of the last.
    device = img1.device
    weights = torch.FloatTensor([0.0448, 0.2856, 0.3001, 0.2363, 0.1333]).to(device)
    levels = weights.size()[0]
    mssim = []
    mcs = []
    for _ in range(levels):
        sim, cs = ssim(img1, img2, window_size=window_size, size_average=size_average, full=True, val_range=val_range)
        mssim.append(sim)
        mcs.append(cs)

        img1 = F.avg_pool2d(img1, (2, 2))
        img2 = F.avg_pool2d(img2, (2, 2))

    mssim = torch.stack(mssim)
    mcs = torch.stack(mcs)

    # Normalize (to avoid NaNs during training unstable models, not compliant with original definition)
    if normalize:
        mssim = (mssim + 1) / 2
        mcs = (mcs + 1) / 2

    pow1 = mcs ** weights
    pow2 = mssim ** weights

    # From Matlab implementation https://ece.uwaterloo.ca/~z70wang/research/iwssim/
    # cs terms for the coarse levels times the ssim term of the finest level.
    output = torch.prod(pow1[:-1] * pow2[-1])
    return output


# Classes to re-use window
class SSIM(torch.nn.Module):
    """nn.Module wrapper around ssim() returning DSSIM = (1 - SSIM) / 2."""

    def __init__(self, window_size=11, size_average=True, val_range=None):
        super(SSIM, self).__init__()
        self.window_size = window_size
        self.size_average = size_average
        self.val_range = val_range

        # Assume 3 channel for SSIM
        self.channel = 3
        self.window = create_window(window_size, channel=self.channel)

    def forward(self, img1, img2):
        (_, channel, _, _) = img1.size()

        # Rebuild the cached window if the channel count or dtype changed.
        if channel == self.channel and self.window.dtype == img1.dtype:
            window = self.window
        else:
            window = create_window(self.window_size, channel).to(img1.device).type(img1.dtype)
            self.window = window
            self.channel = channel

        _ssim = ssim(img1, img2, window=window, window_size=self.window_size, size_average=self.size_average)
        dssim = (1 - _ssim) / 2
        return dssim


class MSSSIM(torch.nn.Module):
    """nn.Module wrapper around msssim()."""

    def __init__(self, window_size=11, size_average=True, channel=3):
        super(MSSSIM, self).__init__()
        self.window_size = window_size
        self.size_average = size_average
        self.channel = channel

    def forward(self, img1, img2):
        return msssim(img1, img2, window_size=self.window_size, size_average=self.size_average)


# ================================================
# FILE: physdreamer/utils/svd_helpper.py
# ================================================
from glob import glob
from sys import version
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import torch
import os
from omegaconf import ListConfig, OmegaConf
from safetensors.torch import load_file as load_safetensors
from sgm.inference.helpers import embed_watermark
from sgm.modules.diffusionmodules.guiders import LinearPredictionGuider, VanillaCFG
from sgm.util import append_dims, default, instantiate_from_config
import math
import repeat def init_st(version_dict, load_ckpt=True, load_filter=True): state = dict() if not "model" in state: config = version_dict["config"] ckpt = version_dict["ckpt"] config = OmegaConf.load(config) model, msg = load_model_from_config(config, ckpt if load_ckpt else None) state["msg"] = msg state["model"] = model state["ckpt"] = ckpt if load_ckpt else None state["config"] = config if load_filter: return state # from scripts.util.detection.nsfw_and_watermark_dectection import DeepFloydDataFiltering state["filter"] = DeepFloydDataFiltering(verbose=False) return state def load_model_from_config(config, ckpt=None, verbose=True): model = instantiate_from_config(config.model) if ckpt is not None: print(f"Loading model from {ckpt}") if ckpt.endswith("ckpt"): pl_sd = torch.load(ckpt, map_location="cpu") if "global_step" in pl_sd: global_step = pl_sd["global_step"] print(f"Global Step: {pl_sd['global_step']}") sd = pl_sd["state_dict"] elif ckpt.endswith("safetensors"): sd = load_safetensors(ckpt) else: raise NotImplementedError msg = None m, u = model.load_state_dict(sd, strict=False) if len(m) > 0 and verbose: print("missing keys:") print(m) if len(u) > 0 and verbose: print("unexpected keys:") print(u) else: msg = None model = initial_model_load(model) # model.eval() # ? 
return model, msg def load_model(model): model.cuda() lowvram_mode = False def set_lowvram_mode(mode): global lowvram_mode lowvram_mode = mode def initial_model_load(model): global lowvram_mode if lowvram_mode: model.model.half() else: model.cuda() return model def unload_model(model): global lowvram_mode if lowvram_mode: model.cpu() torch.cuda.empty_cache() def get_unique_embedder_keys_from_conditioner(conditioner): return list(set([x.input_key for x in conditioner.embedders])) def get_batch(keys, value_dict, N, T, device): batch = {} batch_uc = {} for key in keys: if key == "fps_id": batch[key] = ( torch.tensor([value_dict["fps_id"]]) .to(device) .repeat(int(math.prod(N))) ) elif key == "motion_bucket_id": batch[key] = ( torch.tensor([value_dict["motion_bucket_id"]]) .to(device) .repeat(int(math.prod(N))) ) elif key == "cond_aug": batch[key] = repeat( torch.tensor([value_dict["cond_aug"]]).to(device), "1 -> b", b=math.prod(N), ) elif key == "cond_frames": batch[key] = repeat(value_dict["cond_frames"], "1 ... -> b ...", b=N[0]) elif key == "cond_frames_without_noise": batch[key] = repeat( value_dict["cond_frames_without_noise"], "1 ... 
-> b ...", b=N[0] ) else: batch[key] = value_dict[key] if T is not None: batch["num_video_frames"] = T for key in batch.keys(): if key not in batch_uc and isinstance(batch[key], torch.Tensor): batch_uc[key] = torch.clone(batch[key]) return batch, batch_uc if __name__ == "__main__": pass ================================================ FILE: physdreamer/utils/torch_utils.py ================================================ import torch import time def get_sync_time(): if torch.cuda.is_available(): torch.cuda.synchronize() return time.time() ================================================ FILE: physdreamer/warp_mpm/README.md ================================================ This folder is mainly copy paste from https://github.com/zeshunzong/warp-mpm The biggest change is to make some operations during simulation **non-inplace**, and save the intermediate state during simulation, otherwise gradient computed by warp would be wrong. ================================================ FILE: physdreamer/warp_mpm/gaussian_sim_utils.py ================================================ import numpy as np def get_volume(xyzs: np.ndarray, resolution=128) -> np.ndarray: # set a grid in the range of [-1, 1], with resolution voxel_counts = np.zeros((resolution, resolution, resolution)) points_xyzindex = ((xyzs + 1) / 2 * (resolution - 1)).astype(np.uint32) cell_volume = (2.0 / (resolution - 1)) ** 3 for x, y, z in points_xyzindex: voxel_counts[x, y, z] += 1 points_number_in_corresponding_voxel = voxel_counts[ points_xyzindex[:, 0], points_xyzindex[:, 1], points_xyzindex[:, 2] ] points_volume = cell_volume / points_number_in_corresponding_voxel points_volume = points_volume.astype(np.float32) # some statistics num_non_empyt_voxels = np.sum(voxel_counts > 0) max_points_in_voxel = np.max(voxel_counts) min_points_in_voxel = np.min(voxel_counts) avg_points_in_voxel = np.sum(voxel_counts) / num_non_empyt_voxels print("Number of non-empty voxels: ", num_non_empyt_voxels) print("Max points in 
voxel: ", max_points_in_voxel) print("Min points in voxel: ", min_points_in_voxel) print("Avg points in voxel: ", avg_points_in_voxel) return points_volume ================================================ FILE: physdreamer/warp_mpm/mpm_data_structure.py ================================================ import warp as wp import warp.torch import torch from typing import Optional, Union, Sequence, Any from torch import Tensor import os import sys sys.path.append(os.path.dirname(os.path.realpath(__file__))) from warp_utils import from_torch_safe @wp.struct class MPMStateStruct(object): ###### essential ##### # particle particle_x: wp.array(dtype=wp.vec3) # current position particle_v: wp.array(dtype=wp.vec3) # particle velocity particle_F: wp.array(dtype=wp.mat33) # particle elastic deformation gradient particle_cov: wp.array(dtype=float) # current covariance matrix particle_F_trial: wp.array( dtype=wp.mat33 ) # apply return mapping on this to obtain elastic def grad particle_stress: wp.array(dtype=wp.mat33) # Kirchoff stress, elastic stress particle_C: wp.array(dtype=wp.mat33) particle_vol: wp.array(dtype=float) # current volume particle_mass: wp.array(dtype=float) # mass particle_density: wp.array(dtype=float) # density particle_selection: wp.array( dtype=int ) # only particle_selection[p] = 0 will be simulated # grid grid_m: wp.array(dtype=float, ndim=3) grid_v_in: wp.array(dtype=wp.vec3, ndim=3) # grid node momentum/velocity grid_v_out: wp.array( dtype=wp.vec3, ndim=3 ) # grid node momentum/velocity, after grid update def init( self, shape: Union[Sequence[int], int], device: wp.context.Devicelike = None, requires_grad=False, ) -> None: # shape default is int. 
        # number of particles
        # Allocate all per-particle arrays; only the differentiable sim state
        # (x, v, F, F_trial, stress, C) gets requires_grad.
        self.particle_x = wp.zeros(
            shape, dtype=wp.vec3, device=device, requires_grad=requires_grad
        )
        self.particle_v = wp.zeros(
            shape, dtype=wp.vec3, device=device, requires_grad=requires_grad
        )
        self.particle_F = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        # 6 floats per particle: packed upper-triangular covariance.
        self.particle_cov = wp.zeros(
            shape * 6, dtype=float, device=device, requires_grad=False
        )
        self.particle_F_trial = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_stress = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_C = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_vol = wp.zeros(
            shape, dtype=float, device=device, requires_grad=False
        )
        self.particle_mass = wp.zeros(
            shape, dtype=float, device=device, requires_grad=False
        )
        self.particle_density = wp.zeros(
            shape, dtype=float, device=device, requires_grad=False
        )
        self.particle_selection = wp.zeros(
            shape, dtype=int, device=device, requires_grad=False
        )

        # grid: will init later
        # Placeholder 10^3 grid; init_grid() replaces these with the real size.
        self.grid_m = wp.zeros(
            (10, 10, 10), dtype=float, device=device, requires_grad=requires_grad
        )
        self.grid_v_in = wp.zeros(
            (10, 10, 10), dtype=wp.vec3, device=device, requires_grad=requires_grad
        )
        self.grid_v_out = wp.zeros(
            (10, 10, 10), dtype=wp.vec3, device=device, requires_grad=requires_grad
        )

    def init_grid(
        self, grid_res: int, device: wp.context.Devicelike = None, requires_grad=False
    ):
        # Allocate the cubic background grid (mass never needs gradients).
        self.grid_m = wp.zeros(
            (grid_res, grid_res, grid_res),
            dtype=float,
            device=device,
            requires_grad=False,
        )
        self.grid_v_in = wp.zeros(
            (grid_res, grid_res, grid_res),
            dtype=wp.vec3,
            device=device,
            requires_grad=requires_grad,
        )
        self.grid_v_out = wp.zeros(
            (grid_res, grid_res, grid_res),
            dtype=wp.vec3,
            device=device,
            requires_grad=requires_grad,
        )

    def from_torch(
        self,
        tensor_x: Tensor,
        tensor_volume: Tensor,
        tensor_cov: Optional[Tensor] = None,
        tensor_velocity: Optional[Tensor] = None,
        n_grid: int = 100,
        grid_lim=1.0,
        device="cuda:0",
        requires_grad=True,
    ):
        """Initialize particle state (and grid) from torch tensors.

        tensor_x: (N, 3) positions; tensor_volume: (N,) volumes;
        tensor_cov: per-particle covariance (reshaped to 6N floats);
        tensor_velocity: (N, 3). F_trial is set to identity.
        """
        num_dim, n_particles = tensor_x.shape[1], tensor_x.shape[0]
        assert tensor_x.shape[0] == tensor_volume.shape[0]
        # assert tensor_x.shape[0] == tensor_cov.reshape(-1, 6).shape[0]

        self.init_grid(grid_res=n_grid, device=device, requires_grad=requires_grad)
        if tensor_x is not None:
            # from_torch_safe keeps the warp array detached from torch autograd.
            self.particle_x = from_torch_safe(
                tensor_x.contiguous().detach().clone(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )

        if tensor_volume is not None:
            print(self.particle_vol.shape, tensor_volume.shape)
            volume_numpy = tensor_volume.detach().cpu().numpy()
            self.particle_vol = wp.from_numpy(
                volume_numpy, dtype=float, device=device, requires_grad=False
            )

        if tensor_cov is not None:
            cov_numpy = tensor_cov.reshape(-1).detach().clone().cpu().numpy()
            self.particle_cov = wp.from_numpy(
                cov_numpy, dtype=float, device=device, requires_grad=False
            )

        if tensor_velocity is not None:
            self.particle_v = from_torch_safe(
                tensor_velocity.contiguous().detach().clone(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )

        # initial deformation gradient is set to identity
        wp.launch(
            kernel=set_mat33_to_identity,
            dim=n_particles,
            inputs=[self.particle_F_trial],
            device=device,
        )
        # initial trial deformation gradient is set to identity

        print("Particles initialized from torch data.")
        print("Total particles: ", n_particles)

    def reset_state(
        self,
        tensor_x: Tensor,
        tensor_cov: Optional[Tensor] = None,
        tensor_velocity: Optional[Tensor] = None,
        tensor_density: Optional[Tensor] = None,
        selection_mask: Optional[Tensor] = None,
        device="cuda:0",
        requires_grad=True,
    ):
        """Reset particle state for a fresh rollout (x, v, C, F, F_trial, stress).

        Density is only updated where selection_mask == 1.
        """
        # reset p_c, p_v, p_C, p_F_trial
        num_dim, n_particles = tensor_x.shape[1], tensor_x.shape[0]
        # assert tensor_x.shape[0] == tensor_cov.reshape(-1, 6).shape[0]

        if tensor_x is not None:
            self.particle_x = from_torch_safe(
                tensor_x.contiguous().detach(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )

        if tensor_cov is not None:
            cov_numpy = tensor_cov.reshape(-1).detach().clone().cpu().numpy()
            self.particle_cov = wp.from_numpy(
                cov_numpy, dtype=float, device=device, requires_grad=False
            )

        if tensor_velocity is not None:
            self.particle_v = from_torch_safe(
                tensor_velocity.contiguous().detach().clone(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )

        if tensor_density is not None and selection_mask is not None:
            wp_density = from_torch_safe(
                tensor_density.contiguous().detach().clone(),
                dtype=wp.float32,
                requires_grad=False,
            )

            # 1 indicate we need to simulate this particle
            wp_selection_mask = from_torch_safe(
                selection_mask.contiguous().detach().clone().type(torch.int),
                dtype=wp.int32,
                requires_grad=False,
            )

            wp.launch(
                kernel=set_float_vec_to_vec_wmask,
                dim=n_particles,
                inputs=[self.particle_density, wp_density, wp_selection_mask],
                device=device,
            )

        # initial deformation gradient is set to identity
        wp.launch(
            kernel=set_mat33_to_identity,
            dim=n_particles,
            inputs=[self.particle_F_trial],
            device=device,
        )
        wp.launch(
            kernel=set_mat33_to_identity,
            dim=n_particles,
            inputs=[self.particle_F],
            device=device,
        )
        wp.launch(
            kernel=set_mat33_to_zero,
            dim=n_particles,
            inputs=[self.particle_C],
            device=device,
        )
        wp.launch(
            kernel=set_mat33_to_zero,
            dim=n_particles,
            inputs=[self.particle_stress],
            device=device,
        )

    def continue_from_torch(
        self,
        tensor_x: Tensor,
        tensor_velocity: Optional[Tensor] = None,
        tensor_F: Optional[Tensor] = None,
        tensor_C: Optional[Tensor] = None,
        device="cuda:0",
        requires_grad=True,
    ):
        """Overwrite x / v / F_trial / C from torch tensors to resume a rollout.

        NOTE(review): tensor_F is written into particle_F_trial (not particle_F)
        — presumably because the return mapping reads F_trial; confirm.
        """
        if tensor_x is not None:
            self.particle_x = from_torch_safe(
                tensor_x.contiguous().detach(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )
        if tensor_velocity is not None:
            self.particle_v = from_torch_safe(
                tensor_velocity.contiguous().detach().clone(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )
        if tensor_F is not None:
            self.particle_F_trial = from_torch_safe(
                tensor_F.contiguous().detach().clone(),
                dtype=wp.mat33,
                requires_grad=requires_grad,
            )
        if tensor_C is not None:
            self.particle_C = from_torch_safe(
                tensor_C.contiguous().detach().clone(),
                dtype=wp.mat33,
                requires_grad=requires_grad,
            )

    def set_require_grad(self, requires_grad=True):
        # Toggle gradients on every differentiable array in one call.
        self.particle_x.requires_grad = requires_grad
        self.particle_v.requires_grad = requires_grad
        self.particle_F.requires_grad = requires_grad
        self.particle_F_trial.requires_grad = requires_grad
        self.particle_stress.requires_grad = requires_grad
        self.particle_C.requires_grad = requires_grad

        self.grid_v_out.requires_grad = requires_grad
        self.grid_v_in.requires_grad = requires_grad

    def reset_density(
        self,
        tensor_density: Tensor,
        selection_mask: Optional[Tensor] = None,
        device="cuda:0",
        requires_grad=True,
        update_mass=False,
    ):
        """Overwrite per-particle density (optionally masked), and optionally
        recompute mass = density * volume."""
        n_particles = tensor_density.shape[0]

        if tensor_density is not None:
            wp_density = from_torch_safe(
                tensor_density.contiguous().detach().clone(),
                dtype=wp.float32,
                requires_grad=False,
            )

            if selection_mask is not None:
                # 1 indicate we need to simulate this particle
                wp_selection_mask = from_torch_safe(
                    selection_mask.contiguous().detach().clone().type(torch.int),
                    dtype=wp.int32,
                    requires_grad=False,
                )

                wp.launch(
                    kernel=set_float_vec_to_vec_wmask,
                    dim=n_particles,
                    inputs=[self.particle_density, wp_density, wp_selection_mask],
                    device=device,
                )
            else:
                wp.launch(
                    kernel=set_float_vec_to_vec,
                    dim=n_particles,
                    inputs=[self.particle_density, wp_density],
                    device=device,
                )

        if update_mass:
            num_particles = self.particle_x.shape[0]
            # mass = density * volume, elementwise.
            wp.launch(
                kernel=get_float_array_product,
                dim=num_particles,
                inputs=[
                    self.particle_density,
                    self.particle_vol,
                    self.particle_mass,
                ],
                device=device,
            )

    def partial_clone(self, device="cuda:0", requires_grad=True):
        """Create a new state sharing static quantities (vol/density/mass/selection
        copied) with freshly zeroed dynamic state — used to build per-step states
        so the simulation stays non-inplace for correct warp gradients."""
        new_state = MPMStateStruct()
        n_particles = self.particle_x.shape[0]
        new_state.init(n_particles, device=device, requires_grad=requires_grad)

        # clone section:
        # new_state.particle_vol = wp.clone(self.particle_vol, requires_grad=False)
        # new_state.particle_density = wp.clone(self.particle_density, requires_grad=False)
        # new_state.particle_mass = wp.clone(self.particle_mass, requires_grad=False)
        # new_state.particle_selection = wp.clone(self.particle_selection, requires_grad=False)
        # Copy static per-particle quantities into the clone (no grads needed).
        wp.copy(new_state.particle_vol, self.particle_vol)
        wp.copy(new_state.particle_density, self.particle_density)
        wp.copy(new_state.particle_mass, self.particle_mass)
        wp.copy(new_state.particle_selection, self.particle_selection)

        # init grid to zero with grid res.
        new_state.init_grid(
            grid_res=self.grid_v_in.shape[0], device=device, requires_grad=requires_grad
        )

        # init some matrix to identity
        wp.launch(
            kernel=set_mat33_to_identity,
            dim=n_particles,
            inputs=[new_state.particle_F_trial],
            device=device,
        )
        new_state.set_require_grad(requires_grad=requires_grad)

        return new_state


@wp.struct
class MPMModelStruct(object):
    ####### essential #######
    grid_lim: float          # grid spans [0, grid_lim] per axis
    n_particles: int
    n_grid: int
    dx: float                # grid cell size = grid_lim / n_grid
    inv_dx: float
    grid_dim_x: int
    grid_dim_y: int
    grid_dim_z: int
    # Per-particle Lame parameters, derived from E and nu (finalize_mu_lam).
    mu: wp.array(dtype=float)
    lam: wp.array(dtype=float)
    E: wp.array(dtype=float)
    nu: wp.array(dtype=float)
    material: int

    ######## for plasticity ####
    yield_stress: wp.array(dtype=float)
    friction_angle: float
    alpha: float
    gravitational_accelaration: wp.vec3
    hardening: float
    xi: float
    plastic_viscosity: float
    softening: float

    ####### for damping
    rpic_damping: float
    grid_v_damping_scale: float

    ####### for PhysGaussian: covariance
    update_cov_with_F: int

    def init(
        self,
        shape: Union[Sequence[int], int],
        device: wp.context.Devicelike = None,
        requires_grad=False,
    ) -> None:
        # Allocate per-particle material parameter arrays (E/nu learnable).
        self.E = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )  # young's modulus
        self.nu = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )  # poisson's ratio

        self.mu = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )
        self.lam = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )

        self.yield_stress = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )

    def finalize_mu_lam(self, n_particles, device="cuda:0"):
        # Derive (mu, lam) from (E, nu) for every particle.
        wp.launch(
            kernel=compute_mu_lam_from_E_nu_clean,
            dim=n_particles,
            inputs=[self.mu, self.lam, self.E, self.nu],
            device=device,
        )

    def init_other_params(self, n_grid=100, grid_lim=1.0, device="cuda:0"):
        """Set grid geometry and default simulation constants."""
        self.grid_lim = grid_lim
        self.n_grid = n_grid
        self.grid_dim_x = n_grid
        self.grid_dim_y = n_grid
        self.grid_dim_z = n_grid
        (
            self.dx,
            self.inv_dx,
        ) = self.grid_lim / self.n_grid, float(
            n_grid / grid_lim
        )  # [0-1]?

        self.update_cov_with_F = False

        # material is used to switch between different elastoplastic models. 0 is jelly
        self.material = 0

        self.plastic_viscosity = 0.0
        self.softening = 0.1
        self.friction_angle = 25.0
        # Drucker-Prager alpha from the friction angle.
        sin_phi = wp.sin(self.friction_angle / 180.0 * 3.14159265)
        self.alpha = wp.sqrt(2.0 / 3.0) * 2.0 * sin_phi / (3.0 - sin_phi)

        # Gravity disabled by default.
        self.gravitational_accelaration = wp.vec3(0.0, 0.0, 0.0)

        self.rpic_damping = 0.0  # 0.0 if no damping (apic). -1 if pic
        # > 1.0 means grid damping is off (solver only damps when scale < 1.0).
        self.grid_v_damping_scale = 1.1  # globally applied

    def from_torch(
        self, tensor_E: Tensor, tensor_nu: Tensor, device="cuda:0", requires_grad=False
    ):
        # Alias E/nu from torch tensors, then recompute mu/lam.
        self.E = wp.from_torch(tensor_E.contiguous(), requires_grad=requires_grad)
        self.nu = wp.from_torch(tensor_nu.contiguous(), requires_grad=requires_grad)

        n_particles = tensor_E.shape[0]
        self.finalize_mu_lam(n_particles=n_particles, device=device)

    def set_require_grad(self, requires_grad=True):
        self.E.requires_grad = requires_grad
        self.nu.requires_grad = requires_grad
        self.mu.requires_grad = requires_grad
        self.lam.requires_grad = requires_grad


# for various boundary conditions
@wp.struct
class Dirichlet_collider:
    # Generic parameter bag shared by all Dirichlet-type grid colliders;
    # which fields are meaningful depends on surface_type.
    point: wp.vec3
    normal: wp.vec3
    direction: wp.vec3

    start_time: float
    end_time: float

    friction: float
    surface_type: int

    velocity: wp.vec3

    threshold: float
    reset: int
    index: int

    x_unit: wp.vec3
    y_unit: wp.vec3
    radius: float
    v_scale: float
    width: float
    height: float
    length: float
    R: float

    size: wp.vec3

    horizontal_axis_1: wp.vec3
    horizontal_axis_2: wp.vec3
    half_height_and_radius: wp.vec2


@wp.struct
class GridCollider:
    # Collider defined by a per-cell mask over the grid.
    point: wp.vec3
    normal: wp.vec3
    direction: wp.vec3

    start_time: float
    end_time: float

    mask: wp.array(dtype=int, ndim=3)


@wp.struct
class Impulse_modifier:
    # this needs to be changed for
    # each different BC!
    point: wp.vec3
    normal: wp.vec3
    start_time: float
    end_time: float
    force: wp.vec3
    forceTimesDt: wp.vec3
    numsteps: int

    # NOTE(review): `point` is declared twice in this struct — the second
    # declaration shadows the first; probably a copy-paste leftover.
    point: wp.vec3
    size: wp.vec3
    mask: wp.array(dtype=int)


@wp.struct
class MPMtailoredStruct:
    # this needs to be changed for each different BC!
    point: wp.vec3
    normal: wp.vec3
    start_time: float
    end_time: float
    friction: float
    surface_type: int
    velocity: wp.vec3
    threshold: float
    reset: int

    point_rotate: wp.vec3
    normal_rotate: wp.vec3
    x_unit: wp.vec3
    y_unit: wp.vec3
    radius: float
    v_scale: float
    width: float
    point_plane: wp.vec3
    normal_plane: wp.vec3
    velocity_plane: wp.vec3
    threshold_plane: float


@wp.struct
class MaterialParamsModifier:
    # Overrides material parameters inside an axis-aligned box region.
    point: wp.vec3
    size: wp.vec3
    E: float
    nu: float
    density: float


@wp.struct
class ParticleVelocityModifier:
    # Parameter bag for kernels that pin/override particle velocities.
    point: wp.vec3
    normal: wp.vec3
    half_height_and_radius: wp.vec2
    rotation_scale: float
    translation_scale: float

    size: wp.vec3

    horizontal_axis_1: wp.vec3
    horizontal_axis_2: wp.vec3

    start_time: float

    end_time: float

    velocity: wp.vec3

    mask: wp.array(dtype=int)


@wp.kernel
def compute_mu_lam_from_E_nu_clean(
    mu: wp.array(dtype=float),
    lam: wp.array(dtype=float),
    E: wp.array(dtype=float),
    nu: wp.array(dtype=float),
):
    # Standard Lame parameters from Young's modulus and Poisson's ratio.
    p = wp.tid()
    mu[p] = E[p] / (2.0 * (1.0 + nu[p]))
    lam[p] = E[p] * nu[p] / ((1.0 + nu[p]) * (1.0 - 2.0 * nu[p]))


@wp.kernel
def set_vec3_to_zero(target_array: wp.array(dtype=wp.vec3)):
    tid = wp.tid()
    target_array[tid] = wp.vec3(0.0, 0.0, 0.0)


@wp.kernel
def set_vec3_to_vec3(
    source_array: wp.array(dtype=wp.vec3), target_array: wp.array(dtype=wp.vec3)
):
    # NOTE(review): parameter names are reversed w.r.t. the data flow —
    # this writes target_array INTO source_array (source is the destination).
    # Callers pass (dst, src) positionally, which matches the body.
    tid = wp.tid()
    source_array[tid] = target_array[tid]


@wp.kernel
def set_float_vec_to_vec_wmask(
    source_array: wp.array(dtype=float),
    target_array: wp.array(dtype=float),
    selection_mask: wp.array(dtype=int),
):
    # Masked copy: same reversed naming as set_vec3_to_vec3 —
    # source_array is the destination; only mask == 1 entries are written.
    tid = wp.tid()
    if selection_mask[tid] == 1:
        source_array[tid] = target_array[tid]


@wp.kernel
def set_float_vec_to_vec(
    source_array: wp.array(dtype=float), target_array: wp.array(dtype=float)
):
    # Unmasked copy; source_array is the destination (see note above).
    tid = wp.tid()
    source_array[tid] = target_array[tid]


@wp.kernel
def set_mat33_to_identity(target_array: wp.array(dtype=wp.mat33)):
    tid = wp.tid()
    target_array[tid] = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)


@wp.kernel
def set_mat33_to_zero(target_array: wp.array(dtype=wp.mat33)):
    tid = wp.tid()
    target_array[tid] = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)


@wp.kernel
def add_identity_to_mat33(target_array: wp.array(dtype=wp.mat33)):
    tid = wp.tid()
    target_array[tid] = wp.add(
        target_array[tid], wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
    )


@wp.kernel
def subtract_identity_to_mat33(target_array: wp.array(dtype=wp.mat33)):
    tid = wp.tid()
    target_array[tid] = wp.sub(
        target_array[tid], wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
    )


@wp.kernel
def add_vec3_to_vec3(
    first_array: wp.array(dtype=wp.vec3), second_array: wp.array(dtype=wp.vec3)
):
    # In-place elementwise: first_array += second_array.
    tid = wp.tid()
    first_array[tid] = wp.add(first_array[tid], second_array[tid])


@wp.kernel
def set_value_to_float_array(target_array: wp.array(dtype=float), value: float):
    # Broadcast a scalar to every element.
    tid = wp.tid()
    target_array[tid] = value


@wp.kernel
def set_warpvalue_to_float_array(
    target_array: wp.array(dtype=float), value: warp.types.float32
):
    tid = wp.tid()
    target_array[tid] = value


@wp.kernel
def get_float_array_product(
    arrayA: wp.array(dtype=float),
    arrayB: wp.array(dtype=float),
    arrayC: wp.array(dtype=float),
):
    # arrayC = arrayA * arrayB, elementwise.
    tid = wp.tid()
    arrayC[tid] = arrayA[tid] * arrayB[tid]


def torch2warp_quat(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"):
    """Zero-copy alias of a contiguous (N, 4) torch tensor as a wp.quat array.

    Keeps a reference to the torch tensor on the array (a.tensor) so the
    underlying storage outlives the view. `copy`/`dtype` params are unused.
    """
    assert t.is_contiguous()
    if t.dtype != torch.float32 and t.dtype != torch.int32:
        raise RuntimeError(
            "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type"
        )
    assert t.shape[1] == 4
    a = warp.types.array(
        ptr=t.data_ptr(),
        dtype=wp.quat,
        shape=t.shape[0],
        copy=False,
        owner=False,
        requires_grad=t.requires_grad,
        # device=t.device.type)
        device=dvc,
    )
    a.tensor = t
    return a


def torch2warp_float(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"):
    """Zero-copy alias of a contiguous (N,) torch tensor as a float warp array."""
    assert t.is_contiguous()
    if t.dtype != torch.float32 and t.dtype != torch.int32:
        raise RuntimeError(
            "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type"
        )
    a = warp.types.array(
        ptr=t.data_ptr(),
        dtype=warp.types.float32,
        shape=t.shape[0],
        copy=False,
        owner=False,
        requires_grad=t.requires_grad,
        # device=t.device.type)
        device=dvc,
    )
    a.tensor = t
    return a


def torch2warp_vec3(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"):
    """Zero-copy alias of a contiguous (N, 3) torch tensor as a wp.vec3 array."""
    assert t.is_contiguous()
    if t.dtype != torch.float32 and t.dtype != torch.int32:
        raise RuntimeError(
            "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type"
        )
    assert t.shape[1] == 3
    a = warp.types.array(
        ptr=t.data_ptr(),
        dtype=wp.vec3,
        shape=t.shape[0],
        copy=False,
        owner=False,
        requires_grad=t.requires_grad,
        # device=t.device.type)
        device=dvc,
    )
    a.tensor = t
    return a


def torch2warp_mat33(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"):
    """Zero-copy alias of a contiguous (N, 3, 3) torch tensor as a wp.mat33 array."""
    assert t.is_contiguous()
    if t.dtype != torch.float32 and t.dtype != torch.int32:
        raise RuntimeError(
            "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type"
        )
    assert t.shape[1] == 3
    a = warp.types.array(
        ptr=t.data_ptr(),
        dtype=wp.mat33,
        shape=t.shape[0],
        copy=False,
        owner=False,
        requires_grad=t.requires_grad,
        # device=t.device.type)
        device=dvc,
    )
    a.tensor = t
    return a


================================================
FILE: physdreamer/warp_mpm/mpm_solver_diff.py
================================================
import sys
import os
import warp as wp

sys.path.append(os.path.dirname(os.path.realpath(__file__)))
from mpm_data_structure import *
from mpm_utils import *
from typing import Optional, Union, Sequence, Any, Tuple
from jaxtyping import Float, Int, Shaped


class MPMWARPDiff(object):
    # Differentiable MPM solver driver: owns boundary-condition callbacks and
    # launches warp kernels over externally held MPMStateStruct/MPMModelStruct.
    # def __init__(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"):
    #     self.initialize(n_particles, n_grid, grid_lim, device=device)
    #     self.time_profile = {}

    def __init__(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"):
        self.initialize(n_particles, n_grid, grid_lim, device=device)
        self.time_profile = {}

    def initialize(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"):
        # Reset solver time and all registered boundary-condition hooks.
        self.n_particles = n_particles

        self.time = 0.0

        self.grid_postprocess = []      # grid-level BC kernels, run after grid update
        self.collider_params = []       # per-BC parameter structs (parallel list)
        self.modify_bc = []             # optional per-BC python callbacks

        self.tailored_struct_for_bc = MPMtailoredStruct()
        self.pre_p2g_operations = []    # particle-level kernels run before p2g
        self.impulse_params = []

        self.particle_velocity_modifiers = []
        self.particle_velocity_modifier_params = []

    # must give density.
mass will be updated as density * volume def set_parameters(self, device="cuda:0", **kwargs): self.set_parameters_dict(device, kwargs) def set_parameters_dict(self, mpm_model, mpm_state, kwargs={}, device="cuda:0"): if "material" in kwargs: if kwargs["material"] == "jelly": mpm_model.material = 0 elif kwargs["material"] == "metal": mpm_model.material = 1 elif kwargs["material"] == "sand": mpm_model.material = 2 elif kwargs["material"] == "foam": mpm_model.material = 3 elif kwargs["material"] == "snow": mpm_model.material = 4 elif kwargs["material"] == "plasticine": mpm_model.material = 5 elif kwargs["material"] == "neo-hookean": mpm_model.material = 6 else: raise TypeError("Undefined material type") if "yield_stress" in kwargs: val = kwargs["yield_stress"] wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[mpm_model.yield_stress, val], device=device, ) if "hardening" in kwargs: mpm_model.hardening = kwargs["hardening"] if "xi" in kwargs: mpm_model.xi = kwargs["xi"] if "friction_angle" in kwargs: mpm_model.friction_angle = kwargs["friction_angle"] sin_phi = wp.sin(mpm_model.friction_angle / 180.0 * 3.14159265) mpm_model.alpha = wp.sqrt(2.0 / 3.0) * 2.0 * sin_phi / (3.0 - sin_phi) if "g" in kwargs: mpm_model.gravitational_accelaration = wp.vec3( kwargs["g"][0], kwargs["g"][1], kwargs["g"][2] ) if "density" in kwargs: density_value = kwargs["density"] wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[mpm_state.particle_density, density_value], device=device, ) wp.launch( kernel=get_float_array_product, dim=self.n_particles, inputs=[ mpm_state.particle_density, mpm_state.particle_vol, mpm_state.particle_mass, ], device=device, ) if "rpic_damping" in kwargs: mpm_model.rpic_damping = kwargs["rpic_damping"] if "plastic_viscosity" in kwargs: mpm_model.plastic_viscosity = kwargs["plastic_viscosity"] if "softening" in kwargs: mpm_model.softening = kwargs["softening"] if "grid_v_damping_scale" in kwargs: 
mpm_model.grid_v_damping_scale = kwargs["grid_v_damping_scale"] def set_E_nu(self, mpm_model, E: float, nu: float, device="cuda:0"): if isinstance(E, float): wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[mpm_model.E, E], device=device, ) else: # E is warp array wp.launch( kernel=set_float_vec_to_vec, dim=self.n_particles, inputs=[mpm_model.E, E], device=device, ) if isinstance(nu, float): wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[mpm_model.nu, nu], device=device, ) else: wp.launch( kernel=set_float_vec_to_vec, dim=self.n_particles, inputs=[mpm_model.nu, nu], device=device, ) def set_E_nu_from_torch( self, mpm_model, E: Float[Tensor, "n"] | Float[Tensor, "1"], nu: Float[Tensor, "n"] | Float[Tensor, "1"], device="cuda:0", ): if E.ndim == 0: E_inp = E.item() # float else: E_inp = from_torch_safe(E, dtype=wp.float32, requires_grad=True) if nu.ndim == 0: nu_inp = nu.item() # float else: nu_inp = from_torch_safe(nu, dtype=wp.float32, requires_grad=True) self.set_E_nu(mpm_model, E_inp, nu_inp, device=device) def prepare_mu_lam(self, mpm_model, mpm_state, device="cuda:0"): # compute mu and lam from E and nu wp.launch( kernel=compute_mu_lam_from_E_nu, dim=self.n_particles, inputs=[mpm_state, mpm_model], device=device, ) def p2g2p_differentiable( self, mpm_model, mpm_state, next_state, dt, device="cuda:0" ): """ Some boundary conditions, might not give gradient, see kernels in self.pre_p2g_operations, Usually None. self.particle_velocity_modifiers. Mostly used to freeze points self.grid_postprocess, Should apply BC here """ grid_size = ( mpm_model.grid_dim_x, mpm_model.grid_dim_y, mpm_model.grid_dim_z, ) wp.launch( kernel=zero_grid, # gradient might gone dim=(grid_size), inputs=[mpm_state, mpm_model], device=device, ) # apply pre-p2g operations on particles # apply impulse force on particles.. 
        # Run registered pre-p2g particle kernels (e.g. impulse forces).
        for k in range(len(self.pre_p2g_operations)):
            wp.launch(
                kernel=self.pre_p2g_operations[k],
                dim=self.n_particles,
                inputs=[self.time, dt, mpm_state, self.impulse_params[k]],
                device=device,
            )

        # apply dirichlet particle v modifier
        for k in range(len(self.particle_velocity_modifiers)):
            wp.launch(
                kernel=self.particle_velocity_modifiers[k],
                dim=self.n_particles,
                inputs=[
                    self.time,
                    mpm_state,
                    self.particle_velocity_modifier_params[k],
                ],
                device=device,
            )

        # compute stress = stress(returnMap(F_trial))
        # F_trail => F # TODO: this is overite..
        # F, SVD(F), lam, mu => Stress. # TODO: this is overite..
        with wp.ScopedTimer(
            "compute_stress_from_F_trial",
            synchronize=True,
            print=False,
            dict=self.time_profile,
        ):
            wp.launch(
                kernel=compute_stress_from_F_trial,
                dim=self.n_particles,
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )  # F and stress are updated

        # p2g
        with wp.ScopedTimer(
            "p2g",
            synchronize=True,
            print=False,
            dict=self.time_profile,
        ):
            wp.launch(
                kernel=p2g_apic_with_stress,
                dim=self.n_particles,
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )  # apply p2g'

        # grid update
        with wp.ScopedTimer(
            "grid_update", synchronize=True, print=False, dict=self.time_profile
        ):
            wp.launch(
                kernel=grid_normalization_and_gravity,
                dim=(grid_size),
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )

        # Optional global grid damping (only active when scale < 1.0).
        if mpm_model.grid_v_damping_scale < 1.0:
            wp.launch(
                kernel=add_damping_via_grid,
                dim=(grid_size),
                inputs=[mpm_state, mpm_model.grid_v_damping_scale],
                device=device,
            )

        # apply BC on grid, collide
        with wp.ScopedTimer(
            "apply_BC_on_grid", synchronize=True, print=False, dict=self.time_profile
        ):
            for k in range(len(self.grid_postprocess)):
                wp.launch(
                    kernel=self.grid_postprocess[k],
                    dim=grid_size,
                    inputs=[
                        self.time,
                        dt,
                        mpm_state,
                        mpm_model,
                        self.collider_params[k],
                    ],
                    device=device,
                )
                if self.modify_bc[k] is not None:
                    self.modify_bc[k](self.time, dt, self.collider_params[k])

        # g2p
        # Writes the advanced particle state into next_state (non-inplace so
        # warp's reverse pass stays correct).
        with wp.ScopedTimer(
            "g2p", synchronize=True, print=False, dict=self.time_profile
        ):
            wp.launch(
                kernel=g2p_differentiable,
                dim=self.n_particles,
                inputs=[mpm_state, next_state, mpm_model, dt],
                device=device,
            )  # x, v, C, F_trial are updated

        self.time = self.time + dt

    def p2g2p(self, mpm_model, mpm_state, step, dt, device="cuda:0"):
        # Non-differentiable single MPM step that updates mpm_state in place.
        # NOTE(review): `step` is unused in the visible body — presumably kept
        # for interface symmetry with external callers.
        grid_size = (
            mpm_model.grid_dim_x,
            mpm_model.grid_dim_y,
            mpm_model.grid_dim_z,
        )
        wp.launch(
            kernel=zero_grid,  # gradient might gone
            dim=(grid_size),
            inputs=[mpm_state, mpm_model],
            device=device,
        )

        # apply pre-p2g operations on particles
        # apply impulse force on particles..
        for k in range(len(self.pre_p2g_operations)):
            wp.launch(
                kernel=self.pre_p2g_operations[k],
                dim=self.n_particles,
                inputs=[self.time, dt, mpm_state, self.impulse_params[k]],
                device=device,
            )

        # apply dirichlet particle v modifier
        for k in range(len(self.particle_velocity_modifiers)):
            wp.launch(
                kernel=self.particle_velocity_modifiers[k],
                dim=self.n_particles,
                inputs=[
                    self.time,
                    mpm_state,
                    self.particle_velocity_modifier_params[k],
                ],
                device=device,
            )

        # compute stress = stress(returnMap(F_trial))
        # F_trail => F # TODO: this is overite..
        # F, SVD(F), lam, mu => Stress. # TODO: this is overite..
        with wp.ScopedTimer(
            "compute_stress_from_F_trial",
            synchronize=True,
            print=False,
            dict=self.time_profile,
        ):
            wp.launch(
                kernel=compute_stress_from_F_trial,
                dim=self.n_particles,
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )  # F and stress are updated

        # p2g
        with wp.ScopedTimer(
            "p2g",
            synchronize=True,
            print=False,
            dict=self.time_profile,
        ):
            wp.launch(
                kernel=p2g_apic_with_stress,
                dim=self.n_particles,
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )  # apply p2g'

        # grid update
        with wp.ScopedTimer(
            "grid_update", synchronize=True, print=False, dict=self.time_profile
        ):
            wp.launch(
                kernel=grid_normalization_and_gravity,
                dim=(grid_size),
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )

        # Optional global grid damping (only active when scale < 1.0).
        if mpm_model.grid_v_damping_scale < 1.0:
            wp.launch(
                kernel=add_damping_via_grid,
                dim=(grid_size),
                inputs=[mpm_state, mpm_model.grid_v_damping_scale],
                device=device,
            )

        # apply BC on grid, collide
        with wp.ScopedTimer(
            "apply_BC_on_grid", synchronize=True, print=False, dict=self.time_profile
        ):
            for k in range(len(self.grid_postprocess)):
                wp.launch(
                    kernel=self.grid_postprocess[k],
                    dim=grid_size,
                    inputs=[
                        self.time,
                        dt,
                        mpm_state,
                        mpm_model,
                        self.collider_params[k],
                    ],
                    device=device,
                )
                if self.modify_bc[k] is not None:
                    self.modify_bc[k](self.time, dt, self.collider_params[k])

        # g2p
        with wp.ScopedTimer(
            "g2p", synchronize=True, print=False, dict=self.time_profile
        ):
            wp.launch(
                kernel=g2p,
                dim=self.n_particles,
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )  # x, v, C, F_trial are updated

        #### CFL check ####
        # particle_v = self.mpm_state.particle_v.numpy()
        # if np.max(np.abs(particle_v)) > self.mpm_model.dx / dt:
        #     print("max particle v: ", np.max(np.abs(particle_v)))
        #     print("max allowed v: ", self.mpm_model.dx / dt)
        #     print("does not allow v*dt>dx")
        #     input()
        #### CFL check ####

        # Keep particles inside the valid grid domain.
        with wp.ScopedTimer(
            "clip_particle_x", synchronize=True, print=False, dict=self.time_profile
        ):
            wp.launch(
                kernel=clip_particle_x,
                dim=self.n_particles,
                inputs=[mpm_state, mpm_model],
                device=device,
            )

        self.time = self.time + dt

    def print_time_profile(self):
        # Dump accumulated per-section timings collected by wp.ScopedTimer.
        print("MPM Time profile:")
        for key, value in self.time_profile.items():
            print(key, sum(value))

    # a surface specified by a point and the normal vector
    def add_surface_collider(
        self,
        point,
        normal,
        surface="sticky",
        friction=0.0,
        start_time=0.0,
        end_time=999.0,
    ):
        """Register a half-space grid collider defined by a point and normal.

        surface: "sticky" (zero velocity), "slip", "cut", or frictional
        (anything else). friction must be 0 for sticky surfaces.
        """
        point = list(point)
        # Normalize normal
        normal_scale = 1.0 / wp.sqrt(float(sum(x**2 for x in normal)))
        normal = list(normal_scale * x for x in normal)

        collider_param = Dirichlet_collider()
        collider_param.start_time = start_time
        collider_param.end_time = end_time

        collider_param.point = wp.vec3(point[0], point[1], point[2])
        collider_param.normal = wp.vec3(normal[0], normal[1], normal[2])

        if surface == "sticky" and friction != 0:
            raise ValueError("friction must be 0 on sticky surfaces.")
        if surface == "sticky":
            collider_param.surface_type = 0
        elif surface == "slip":
            collider_param.surface_type = 1
        elif surface == "cut":
            collider_param.surface_type = 11
        else:
            collider_param.surface_type = 2
        # frictional
        collider_param.friction = friction

        self.collider_params.append(collider_param)

        @wp.kernel
        def collide(
            time: float,
            dt: float,
            state: MPMStateStruct,
            model: MPMModelStruct,
            param: Dirichlet_collider,
        ):
            # Grid-level BC: zero / project grid velocities behind the plane
            # while the collider is active. (Definition continues past this chunk.)
            grid_x, grid_y, grid_z = wp.tid()
            if time >= param.start_time and time < param.end_time:
                offset = wp.vec3(
                    float(grid_x) * model.dx - param.point[0],
                    float(grid_y) * model.dx - param.point[1],
                    float(grid_z) * model.dx - param.point[2],
                )
                n = wp.vec3(param.normal[0], param.normal[1], param.normal[2])
                dotproduct = wp.dot(offset, n)

                if dotproduct < 0.0:
                    if param.surface_type == 0:
                        state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                            0.0, 0.0, 0.0
                        )
                    elif param.surface_type == 11:
                        # "cut": zero velocity outside a fixed z-band, damp inside it.
                        if (
                            float(grid_z) * model.dx < 0.4
                            or float(grid_z) * model.dx > 0.53
                        ):
                            state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                                0.0, 0.0, 0.0
                            )
                        else:
                            v_in = state.grid_v_out[grid_x, grid_y, grid_z]
                            state.grid_v_out[grid_x, grid_y, grid_z] = (
                                wp.vec3(v_in[0], 0.0, v_in[2]) * 0.3
                            )
                    else:
v = state.grid_v_out[grid_x, grid_y, grid_z] normal_component = wp.dot(v, n) if param.surface_type == 1: v = ( v - normal_component * n ) # Project out all normal component else: v = ( v - wp.min(normal_component, 0.0) * n ) # Project out only inward normal component if normal_component < 0.0 and wp.length(v) > 1e-20: v = wp.max( 0.0, wp.length(v) + normal_component * param.friction ) * wp.normalize( v ) # apply friction here state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( 0.0, 0.0, 0.0 ) self.grid_postprocess.append(collide) self.modify_bc.append(None) # a cubiod is a rectangular cube' # centered at `point` # dimension is x: point[0]±size[0] # y: point[1]±size[1] # z: point[2]±size[2] # all grid nodes lie within the cubiod will have their speed set to velocity # the cuboid itself is also moving with const speed = velocity # set the speed to zero to fix BC def set_velocity_on_cuboid( self, point, size, velocity, start_time=0.0, end_time=999.0, reset=0, ): point = list(point) collider_param = Dirichlet_collider() collider_param.start_time = start_time collider_param.end_time = end_time collider_param.point = wp.vec3(point[0], point[1], point[2]) collider_param.size = size collider_param.velocity = wp.vec3(velocity[0], velocity[1], velocity[2]) # collider_param.threshold = threshold collider_param.reset = reset self.collider_params.append(collider_param) @wp.kernel def collide( time: float, dt: float, state: MPMStateStruct, model: MPMModelStruct, param: Dirichlet_collider, ): grid_x, grid_y, grid_z = wp.tid() if time >= param.start_time and time < param.end_time: offset = wp.vec3( float(grid_x) * model.dx - param.point[0], float(grid_y) * model.dx - param.point[1], float(grid_z) * model.dx - param.point[2], ) if ( wp.abs(offset[0]) < param.size[0] and wp.abs(offset[1]) < param.size[1] and wp.abs(offset[2]) < param.size[2] ): state.grid_v_out[grid_x, grid_y, grid_z] = param.velocity elif param.reset == 1: if time < param.end_time + 15.0 * dt: 
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0)

        def modify(time, dt, param: Dirichlet_collider):
            # host-side update: advect the box anchor with its own velocity
            if time >= param.start_time and time < param.end_time:
                param.point = wp.vec3(
                    param.point[0] + dt * param.velocity[0],
                    param.point[1] + dt * param.velocity[1],
                    param.point[2] + dt * param.velocity[2],
                )  # param.point + dt * param.velocity

        self.grid_postprocess.append(collide)
        self.modify_bc.append(modify)

    def add_bounding_box(self, start_time=0.0, end_time=999.0):
        """Register a box boundary: zero out the velocity component that points
        out of the domain within `padding` grid cells of each face."""
        collider_param = Dirichlet_collider()
        collider_param.start_time = start_time
        collider_param.end_time = end_time
        self.collider_params.append(collider_param)

        @wp.kernel
        def collide(
            time: float,
            dt: float,
            state: MPMStateStruct,
            model: MPMModelStruct,
            param: Dirichlet_collider,
        ):
            grid_x, grid_y, grid_z = wp.tid()
            padding = 3
            if time >= param.start_time and time < param.end_time:
                # x-min face: cancel inward-pointing (negative) x velocity
                if grid_x < padding and state.grid_v_out[grid_x, grid_y, grid_z][0] < 0:
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                # x-max face
                if (
                    grid_x >= model.grid_dim_x - padding
                    and state.grid_v_out[grid_x, grid_y, grid_z][0] > 0
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                # y-min face
                if grid_y < padding and state.grid_v_out[grid_x, grid_y, grid_z][1] < 0:
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                # y-max face
                if (
                    grid_y >= model.grid_dim_y - padding
                    and state.grid_v_out[grid_x, grid_y, grid_z][1] > 0
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                # z-min face
                if grid_z < padding and state.grid_v_out[grid_x, grid_y, grid_z][2] < 0:
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        0.0,
                    )
                # z-max face
                if (
                    grid_z >= model.grid_dim_z - padding
                    and state.grid_v_out[grid_x, grid_y, grid_z][2] > 0
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        0.0,
                    )

        self.grid_postprocess.append(collide)
        self.modify_bc.append(None)

    # particle_v += force/particle_mass * dt
    # this is applied from start_dt, ends after num_dt p2g2p's
    # particle velocity is changed before p2g at each timestep
    def add_impulse_on_particles(
        self,
        mpm_state,
        force,
        dt,
        point=[1, 1, 1],
        size=[1, 1, 1],
        num_dt=1,
        start_time=0.0,
        device="cuda:0",
    ):
        """Apply `force` (as an acceleration impulse) to particles inside a box
        around `point`, for `num_dt` steps starting at `start_time`.

        The selection kernel fills `impulse_param.mask` once at registration time.
        NOTE: `point`/`size` defaults are mutable lists; they are only read here.
        """
        impulse_param = Impulse_modifier()
        impulse_param.start_time = start_time
        impulse_param.end_time = start_time + dt * num_dt
        impulse_param.point = wp.vec3(point[0], point[1], point[2])
        impulse_param.size = wp.vec3(size[0], size[1], size[2])
        impulse_param.mask = wp.zeros(shape=self.n_particles, dtype=int, device=device)
        impulse_param.force = wp.vec3(
            force[0],
            force[1],
            force[2],
        )
        wp.launch(
            kernel=selection_add_impulse_on_particles,
            dim=self.n_particles,
            inputs=[mpm_state, impulse_param],
            device=device,
        )
        self.impulse_params.append(impulse_param)

        @wp.kernel
        def apply_force(
            time: float, dt: float, state: MPMStateStruct, param: Impulse_modifier
        ):
            p = wp.tid()
            if time >= param.start_time and time < param.end_time:
                if param.mask[p] == 1:
                    # per-particle acceleration = force / mass
                    impulse = wp.vec3(
                        param.force[0] / state.particle_mass[p],
                        param.force[1] / state.particle_mass[p],
                        param.force[2] / state.particle_mass[p],
                    )
                    state.particle_v[p] = state.particle_v[p] + impulse * dt

        self.pre_p2g_operations.append(apply_force)

    def enforce_particle_velocity_translation(
        self, mpm_state, point, size, velocity, start_time, end_time, device="cuda:0"
    ):
        """Force particles inside a box around `point` to move with `velocity`
        during [start_time, end_time)."""
        # first select certain particles based on position
        velocity_modifier_params = ParticleVelocityModifier()
        velocity_modifier_params.point = wp.vec3(point[0], point[1], point[2])
        velocity_modifier_params.size = wp.vec3(size[0], size[1], size[2])
        velocity_modifier_params.velocity = wp.vec3(
            velocity[0], velocity[1], velocity[2]
        )
        velocity_modifier_params.start_time = start_time
        velocity_modifier_params.end_time = end_time
        velocity_modifier_params.mask = wp.zeros(
            shape=self.n_particles, dtype=int, device=device
        )
        # fill the mask once, based on current particle positions
        wp.launch(
            kernel=selection_enforce_particle_velocity_translation,
            dim=self.n_particles,
            inputs=[mpm_state, velocity_modifier_params],
            device=device,
        )
        self.particle_velocity_modifier_params.append(velocity_modifier_params)

        @wp.kernel
        def modify_particle_v_before_p2g(
            time: float,
            state: MPMStateStruct,
            velocity_modifier_params: ParticleVelocityModifier,
        ):
            p = wp.tid()
            if (
                time >= velocity_modifier_params.start_time
                and time < velocity_modifier_params.end_time
            ):
                if velocity_modifier_params.mask[p] == 1:
                    state.particle_v[p] = velocity_modifier_params.velocity

        self.particle_velocity_modifiers.append(modify_particle_v_before_p2g)

    # define a cylinder with center point, half_height, radius, normal
    # particles within the cylinder are rotating along the normal direction
    # may also have a translational velocity along the normal direction
    def enforce_particle_velocity_rotation(
        self,
        mpm_state,
        point,
        normal,
        half_height_and_radius,
        rotation_scale,
        translation_scale,
        start_time,
        end_time,
        device="cuda:0",
    ):
        """Prescribe a rigid rotation (about `normal` through `point`) plus an
        axial translation to all particles inside the given cylinder."""
        normal_scale = 1.0 / wp.sqrt(
            float(normal[0] ** 2 + normal[1] ** 2 + normal[2] ** 2)
        )
        normal = list(normal_scale * x for x in normal)

        velocity_modifier_params = ParticleVelocityModifier()
        velocity_modifier_params.point = wp.vec3(point[0], point[1], point[2])
        velocity_modifier_params.half_height_and_radius = wp.vec2(
            half_height_and_radius[0], half_height_and_radius[1]
        )
        velocity_modifier_params.normal = wp.vec3(normal[0], normal[1], normal[2])
        # build an orthonormal in-plane frame (horizontal_1, horizontal_2)
        horizontal_1 = wp.vec3(1.0, 1.0, 1.0)
        if wp.abs(wp.dot(velocity_modifier_params.normal, horizontal_1)) < 0.01:
            # fallback seed when (1,1,1) is nearly parallel to the plane
            horizontal_1 = wp.vec3(0.72, 0.37, -0.67)
        # Gram-Schmidt: remove the normal component, then normalize
        horizontal_1 = (
            horizontal_1
            - wp.dot(horizontal_1, velocity_modifier_params.normal)
            * velocity_modifier_params.normal
        )
        horizontal_1 = horizontal_1 * (1.0 / wp.length(horizontal_1))
        horizontal_2 = wp.cross(horizontal_1, velocity_modifier_params.normal)
        velocity_modifier_params.horizontal_axis_1 = horizontal_1
        velocity_modifier_params.horizontal_axis_2 = horizontal_2

        velocity_modifier_params.rotation_scale = rotation_scale
        velocity_modifier_params.translation_scale = translation_scale
        velocity_modifier_params.start_time = start_time
        velocity_modifier_params.end_time = end_time
        velocity_modifier_params.mask = wp.zeros(
            shape=self.n_particles, dtype=int, device=device
        )
        wp.launch(
            kernel=selection_enforce_particle_velocity_cylinder,
            dim=self.n_particles,
            inputs=[mpm_state, velocity_modifier_params],
            device=device,
        )
        self.particle_velocity_modifier_params.append(velocity_modifier_params)

        @wp.kernel
        def modify_particle_v_before_p2g(
            time: float,
            state: MPMStateStruct,
            velocity_modifier_params: ParticleVelocityModifier,
        ):
            p = wp.tid()
            if (
                time >= velocity_modifier_params.start_time
                and time < velocity_modifier_params.end_time
            ):
                if velocity_modifier_params.mask[p] == 1:
                    offset = state.particle_x[p] - velocity_modifier_params.point
                    # distance from the rotation axis
                    horizontal_distance = wp.length(
                        offset
                        - wp.dot(offset, velocity_modifier_params.normal)
                        * velocity_modifier_params.normal
                    )
                    cosine = (
                        wp.dot(offset, velocity_modifier_params.horizontal_axis_1)
                        / horizontal_distance
                    )
                    theta = wp.acos(cosine)
                    # recover the signed angle from the second in-plane axis
                    if wp.dot(offset, velocity_modifier_params.horizontal_axis_2) > 0:
                        theta = theta
                    else:
                        theta = -theta
                    # tangential velocity components of the rigid rotation
                    axis1_scale = (
                        -horizontal_distance
                        * wp.sin(theta)
                        * velocity_modifier_params.rotation_scale
                    )
                    axis2_scale = (
                        horizontal_distance
                        * wp.cos(theta)
                        * velocity_modifier_params.rotation_scale
                    )
                    # NOTE(review): reads the Python closure `translation_scale`
                    # (captured at kernel build time) rather than
                    # velocity_modifier_params.translation_scale — confirm this
                    # capture is intended; it freezes the value per registration.
                    axis_vertical_scale = translation_scale
                    state.particle_v[p] = (
                        axis1_scale * velocity_modifier_params.horizontal_axis_1
                        + axis2_scale * velocity_modifier_params.horizontal_axis_2
                        + axis_vertical_scale * velocity_modifier_params.normal
                    )
self.particle_velocity_modifiers.append(modify_particle_v_before_p2g) # given normal direction, say [0,0,1] # gradually release grid velocities from start position to end position def release_particles_sequentially( self, normal, start_position, end_position, num_layers, start_time, end_time ): num_layers = 50 point = [0, 0, 0] size = [0, 0, 0] axis = -1 for i in range(3): if normal[i] == 0: point[i] = 1 size[i] = 1 else: axis = i point[i] = end_position half_length_portion = wp.abs(start_position - end_position) / num_layers end_time_portion = end_time / num_layers for i in range(num_layers): size[axis] = half_length_portion * (num_layers - i) self.enforce_particle_velocity_translation( point=point, size=size, velocity=[0, 0, 0], start_time=start_time, end_time=end_time_portion * (i + 1), ) def enforce_particle_velocity_by_mask( self, mpm_state, selection_mask: torch.Tensor, velocity, start_time, end_time, ): # first select certain particles based on position velocity_modifier_params = ParticleVelocityModifier() velocity_modifier_params.velocity = wp.vec3( velocity[0], velocity[1], velocity[2], ) velocity_modifier_params.start_time = start_time velocity_modifier_params.end_time = end_time velocity_modifier_params.mask = wp.from_torch(selection_mask) self.particle_velocity_modifier_params.append(velocity_modifier_params) @wp.kernel def modify_particle_v_before_p2g( time: float, state: MPMStateStruct, velocity_modifier_params: ParticleVelocityModifier, ): p = wp.tid() if ( time >= velocity_modifier_params.start_time and time < velocity_modifier_params.end_time ): if velocity_modifier_params.mask[p] == 1: state.particle_v[p] = velocity_modifier_params.velocity self.particle_velocity_modifiers.append(modify_particle_v_before_p2g) def restart_and_compute_F_C(self, mpm_model, mpm_state, target_pos, device): grid_size = ( mpm_model.grid_dim_x, mpm_model.grid_dim_y, mpm_model.grid_dim_z, ) wp.launch( kernel=zero_grid, # gradient might gone dim=(grid_size), 
inputs=[mpm_state, mpm_model], device=device, ) wp.launch( set_F_C_p2g, dim=self.n_particles, inputs=[mpm_state, mpm_model, target_pos], device=device, ) wp.launch( kernel=grid_normalization_and_gravity, dim=(grid_size), inputs=[mpm_state, mpm_model, 0], device=device, ) wp.launch( set_F_C_g2p, dim=self.n_particles, inputs=[mpm_state, mpm_model], device=device, ) wp.launch( kernel=zero_grid, # gradient might gone dim=(grid_size), inputs=[mpm_state, mpm_model], device=device, ) # set position to target_pos wp.launch( kernel=set_vec3_to_vec3, dim=self.n_particles, inputs=[mpm_state.particle_x, target_pos], device=device, ) def enforce_grid_velocity_by_mask( self, selection_mask: torch.Tensor, # should be int ): grid_modifier_params = GridCollider() grid_modifier_params.mask = wp.from_torch(selection_mask) self.collider_params.append(grid_modifier_params) @wp.kernel def modify_grid_v_before_g2p( time: float, dt: float, state: MPMStateStruct, model: MPMModelStruct, grid_modifier_params: GridCollider, ): grid_x, grid_y, grid_z = wp.tid() if grid_modifier_params.mask[grid_x, grid_y, grid_z] >= 1: state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0) self.grid_postprocess.append(modify_grid_v_before_g2p) self.modify_bc.append(None) # particle_v += force/particle_mass * dt # this is applied from start_dt, ends after num_dt p2g2p's # particle velocity is changed before p2g at each timestep def add_impulse_on_particles_with_mask( self, mpm_state, force, dt, particle_mask, # 1 for selected particles, 0 for others point=[1, 1, 1], size=[1, 1, 1], end_time=1, start_time=0.0, device="cuda:0", ): assert ( len(particle_mask) == self.n_particles ), "mask should have n_particles elements" impulse_param = Impulse_modifier() impulse_param.start_time = start_time impulse_param.end_time = end_time impulse_param.mask = wp.from_torch(particle_mask) impulse_param.point = wp.vec3(point[0], point[1], point[2]) impulse_param.size = wp.vec3(size[0], size[1], size[2]) 
        impulse_param.force = wp.vec3(
            force[0],
            force[1],
            force[2],
        )
        wp.launch(
            kernel=selection_add_impulse_on_particles,
            dim=self.n_particles,
            inputs=[mpm_state, impulse_param],
            device=device,
        )
        self.impulse_params.append(impulse_param)

        @wp.kernel
        def apply_force(
            time: float, dt: float, state: MPMStateStruct, param: Impulse_modifier
        ):
            p = wp.tid()
            if time >= param.start_time and time < param.end_time:
                if param.mask[p] >= 1:
                    # impulse = wp.vec3(
                    #     param.force[0] / state.particle_mass[p],
                    #     param.force[1] / state.particle_mass[p],
                    #     param.force[2] / state.particle_mass[p],
                    # )
                    # NOTE: unlike add_impulse_on_particles, the mass division is
                    # disabled here, so dv = force * dt (force acts as acceleration)
                    impulse = wp.vec3(
                        param.force[0],
                        param.force[1],
                        param.force[2],
                    )
                    state.particle_v[p] = state.particle_v[p] + impulse * dt

        self.pre_p2g_operations.append(apply_force)



================================================
FILE: physdreamer/warp_mpm/mpm_utils.py
================================================
import warp as wp
from mpm_data_structure import *
import numpy as np
import math


# compute stress from F
@wp.func
def kirchoff_stress_FCR(
    F: wp.mat33, U: wp.mat33, V: wp.mat33, J: float, mu: float, lam: float
):
    # compute kirchoff stress for FCR model (remember tau = P F^T)
    R = U * wp.transpose(V)
    id = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
    return 2.0 * mu * (F - R) * wp.transpose(F) + id * lam * J * (J - 1.0)


@wp.func
def kirchoff_stress_neoHookean(
    F: wp.mat33, U: wp.mat33, V: wp.mat33, J: float, sig: wp.vec3, mu: float, lam: float
):
    """
    B = F * wp.transpose(F)

    dev(B) = B - (1/3) * tr(B) * I

    For a compressible Rivlin neo-Hookean material, the cauchy stress is given by:
        mu * J^(-2/3) * dev(B) + lam * J (J - 1) * I
    see: https://en.wikipedia.org/wiki/Neo-Hookean_solid
    """
    # compute kirchoff stress for FCR model (remember tau = P F^T)
    # b = diag of F F^T in the SVD frame: sig_i^2
    b = wp.vec3(sig[0] * sig[0], sig[1] * sig[1], sig[2] * sig[2])
    # deviatoric part of b
    b_hat = b - wp.vec3(
        (b[0] + b[1] + b[2]) / 3.0,
        (b[0] + b[1] + b[2]) / 3.0,
        (b[0] + b[1] + b[2]) / 3.0,
    )
    tau = mu * J ** (-2.0 / 3.0) * b_hat + lam / 2.0 * (J * J - 1.0) * wp.vec3(
        1.0, 1.0, 1.0
    )
    # rotate the principal stresses back: U diag(tau) V^T F^T
    return (
        U
        * wp.mat33(tau[0], 0.0, 0.0, 0.0, tau[1], 0.0, 0.0, 0.0, tau[2])
        * wp.transpose(V)
        * wp.transpose(F)
    )


@wp.func
def kirchoff_stress_StVK(
    F: wp.mat33, U: wp.mat33, V: wp.mat33, sig: wp.vec3, mu: float, lam: float
):
    # Kirchhoff stress for St. Venant-Kirchhoff (in log-strain form)
    sig = wp.vec3(
        wp.max(sig[0], 0.01), wp.max(sig[1], 0.01), wp.max(sig[2], 0.01)
    )  # add this to prevent NaN in extrem cases
    epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2]))
    log_sig_sum = wp.log(sig[0]) + wp.log(sig[1]) + wp.log(sig[2])
    ONE = wp.vec3(1.0, 1.0, 1.0)
    tau = 2.0 * mu * epsilon + lam * log_sig_sum * ONE
    return (
        U
        * wp.mat33(tau[0], 0.0, 0.0, 0.0, tau[1], 0.0, 0.0, 0.0, tau[2])
        * wp.transpose(V)
        * wp.transpose(F)
    )


@wp.func
def kirchoff_stress_drucker_prager(
    F: wp.mat33, U: wp.mat33, V: wp.mat33, sig: wp.vec3, mu: float, lam: float
):
    # Kirchhoff stress used with the Drucker-Prager (sand) return mapping
    log_sig_sum = wp.log(sig[0]) + wp.log(sig[1]) + wp.log(sig[2])
    center00 = 2.0 * mu * wp.log(sig[0]) * (1.0 / sig[0]) + lam * log_sig_sum * (
        1.0 / sig[0]
    )
    center11 = 2.0 * mu * wp.log(sig[1]) * (1.0 / sig[1]) + lam * log_sig_sum * (
        1.0 / sig[1]
    )
    center22 = 2.0 * mu * wp.log(sig[2]) * (1.0 / sig[2]) + lam * log_sig_sum * (
        1.0 / sig[2]
    )
    center = wp.mat33(center00, 0.0, 0.0, 0.0, center11, 0.0, 0.0, 0.0, center22)
    return U * center * wp.transpose(V) * wp.transpose(F)


@wp.func
def von_mises_return_mapping(F_trial: wp.mat33, model: MPMModelStruct, p: int):
    # von Mises plastic return mapping in principal log-strain space
    U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    sig_old = wp.vec3(0.0)
    wp.svd3(F_trial, U, sig_old, V)

    sig = wp.vec3(
        wp.max(sig_old[0], 0.01), wp.max(sig_old[1], 0.01), wp.max(sig_old[2], 0.01)
    )  # add this to prevent NaN in extrem cases
    epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2]))
    temp = (epsilon[0] + epsilon[1] + epsilon[2]) / 3.0

    tau = 2.0 * model.mu[p] * epsilon + model.lam[p] * (
        epsilon[0] + epsilon[1] + epsilon[2]
    ) * wp.vec3(1.0, 1.0, 1.0)
    sum_tau = tau[0] + tau[1] + tau[2]
    # deviatoric stress (yield condition tested on its magnitude)
    cond = wp.vec3(
        tau[0] - sum_tau / 3.0, tau[1]
        - sum_tau / 3.0, tau[2] - sum_tau / 3.0
    )
    if wp.length(cond) > model.yield_stress[p]:
        # project the log strain back onto the yield surface
        epsilon_hat = epsilon - wp.vec3(temp, temp, temp)
        epsilon_hat_norm = wp.length(epsilon_hat) + 1e-6
        delta_gamma = epsilon_hat_norm - model.yield_stress[p] / (2.0 * model.mu[p])
        epsilon = epsilon - (delta_gamma / epsilon_hat_norm) * epsilon_hat
        sig_elastic = wp.mat33(
            wp.exp(epsilon[0]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon[1]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon[2]),
        )
        F_elastic = U * sig_elastic * wp.transpose(V)
        if model.hardening == 1:
            # isotropic hardening: raise the yield stress by the plastic increment
            model.yield_stress[p] = (
                model.yield_stress[p] + 2.0 * model.mu[p] * model.xi * delta_gamma
            )
        return F_elastic
    else:
        return F_trial


@wp.func
def von_mises_return_mapping_with_damage(
    F_trial: wp.mat33, model: MPMModelStruct, p: int
):
    # von Mises return mapping with softening "damage": yield stress decays with
    # accumulated plastic flow; mu/lam drop to 0 once it reaches zero.
    U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    sig_old = wp.vec3(0.0)
    wp.svd3(F_trial, U, sig_old, V)

    sig = wp.vec3(
        wp.max(sig_old[0], 0.01), wp.max(sig_old[1], 0.01), wp.max(sig_old[2], 0.01)
    )  # add this to prevent NaN in extrem cases
    epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2]))
    temp = (epsilon[0] + epsilon[1] + epsilon[2]) / 3.0

    tau = 2.0 * model.mu[p] * epsilon + model.lam[p] * (
        epsilon[0] + epsilon[1] + epsilon[2]
    ) * wp.vec3(1.0, 1.0, 1.0)
    sum_tau = tau[0] + tau[1] + tau[2]
    cond = wp.vec3(
        tau[0] - sum_tau / 3.0, tau[1] - sum_tau / 3.0, tau[2] - sum_tau / 3.0
    )
    if wp.length(cond) > model.yield_stress[p]:
        if model.yield_stress[p] <= 0:
            # fully damaged: no further projection
            return F_trial
        epsilon_hat = epsilon - wp.vec3(temp, temp, temp)
        epsilon_hat_norm = wp.length(epsilon_hat) + 1e-6
        delta_gamma = epsilon_hat_norm - model.yield_stress[p] / (2.0 * model.mu[p])
        epsilon = epsilon - (delta_gamma / epsilon_hat_norm) * epsilon_hat
        # softening: reduce the yield stress by the plastic strain magnitude
        model.yield_stress[p] = model.yield_stress[p] - model.softening * wp.length(
            (delta_gamma / epsilon_hat_norm) * epsilon_hat
        )
        if model.yield_stress[p] <= 0:
            model.mu[p] = 0.0
            model.lam[p] = 0.0

        sig_elastic = wp.mat33(
            wp.exp(epsilon[0]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon[1]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon[2]),
        )
        F_elastic = U * sig_elastic * wp.transpose(V)
        if model.hardening == 1:
            model.yield_stress[p] = (
                model.yield_stress[p] + 2.0 * model.mu[p] * model.xi * delta_gamma
            )
        return F_elastic
    else:
        return F_trial


# for toothpaste
@wp.func
def viscoplasticity_return_mapping_with_StVK(
    F_trial: wp.mat33, model: MPMModelStruct, p: int, dt: float
):
    # rate-dependent (viscoplastic) return mapping; overstress relaxed by
    # plastic_viscosity over the timestep dt
    U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    sig_old = wp.vec3(0.0)
    wp.svd3(F_trial, U, sig_old, V)

    sig = wp.vec3(
        wp.max(sig_old[0], 0.01), wp.max(sig_old[1], 0.01), wp.max(sig_old[2], 0.01)
    )  # add this to prevent NaN in extrem cases
    b_trial = wp.vec3(sig[0] * sig[0], sig[1] * sig[1], sig[2] * sig[2])
    epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2]))
    trace_epsilon = epsilon[0] + epsilon[1] + epsilon[2]
    epsilon_hat = epsilon - wp.vec3(
        trace_epsilon / 3.0, trace_epsilon / 3.0, trace_epsilon / 3.0
    )
    s_trial = 2.0 * model.mu[p] * epsilon_hat
    s_trial_norm = wp.length(s_trial)
    # overstress beyond the (scaled) yield surface
    y = s_trial_norm - wp.sqrt(2.0 / 3.0) * model.yield_stress[p]
    if y > 0:
        mu_hat = model.mu[p] * (b_trial[0] + b_trial[1] + b_trial[2]) / 3.0
        s_new_norm = s_trial_norm - y / (
            1.0 + model.plastic_viscosity / (2.0 * mu_hat * dt)
        )
        s_new = (s_new_norm / s_trial_norm) * s_trial
        epsilon_new = 1.0 / (2.0 * model.mu[p]) * s_new + wp.vec3(
            trace_epsilon / 3.0, trace_epsilon / 3.0, trace_epsilon / 3.0
        )
        sig_elastic = wp.mat33(
            wp.exp(epsilon_new[0]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon_new[1]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon_new[2]),
        )
        F_elastic = U * sig_elastic * wp.transpose(V)
        return F_elastic
    else:
        return F_trial


@wp.func
def sand_return_mapping(
    F_trial: wp.mat33, state: MPMStateStruct, model: MPMModelStruct, p: int
):
    # Drucker-Prager return mapping for granular material
    U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    sig = wp.vec3(0.0)
    wp.svd3(F_trial, U, sig, V)

    epsilon = wp.vec3(
        wp.log(wp.max(wp.abs(sig[0]),
        1e-14)),
        wp.log(wp.max(wp.abs(sig[1]), 1e-14)),
        wp.log(wp.max(wp.abs(sig[2]), 1e-14)),
    )
    sigma_out = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
    tr = epsilon[0] + epsilon[1] + epsilon[2]  # + state.particle_Jp[p]
    epsilon_hat = epsilon - wp.vec3(tr / 3.0, tr / 3.0, tr / 3.0)
    epsilon_hat_norm = wp.length(epsilon_hat)
    delta_gamma = (
        epsilon_hat_norm
        + (3.0 * model.lam[p] + 2.0 * model.mu[p])
        / (2.0 * model.mu[p])
        * tr
        * model.alpha
    )

    # three mutually exclusive cases: elastic, expansion (project to rotation),
    # compression (project onto the cone)
    if delta_gamma <= 0:
        F_elastic = F_trial
    if delta_gamma > 0 and tr > 0:
        F_elastic = U * wp.transpose(V)
    if delta_gamma > 0 and tr <= 0:
        H = epsilon - epsilon_hat * (delta_gamma / epsilon_hat_norm)
        s_new = wp.vec3(wp.exp(H[0]), wp.exp(H[1]), wp.exp(H[2]))
        F_elastic = U * wp.diag(s_new) * wp.transpose(V)
    return F_elastic


@wp.kernel
def compute_mu_lam_from_E_nu(state: MPMStateStruct, model: MPMModelStruct):
    # standard Lame parameter conversion from Young's modulus / Poisson's ratio
    p = wp.tid()
    model.mu[p] = model.E[p] / (2.0 * (1.0 + model.nu[p]))
    model.lam[p] = (
        model.E[p] * model.nu[p] / ((1.0 + model.nu[p]) * (1.0 - 2.0 * model.nu[p]))
    )


@wp.kernel
def zero_grid(state: MPMStateStruct, model: MPMModelStruct):
    # reset grid mass and both velocity buffers before a new p2g pass
    grid_x, grid_y, grid_z = wp.tid()
    state.grid_m[grid_x, grid_y, grid_z] = 0.0
    state.grid_v_in[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0)
    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0)


@wp.func
def compute_dweight(
    model: MPMModelStruct, w: wp.mat33, dw: wp.mat33, i: int, j: int, k: int
):
    # gradient of the tensor-product B-spline weight for stencil offset (i,j,k)
    dweight = wp.vec3(
        dw[0, i] * w[1, j] * w[2, k],
        w[0, i] * dw[1, j] * w[2, k],
        w[0, i] * w[1, j] * dw[2, k],
    )
    return dweight * model.inv_dx


@wp.func
def update_cov(state: MPMStateStruct, p: int, grad_v: wp.mat33, dt: float):
    # advect the per-particle covariance (stored as packed symmetric 6-vector)
    # with the velocity gradient: cov' = cov + dt (grad_v cov + cov grad_v^T)
    cov_n = wp.mat33(0.0)
    cov_n[0, 0] = state.particle_cov[p * 6]
    cov_n[0, 1] = state.particle_cov[p * 6 + 1]
    cov_n[0, 2] = state.particle_cov[p * 6 + 2]
    cov_n[1, 0] = state.particle_cov[p * 6 + 1]
    cov_n[1, 1] = state.particle_cov[p * 6 + 3]
    cov_n[1, 2] = state.particle_cov[p * 6 + 4]
    cov_n[2, 0] = state.particle_cov[p * 6 + 2]
    cov_n[2, 1] = state.particle_cov[p * 6 + 4]
    cov_n[2, 2] = state.particle_cov[p * 6 + 5]

    cov_np1 = cov_n + dt * (grad_v * cov_n + cov_n * wp.transpose(grad_v))

    state.particle_cov[p * 6] = cov_np1[0, 0]
    state.particle_cov[p * 6 + 1] = cov_np1[0, 1]
    state.particle_cov[p * 6 + 2] = cov_np1[0, 2]
    state.particle_cov[p * 6 + 3] = cov_np1[1, 1]
    state.particle_cov[p * 6 + 4] = cov_np1[1, 2]
    state.particle_cov[p * 6 + 5] = cov_np1[2, 2]


@wp.func
def update_cov_differentiable(
    state: MPMStateStruct,
    next_state: MPMStateStruct,
    p: int,
    grad_v: wp.mat33,
    dt: float,
):
    # same update as update_cov, but writes into next_state (no in-place
    # overwrite, so the op stays differentiable across timesteps)
    cov_n = wp.mat33(0.0)
    cov_n[0, 0] = state.particle_cov[p * 6]
    cov_n[0, 1] = state.particle_cov[p * 6 + 1]
    cov_n[0, 2] = state.particle_cov[p * 6 + 2]
    cov_n[1, 0] = state.particle_cov[p * 6 + 1]
    cov_n[1, 1] = state.particle_cov[p * 6 + 3]
    cov_n[1, 2] = state.particle_cov[p * 6 + 4]
    cov_n[2, 0] = state.particle_cov[p * 6 + 2]
    cov_n[2, 1] = state.particle_cov[p * 6 + 4]
    cov_n[2, 2] = state.particle_cov[p * 6 + 5]

    cov_np1 = cov_n + dt * (grad_v * cov_n + cov_n * wp.transpose(grad_v))

    next_state.particle_cov[p * 6] = cov_np1[0, 0]
    next_state.particle_cov[p * 6 + 1] = cov_np1[0, 1]
    next_state.particle_cov[p * 6 + 2] = cov_np1[0, 2]
    next_state.particle_cov[p * 6 + 3] = cov_np1[1, 1]
    next_state.particle_cov[p * 6 + 4] = cov_np1[1, 2]
    next_state.particle_cov[p * 6 + 5] = cov_np1[2, 2]


@wp.kernel
def p2g_apic_with_stress(state: MPMStateStruct, model: MPMModelStruct, dt: float):
    # input given to p2g: particle_stress
    #                     particle_x
    #                     particle_v
    #                     particle_C
    # output: grid_v_in, grid_m
    p = wp.tid()
    if state.particle_selection[p] == 0:
        stress = state.particle_stress[p]
        grid_pos = state.particle_x[p] * model.inv_dx
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        # quadratic B-spline weights, one row per stencil offset
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) +
            wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )
        dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5))

        # scatter to the 3x3x3 neighborhood of grid nodes
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    dpos = (
                        wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx
                    ) * model.dx
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    weight = w[0, i] * w[1, j] * w[2, k]  # tricubic interpolation
                    dweight = compute_dweight(model, w, dw, i, j, k)
                    C = state.particle_C[p]
                    # if model.rpic = 0, standard apic
                    # rpic_damping blends in the skew (rotational) part of C
                    C = (1.0 - model.rpic_damping) * C + model.rpic_damping / 2.0 * (
                        C - wp.transpose(C)
                    )
                    # C = (1.0 - model.rpic_damping) * state.particle_C[
                    #     p
                    # ] + model.rpic_damping / 2.0 * (
                    #     state.particle_C[p] - wp.transpose(state.particle_C[p])
                    # )
                    if model.rpic_damping < -0.001:
                        # standard pic
                        C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

                    elastic_force = -state.particle_vol[p] * stress * dweight
                    v_in_add = (
                        weight
                        * state.particle_mass[p]
                        * (state.particle_v[p] + C * dpos)
                        + dt * elastic_force
                    )
                    wp.atomic_add(state.grid_v_in, ix, iy, iz, v_in_add)
                    wp.atomic_add(
                        state.grid_m, ix, iy, iz, weight * state.particle_mass[p]
                    )


# add gravity
@wp.kernel
def grid_normalization_and_gravity(
    state: MPMStateStruct, model: MPMModelStruct, dt: float
):
    # momentum -> velocity (divide by mass), then integrate gravity for dt
    grid_x, grid_y, grid_z = wp.tid()
    if state.grid_m[grid_x, grid_y, grid_z] > 1e-15:
        v_out = state.grid_v_in[grid_x, grid_y, grid_z] * (
            1.0 / state.grid_m[grid_x, grid_y, grid_z]
        )
        # add gravity
        v_out = v_out + dt * model.gravitational_accelaration
        state.grid_v_out[grid_x, grid_y, grid_z] = v_out


@wp.kernel
def g2p(state: MPMStateStruct, model: MPMModelStruct, dt: float):
    # gather grid velocities back to particles; updates x, v, C, F_trial in place
    p = wp.tid()
    if state.particle_selection[p] == 0:
        grid_pos = state.particle_x[p] * model.inv_dx
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )
        dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5))
        new_v = wp.vec3(0.0, 0.0, 0.0)
        new_C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        new_F = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    dpos = wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx
                    weight = w[0, i] * w[1, j] * w[2, k]  # tricubic interpolation
                    grid_v = state.grid_v_out[ix, iy, iz]
                    new_v = new_v + grid_v * weight
                    new_C = new_C + wp.outer(grid_v, dpos) * (
                        weight * model.inv_dx * 4.0
                    )
                    dweight = compute_dweight(model, w, dw, i, j, k)
                    new_F = new_F + wp.outer(grid_v, dweight)

        state.particle_v[p] = new_v
        # state.particle_x[p] = state.particle_x[p] + dt * new_v
        # state.particle_x[p] = state.particle_x[p] + dt * state.particle_v[p]
        # wp.atomic_add(state.particle_x, p, dt * state.particle_v[p])  # old one is this..
        wp.atomic_add(state.particle_x, p, dt * new_v)

        # debug
        # new_x = state.particle_x[p] + dt * state.particle_v[p]
        # state.particle_x[p] = new_x

        state.particle_C[p] = new_C
        I33 = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
        # deformation-gradient update: F_trial = (I + dt * grad_v) F
        F_tmp = (I33 + new_F * dt) * state.particle_F[p]
        state.particle_F_trial[p] = F_tmp

        # debug for jelly
        # wp.atomic_add(state.particle_F_trial, p, new_F * dt * state.particle_F[p])

        if model.update_cov_with_F:
            update_cov(state, p, new_F, dt)


@wp.kernel
def g2p_differentiable(
    state: MPMStateStruct, next_state: MPMStateStruct, model: MPMModelStruct, dt: float
):
    """
    Compute: next_state.particle_v, next_state.particle_x, next_state.particle_C,
        next_state.particle_F_trial
    """
    p = wp.tid()
    if state.particle_selection[p] == 0:
        grid_pos = state.particle_x[p] * model.inv_dx
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )
        dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5))
        new_v = wp.vec3(0.0, 0.0, 0.0)
        # new_C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        new_C = wp.mat33(new_v, new_v, new_v)
        new_F = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    dpos = wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx
                    weight = w[0, i] * w[1, j] * w[2, k]  # tricubic interpolation
                    grid_v = state.grid_v_out[ix, iy, iz]
                    new_v = (
                        new_v + grid_v * weight
                    )  # TODO, check gradient from static loop
                    new_C = new_C + wp.outer(grid_v, dpos) * (
                        weight * model.inv_dx * 4.0
                    )
                    dweight = compute_dweight(model, w, dw, i, j, k)
                    new_F = new_F + wp.outer(grid_v,
dweight) next_state.particle_v[p] = new_v # add clip here: new_x = state.particle_x[p] + dt * new_v dx = 1.0 / model.inv_dx a_min = dx * 2.0 a_max = model.grid_lim - dx * 2.0 new_x_clamped = wp.vec3( wp.clamp(new_x[0], a_min, a_max), wp.clamp(new_x[1], a_min, a_max), wp.clamp(new_x[2], a_min, a_max), ) next_state.particle_x[p] = new_x_clamped # next_state.particle_x[p] = new_x next_state.particle_C[p] = new_C I33_1 = wp.vec3(1.0, 0.0, 0.0) I33_2 = wp.vec3(0.0, 1.0, 0.0) I33_3 = wp.vec3(0.0, 0.0, 1.0) I33 = wp.mat33(I33_1, I33_2, I33_3) F_tmp = (I33 + new_F * dt) * state.particle_F[p] next_state.particle_F_trial[p] = F_tmp if 0: update_cov_differentiable(state, next_state, p, new_F, dt) @wp.kernel def clip_particle_x(state: MPMStateStruct, model: MPMModelStruct): p = wp.tid() posx = state.particle_x[p] if state.particle_selection[p] == 0: dx = 1.0 / model.inv_dx a_min = dx * 2.0 a_max = model.grid_lim - dx * 2.0 new_x = wp.vec3( wp.clamp(posx[0], a_min, a_max), wp.clamp(posx[1], a_min, a_max), wp.clamp(posx[2], a_min, a_max), ) state.particle_x[ p ] = new_x # Warn: this gives wrong gradient, don't use this for backward # compute (Kirchhoff) stress = stress(returnMap(F_trial)) @wp.kernel def compute_stress_from_F_trial( state: MPMStateStruct, model: MPMModelStruct, dt: float ): """ state.particle_F_trial => state.particle_F # return mapping state.particle_F => state.particle_stress # stress-strain TODO: check the gradient of SVD! is wp.svd3 differentiable? 
I guess so """ p = wp.tid() if state.particle_selection[p] == 0: # apply return mapping if model.material == 1: # metal state.particle_F[p] = von_mises_return_mapping( state.particle_F_trial[p], model, p ) elif model.material == 2: # sand state.particle_F[p] = sand_return_mapping( state.particle_F_trial[p], state, model, p ) elif model.material == 3: # visplas, with StVk+VM, no thickening state.particle_F[p] = viscoplasticity_return_mapping_with_StVK( state.particle_F_trial[p], model, p, dt ) elif model.material == 5: state.particle_F[p] = von_mises_return_mapping_with_damage( state.particle_F_trial[p], model, p ) else: # elastic, jelly, or neo-hookean state.particle_F[p] = state.particle_F_trial[p] # also compute stress here J = wp.determinant(state.particle_F[p]) U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) sig = wp.vec3(0.0) stress = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) wp.svd3(state.particle_F[p], U, sig, V) if model.material == 0 or model.material == 5: stress = kirchoff_stress_FCR( state.particle_F[p], U, V, J, model.mu[p], model.lam[p] ) if model.material == 1: stress = kirchoff_stress_StVK( state.particle_F[p], U, V, sig, model.mu[p], model.lam[p] ) if model.material == 2: stress = kirchoff_stress_drucker_prager( state.particle_F[p], U, V, sig, model.mu[p], model.lam[p] ) if model.material == 3: # temporarily use stvk, subject to change stress = kirchoff_stress_StVK( state.particle_F[p], U, V, sig, model.mu[p], model.lam[p] ) if model.material == 6: stress = kirchoff_stress_neoHookean( state.particle_F[p], U, V, J, sig, model.mu[p], model.lam[p] ) # stress = (stress + wp.transpose(stress)) / 2.0 # enfore symmetry state.particle_stress[p] = (stress + wp.transpose(stress)) / 2.0 # @wp.kernel # def compute_cov_from_F(state: MPMStateStruct, model: MPMModelStruct): # p = wp.tid() # F = state.particle_F_trial[p] # init_cov = wp.mat33(0.0) # init_cov[0, 0] = 
#     init_cov[0, 1] = state.particle_init_cov[p * 6 + 1]
#     init_cov[0, 2] = state.particle_init_cov[p * 6 + 2]
#     init_cov[1, 0] = state.particle_init_cov[p * 6 + 1]
#     init_cov[1, 1] = state.particle_init_cov[p * 6 + 3]
#     init_cov[1, 2] = state.particle_init_cov[p * 6 + 4]
#     init_cov[2, 0] = state.particle_init_cov[p * 6 + 2]
#     init_cov[2, 1] = state.particle_init_cov[p * 6 + 4]
#     init_cov[2, 2] = state.particle_init_cov[p * 6 + 5]
#     cov = F * init_cov * wp.transpose(F)
#     state.particle_cov[p * 6] = cov[0, 0]
#     state.particle_cov[p * 6 + 1] = cov[0, 1]
#     state.particle_cov[p * 6 + 2] = cov[0, 2]
#     state.particle_cov[p * 6 + 3] = cov[1, 1]
#     state.particle_cov[p * 6 + 4] = cov[1, 2]
#     state.particle_cov[p * 6 + 5] = cov[2, 2]


# @wp.kernel
# def compute_R_from_F(state: MPMStateStruct, model: MPMModelStruct):
#     p = wp.tid()
#     F = state.particle_F_trial[p]
#     # polar svd decomposition
#     U = wp.mat33(0.0)
#     V = wp.mat33(0.0)
#     sig = wp.vec3(0.0)
#     wp.svd3(F, U, sig, V)
#     if wp.determinant(U) < 0.0:
#         U[0, 2] = -U[0, 2]
#         U[1, 2] = -U[1, 2]
#         U[2, 2] = -U[2, 2]
#     if wp.determinant(V) < 0.0:
#         V[0, 2] = -V[0, 2]
#         V[1, 2] = -V[1, 2]
#         V[2, 2] = -V[2, 2]
#     # compute rotation matrix
#     R = U * wp.transpose(V)
#     state.particle_R[p] = wp.transpose(R)
# particle R is removed


@wp.kernel
def add_damping_via_grid(state: MPMStateStruct, scale: float):
    # Scale grid velocities by `scale` (0 < scale <= 1) to damp motion.
    grid_x, grid_y, grid_z = wp.tid()
    # state.grid_v_out[grid_x, grid_y, grid_z] = (
    #     state.grid_v_out[grid_x, grid_y, grid_z] * scale
    # )
    # Written as atomic_sub of (1 - scale) * v instead of a plain store,
    # presumably for correct adjoints -- same concern as in g2p above.
    wp.atomic_sub(
        state.grid_v_out,
        grid_x,
        grid_y,
        grid_z,
        (1.0 - scale) * state.grid_v_out[grid_x, grid_y, grid_z],
    )


@wp.kernel
def apply_additional_params(
    state: MPMStateStruct,
    model: MPMModelStruct,
    params_modifier: MaterialParamsModifier,
):
    # Override E, nu and density for particles inside an axis-aligned box
    # centered at params_modifier.point with half-extents params_modifier.size.
    p = wp.tid()
    pos = state.particle_x[p]
    if (
        pos[0] > params_modifier.point[0] - params_modifier.size[0]
        and pos[0] < params_modifier.point[0] + params_modifier.size[0]
        and pos[1] > params_modifier.point[1] - params_modifier.size[1]
        and pos[1] < params_modifier.point[1] + params_modifier.size[1]
        and pos[2] > params_modifier.point[2] - params_modifier.size[2]
        and pos[2] < params_modifier.point[2] + params_modifier.size[2]
    ):
        model.E[p] = params_modifier.E
        model.nu[p] = params_modifier.nu
        state.particle_density[p] = params_modifier.density


@wp.kernel
def selection_add_impulse_on_particles(
    state: MPMStateStruct, impulse_modifier: Impulse_modifier
):
    # Set mask[p] = 1 for particles inside the impulse box, 0 otherwise.
    p = wp.tid()
    offset = state.particle_x[p] - impulse_modifier.point
    if (
        wp.abs(offset[0]) < impulse_modifier.size[0]
        and wp.abs(offset[1]) < impulse_modifier.size[1]
        and wp.abs(offset[2]) < impulse_modifier.size[2]
    ):
        impulse_modifier.mask[p] = 1
    else:
        impulse_modifier.mask[p] = 0


@wp.kernel
def selection_enforce_particle_velocity_translation(
    state: MPMStateStruct, velocity_modifier: ParticleVelocityModifier
):
    # Set mask[p] = 1 for particles inside the velocity-modifier box.
    p = wp.tid()
    offset = state.particle_x[p] - velocity_modifier.point
    if (
        wp.abs(offset[0]) < velocity_modifier.size[0]
        and wp.abs(offset[1]) < velocity_modifier.size[1]
        and wp.abs(offset[2]) < velocity_modifier.size[2]
    ):
        velocity_modifier.mask[p] = 1
    else:
        velocity_modifier.mask[p] = 0


@wp.kernel
def selection_enforce_particle_velocity_cylinder(
    state: MPMStateStruct, velocity_modifier: ParticleVelocityModifier
):
    # Set mask[p] = 1 for particles inside a cylinder defined by a center
    # point, an axis (normal) and half_height_and_radius = (half_h, radius).
    p = wp.tid()
    offset = state.particle_x[p] - velocity_modifier.point
    vertical_distance = wp.abs(wp.dot(offset, velocity_modifier.normal))
    horizontal_distance = wp.length(
        offset - wp.dot(offset, velocity_modifier.normal) * velocity_modifier.normal
    )
    if (
        vertical_distance < velocity_modifier.half_height_and_radius[0]
        and horizontal_distance < velocity_modifier.half_height_and_radius[1]
    ):
        velocity_modifier.mask[p] = 1
    else:
        velocity_modifier.mask[p] = 0


@wp.kernel
def compute_position_l2_loss(
    mpm_state: MPMStateStruct,
    gt_pos: wp.array(dtype=wp.vec3),
    loss: wp.array(dtype=float),
):
    # Accumulate sum over particles of ||x - x_gt|| into loss[0].
    tid = wp.tid()
    pos = mpm_state.particle_x[tid]
    pos_gt = gt_pos[tid]
    # l1_diff = wp.abs(pos - pos_gt)
    l2 = wp.length(pos - pos_gt)
    wp.atomic_add(loss, 0, l2)


@wp.kernel
def aggregate_grad(x: wp.array(dtype=float), grad: wp.array(dtype=float)):
    # Sum all entries of grad into x[0].
    tid = wp.tid()
    # gradient descent step
    wp.atomic_add(x, 0, grad[tid])


@wp.kernel
def set_F_C_p2g(
    state: MPMStateStruct, model: MPMModelStruct, target_pos: wp.array(dtype=wp.vec3)
):
    # Particle-to-grid scatter of mass-weighted displacement (target - current)
    # into grid_v_in, with mass accumulated into grid_m.
    p = wp.tid()
    if state.particle_selection[p] == 0:
        grid_pos = state.particle_x[p] * model.inv_dx
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )
        # p2g for displacement
        particle_disp = target_pos[p] - state.particle_x[p]
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    weight = w[0, i] * w[1, j] * w[2, k]  # tricubic interpolation
                    v_in_add = weight * state.particle_mass[p] * particle_disp
                    wp.atomic_add(state.grid_v_in, ix, iy, iz, v_in_add)
                    wp.atomic_add(
                        state.grid_m, ix, iy, iz, weight * state.particle_mass[p]
                    )


@wp.kernel
def set_F_C_g2p(state: MPMStateStruct, model: MPMModelStruct):
    # Grid-to-particle gather that rebuilds F_trial = I + grad(disp) from the
    # displacement field scattered by set_F_C_p2g (note dt = 1 here).
    p = wp.tid()
    if state.particle_selection[p] == 0:
        grid_pos = state.particle_x[p] * model.inv_dx
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )
        dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5))
        new_C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        new_F = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        # g2p for C and F
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    dpos = wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx
                    weight = w[0, i] * w[1, j] * w[2, k]  # tricubic interpolation
                    grid_v = state.grid_v_out[ix, iy, iz]
                    new_C = new_C + wp.outer(grid_v, dpos) * (
                        weight * model.inv_dx * 4.0
                    )
                    dweight = compute_dweight(model, w, dw, i, j, k)
                    new_F = new_F + wp.outer(grid_v, dweight)

        # C should still be zero..
        # state.particle_C[p] = new_C
        I33 = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
        F_tmp = I33 + new_F
        state.particle_F_trial[p] = F_tmp
        if model.update_cov_with_F:
            update_cov(state, p, new_F, 1.0)


@wp.kernel
def compute_posloss_with_grad(
    mpm_state: MPMStateStruct,
    gt_pos: wp.array(dtype=wp.vec3),
    grad: wp.array(dtype=wp.vec3),
    dt: float,
    loss: wp.array(dtype=float),
):
    # Squared-L2 position loss against a gradient-shifted target:
    # sum ||x - (x_gt - dt * grad)||^2 accumulated into loss[0].
    tid = wp.tid()
    pos = mpm_state.particle_x[tid]
    pos_gt = gt_pos[tid]
    # l1_diff = wp.abs(pos - pos_gt)
    # l2 = wp.length(pos - (pos_gt - grad[tid] * dt))
    diff = pos - (pos_gt - grad[tid] * dt)
    l2 = wp.dot(diff, diff)
    wp.atomic_add(loss, 0, l2)


@wp.kernel
def compute_veloloss_with_grad(
    mpm_state: MPMStateStruct,
    gt_pos: wp.array(dtype=wp.vec3),
    grad: wp.array(dtype=wp.vec3),
    dt: float,
    loss: wp.array(dtype=float),
):
    # Same as compute_posloss_with_grad but on particle velocities.
    tid = wp.tid()
    pos = mpm_state.particle_v[tid]
    pos_gt = gt_pos[tid]
    # l1_diff = wp.abs(pos - pos_gt)
    # l2 = wp.length(pos - (pos_gt - grad[tid] * dt))
    diff = pos - (pos_gt - grad[tid] * dt)
    l2 = wp.dot(diff, diff)
    wp.atomic_add(loss, 0, l2)


@wp.kernel
def compute_Floss_with_grad(
    mpm_state: MPMStateStruct,
    gt_mat: wp.array(dtype=wp.mat33),
    grad: wp.array(dtype=wp.mat33),
    dt: float,
    loss: wp.array(dtype=float),
):
    # Squared-Frobenius loss on F_trial against a gradient-shifted target.
    tid = wp.tid()
    mat_ = mpm_state.particle_F_trial[tid]
    mat_gt = gt_mat[tid]
    mat_gt = mat_gt - grad[tid] * dt
    # l1_diff = wp.abs(pos - pos_gt)
    mat_diff = mat_ - mat_gt
    l2 = wp.ddot(mat_diff, mat_diff)
    # l2 = wp.sqrt(
    #     mat_diff[0, 0] ** 2.0
    #     + mat_diff[0, 1] ** 2.0
    #     + mat_diff[0, 2] ** 2.0
    #     + mat_diff[1, 0] ** 2.0
    #     + mat_diff[1, 1] ** 2.0
    #     + mat_diff[1, 2] ** 2.0
    #     + mat_diff[2, 0] ** 2.0
    #     + mat_diff[2, 1] ** 2.0
    #     + mat_diff[2, 2] ** 2.0
    # )
    wp.atomic_add(loss, 0, l2)


@wp.kernel
def compute_Closs_with_grad(
    mpm_state: MPMStateStruct,
    gt_mat: wp.array(dtype=wp.mat33),
    grad: wp.array(dtype=wp.mat33),
    dt: float,
    loss: wp.array(dtype=float),
):
    # Squared-Frobenius loss on particle_C against a gradient-shifted target,
    # mirroring compute_Floss_with_grad.
    tid = wp.tid()
    mat_ = mpm_state.particle_C[tid]
    mat_gt = gt_mat[tid]
    mat_gt = mat_gt - grad[tid] * dt
    # l1_diff = wp.abs(pos - pos_gt)
    mat_diff = mat_ - mat_gt
    l2 = wp.ddot(mat_diff, mat_diff)
    wp.atomic_add(loss, 0, l2)



================================================
FILE: physdreamer/warp_mpm/warp_utils.py
================================================
import warp as wp
import ctypes
from typing import Optional
from warp.torch import (
    dtype_from_torch,
    device_from_torch,
    dtype_is_compatible,
    from_torch,
)


def from_torch_safe(t, dtype=None, requires_grad=None, grad=None):
    """Wrap a PyTorch tensor to a Warp array without copying the data.

    Variant of `warp.from_torch` that additionally accepts an explicit
    gradient buffer, so an externally-managed grad tensor can be shared.

    Args:
        t (torch.Tensor): The torch tensor to wrap.
        dtype (warp.dtype, optional): The target data type of the resulting Warp array.
            Defaults to the tensor value type mapped to a Warp array value type.
        requires_grad (bool, optional): Whether the resulting array should wrap the
            tensor's gradient, if it exists (the grad tensor will be allocated otherwise).
            Defaults to the tensor's `requires_grad` value.
        grad (warp.array or torch.Tensor, optional): Explicit gradient storage to attach
            to the wrapped array; a torch.Tensor is wrapped via `from_torch` first.

    Returns:
        warp.array: The wrapped array.
    """
    if dtype is None:
        dtype = dtype_from_torch(t.dtype)
    elif not dtype_is_compatible(t.dtype, dtype):
        raise RuntimeError(f"Incompatible data types: {t.dtype} and {dtype}")

    # get size of underlying data type to compute strides
    ctype_size = ctypes.sizeof(dtype._type_)

    shape = tuple(t.shape)
    strides = tuple(s * ctype_size for s in t.stride())

    # if target is a vector or matrix type
    # then check if trailing dimensions match
    # the target type and update the shape
    if hasattr(dtype, "_shape_"):
        dtype_shape = dtype._shape_
        dtype_dims = len(dtype._shape_)
        if dtype_dims > len(shape) or dtype_shape != shape[-dtype_dims:]:
            raise RuntimeError(
                f"Could not convert Torch tensor with shape {shape} to Warp array with dtype={dtype}, ensure that source inner shape is {dtype_shape}"
            )

        # ensure the inner strides are contiguous
        stride = ctype_size
        for i in range(dtype_dims):
            if strides[-i - 1] != stride:
                raise RuntimeError(
                    f"Could not convert Torch tensor with shape {shape} to Warp array with dtype={dtype}, because the source inner strides are not contiguous"
                )
            stride *= dtype_shape[-i - 1]

        # fold the inner (vector/matrix) dims out of the array shape/strides
        shape = tuple(shape[:-dtype_dims]) or (1,)
        strides = tuple(strides[:-dtype_dims]) or (ctype_size,)

    requires_grad = t.requires_grad if requires_grad is None else requires_grad

    if grad is not None:
        if not isinstance(grad, wp.array):
            import torch

            if isinstance(grad, torch.Tensor):
                grad = from_torch(grad, dtype=dtype)
            else:
                raise ValueError(f"Invalid gradient type: {type(grad)}")
    elif requires_grad:
        # wrap the tensor gradient, allocate if necessary
        if t.grad is None:
            # allocate a zero-filled gradient tensor if it doesn't exist
            import torch

            t.grad = torch.zeros_like(t, requires_grad=False)
        grad = from_torch(t.grad, dtype=dtype)

    a = wp.types.array(
        ptr=t.data_ptr(),
        dtype=dtype,
        shape=shape,
        strides=strides,
        device=device_from_torch(t.device),
        copy=False,
        owner=False,
        grad=grad,
        requires_grad=requires_grad,
    )

    # save a reference to the source tensor, otherwise it will be deallocated
    a._tensor = t
    return a
class MyTape(wp.Tape):
    # returns the adjoint of a kernel parameter
    def get_adjoint(self, a):
        """Return the adjoint (gradient storage) for a kernel launch argument.

        Extends wp.Tape.get_adjoint to handle warp struct instances: a fresh
        struct is built whose array fields point at the member arrays' grads
        (allocating zero grads when missing) and whose non-array fields are
        copied through unchanged.
        """
        if not wp.types.is_array(a) and not isinstance(a, wp.codegen.StructInstance):
            # if input is a simple type (e.g.: float, vec3, etc) then
            # no gradient needed (we only return gradients through arrays and structs)
            return a
        elif wp.types.is_array(a) and a.grad:
            # keep track of all gradients used by the tape (for zeroing)
            # ignore the scalar loss since we don't want to clear its grad
            self.gradients[a] = a.grad
            return a.grad
        elif isinstance(a, wp.codegen.StructInstance):
            adj = a._cls()
            for name, _ in a._cls.ctype._fields_:
                if name.startswith("_"):
                    # skip private/ctypes-internal fields
                    continue
                if isinstance(a._cls.vars[name].type, wp.array):
                    arr = getattr(a, name)
                    if arr is None:
                        continue
                    if arr.grad:
                        # register the existing grad so the tape can zero it
                        grad = self.gradients[arr] = arr.grad
                    else:
                        # member has no grad buffer: use a throwaway zero array
                        grad = wp.zeros_like(arr)
                    setattr(adj, name, grad)
                else:
                    # non-array fields (scalars, vecs) are passed through as-is
                    setattr(adj, name, getattr(a, name))

            self.gradients[a] = adj
            return adj

        return None


# from https://github.com/PingchuanMa/NCLaw/blob/main/nclaw/warp/tape.py
class CondTape(object):
    """Context manager that records onto `tape` only when `cond` is True."""

    def __init__(self, tape: Optional[MyTape], cond: bool = True) -> None:
        self.tape = tape
        self.cond = cond

    def __enter__(self):
        if self.tape is not None and self.cond:
            self.tape.__enter__()

    def __exit__(self, exc_type, exc_value, traceback):
        if self.tape is not None and self.cond:
            self.tape.__exit__(exc_type, exc_value, traceback)



================================================
FILE: projects/inference/README.md
================================================
## How to run

**config file**

The config files for four scenes: carnation, aloacasia, hat, telephone is in `configs/` folder.

Please check the path for `dataset_dir` and `model_list` is correct after you download all the models.

**inference.py**

Please follow `run.sh` for common args.

If you encounter OOM error, it's very likely due to the Kmeans downsampling operations.
See line ~260 of `inference.py`: ``` python # WARNING: this is a GPU implementation, and will be OOM if the number of points is large # you might want to use a CPU implementation if the number of points is large # For CPU implementation: uncomment the following lines # from local_utils import downsample_with_kmeans # sim_xyzs = downsample_with_kmeans(sim_xyzs.detach().cpu().numpy(), num_cluster) # sim_xyzs = torch.from_numpy(sim_xyzs).float().to(device) sim_xyzs = downsample_with_kmeans_gpu(sim_xyzs, num_cluster) ``` ================================================ FILE: projects/inference/config_demo.py ================================================ import numpy as np # from model_config import ( # model_list, # camera_cfg_list, # points_list, # force_directions, # simulate_cfg, # dataset_dir, # result_dir, # exp_name, # ) import importlib.util import os class DemoParams(object): def __init__(self, scene_name): self.scene_name = scene_name base_dir = os.path.dirname(__file__) # import_file_path = ".configs." 
+ scene_name import_file_path = os.path.join(base_dir, "configs", scene_name + ".py") print("loading scene params from: ", import_file_path) spec = importlib.util.spec_from_file_location(scene_name, import_file_path) if spec is None: print(f"Could not load the spec for: {import_file_path}") module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) self.model_list = module.model_list self.camera_cfg_list = module.camera_cfg_list self.points_list = module.points_list self.force_directions = module.force_directions self.simulate_cfg = module.simulate_cfg self.dataset_dir = module.dataset_dir self.result_dir = module.result_dir self.exp_name = module.exp_name substep = self.simulate_cfg["substep"] grid_size = self.simulate_cfg["grid_size"] self.init_youngs = self.simulate_cfg["init_young"] self.downsample_scale = self.simulate_cfg["downsample_scale"] self.demo_dict = { "baseline": { "model_path": self.model_list[0], "substep": substep, "grid_size": grid_size, "name": "baseline", "camera_cfg": self.camera_cfg_list[0], "cam_id": 0, "init_youngs": self.init_youngs, "downsample_scale": self.downsample_scale, } } def get_cfg( self, demo_name=None, model_id: int = 0, eval_ys: float = 1.0, force_id: int = 0, force_mag: float = 1.0, velo_scaling: float = 3.0, point_id: int = 0, cam_id: int = 0, apply_force: bool = False, ): if demo_name == "None": demo_name = None if (demo_name is not None) and (demo_name in self.demo_dict): cfg = self.demo_dict[demo_name] else: cfg = {} cfg["model_path"] = self.model_list[model_id] cfg["center_point"] = self.points_list[point_id] cfg["force"] = self.force_directions[force_id] * force_mag cfg["camera_cfg"] = self.camera_cfg_list[cam_id] cfg["cam_id"] = cam_id cfg["force_duration"] = 0.75 cfg["force_radius"] = 0.1 cfg["substep"] = self.simulate_cfg["substep"] cfg["grid_size"] = self.simulate_cfg["grid_size"] cfg["total_time"] = 5 cfg["eval_ys"] = eval_ys cfg["velo_scaling"] = velo_scaling if demo_name is None: name = "" 
else: name = demo_name + "_" name = ( name + f"{self.scene_name}_sv_gres{cfg['grid_size']}_substep{cfg['substep']}" ) if eval_ys > 10: name += f"_eval_ys_{eval_ys}" else: name += f"_model_{model_id}" if apply_force: name += f"_force_{force_id}_mag_{force_mag}_point_{point_id}" else: name += f"_no_force_velo_{velo_scaling}" cfg["name"] = name cfg["dataset_dir"] = self.dataset_dir cfg["result_dir"] = self.result_dir cfg["init_youngs"] = self.init_youngs cfg["downsample_scale"] = self.downsample_scale return cfg ================================================ FILE: projects/inference/configs/alocasia.py ================================================ import numpy as np dataset_dir = "../../data/physics_dreamer/alocasia/" result_dir = "output/alocasia/results" exp_name = "alocasia" model_list = ["../../models/physdreamer/alocasia/model"] focus_point_list = [ np.array([-1.242875, -0.468537, -0.251450]), # botton of the background ] camera_cfg_list = [ { "type": "spiral", "focus_point": focus_point_list[0], "radius": 0.1, "up": np.array([0, 0, 1]), }, { "type": "interpolation", "start_frame": "frame_00001.png", "end_frame": "frame_00019.png", }, # real captured viewpoint { "type": "interpolation", "start_frame": "frame_00236.png", }, # another viewpoint { "type": "interpolation", "start_frame": "frame_00006.png", }, # another viewpoint { "type": "interpolation", "start_frame": "frame_00095.png", }, ] simulate_cfg = { "substep": 768, "grid_size": 64, "init_young": 1e6, "downsample_scale": 0.1, # downsample the points to speed up the simulation } points_list = [ np.array([-0.508607, -0.180955, -0.123896]), # top of the big stem np.array([-0.462227, -0.259485, -0.112966]), # top of the second stem np.array([-0.728061, -0.092306, -0.149104]), # top of the third stem np.array([-0.603330 - 0.204207 - 0.127469]), # top of the 4th stem np.array([-0.408097, -0.076293, -0.110391]), # top of the big leaf np.array([-0.391575, -0.224018, -0.052054]), # top of the second leaf 
np.array([-0.768167, -0.032502, -0.143995]), # top of the third leaf np.array([-0.633866, -0.170207, -0.103671]), # top of the 4th leaf ] force_directions = [ np.array([1.0, 0.0, 0]), np.array([0.0, 1.0, 0.0]), np.array([1.0, 0.0, 1.0]), np.array([1.0, 1.0, 0.0]), np.array([1.0, 0.0, 1.0]), np.array([0.0, 1.0, 1.0]), np.array([1.0, 1.0, 1.0]), ] force_directions = np.array(force_directions) force_directions = force_directions / np.linalg.norm(force_directions, axis=1)[:, None] ================================================ FILE: projects/inference/configs/carnation.py ================================================ import numpy as np dataset_dir = "../../data/physics_dreamer/carnations/" result_dir = "output/carnations/demos" exp_name = "carnations" model_list = [ "../../models/physdreamer/carnations/model", ] focus_point_list = [ np.array([0.189558, 2.064228, -0.216089]), # botton of the background ] camera_cfg_list = [ { "type": "spiral", "focus_point": focus_point_list[0], "radius": 0.05, "up": np.array([0, -0.5, 1]), }, { "type": "interpolation", "start_frame": "frame_00001.png", "end_frame": "frame_00022.png", }, # real capture viewpoint { "type": "interpolation", "start_frame": "frame_00219.png", }, # another render viewpoint { "type": "interpolation", "start_frame": "frame_00106.png", }, # another render viewpoint { "type": "interpolation", "start_frame": "frame_00011.png", }, ] simulate_cfg = { "substep": 768, "grid_size": 64, "init_young": 2140628.25, # save the initialized young's modulus, since optimized "downsample_scale": 0.1, # downsample the points to speed up the simulation } points_list = [ np.array([0.076272, 0.848310, 0.074134]), # top of the flower np.array([0.057208, 0.848147, -0.013685]), # middle of the flower np.array([0.134908, 0.912759, -0.023763]), # top of the stem np.array([0.169540, 0.968676, -0.095261]), # middle of the stem np.array([0.186664, 1.028284, -0.187793]), # bottom of the stem ] force_directions = [ np.array([1.0, 0.0, 
0]), np.array([0.0, 1.0, 0.0]), np.array([1.0, 0.0, 1.0]), np.array([1.0, 1.0, 0.0]), np.array([1.0, 0.0, 1.0]), np.array([0.0, 1.0, 1.0]), np.array([1.0, 1.0, 1.0]), ] force_directions = np.array(force_directions) force_directions = force_directions / np.linalg.norm(force_directions, axis=1)[:, None] force_directions_old_carnations = [ np.array([2.0, 1.0, 0]), # horizontal to left np.array([0.0, 1.0, 2.0]), # vertical to top np.array([1.0, 1.0, 1.0]), # top right to bottom left np.array([0.0, 1.0, 0.0]), # orthgonal to the screen, ] ================================================ FILE: projects/inference/configs/hat.py ================================================ import numpy as np dataset_dir = "../../data/physics_dreamer/hat/" result_dir = "output/hat/demo" exp_name = "hat" model_list = [ "../../models/physdreamer/hat/model/", ] focus_point_list = [ np.array([-0.467188, 0.067178, 0.044333]), ] camera_cfg_list = [ { "type": "interpolation", "start_frame": "frame_00001.png", "end_frame": "frame_00187.png", # or 91 }, # real captured viewpoint { "type": "interpolation", "start_frame": "frame_00217.png", }, # other selected viewpoint { "type": "interpolation", "start_frame": "frame_00001.png", }, # other selected viewpoint { "type": "interpolation", "start_frame": "frame_00001.png", }, # other selected viewpoint { "type": "interpolation", "start_frame": "frame_00079.png", }, ] simulate_cfg = { "substep": 384, "grid_size": 64, "init_young": 1e5, "downsample_scale": 0.04, } points_list = [ np.array([-0.390069, 0.139051, -0.182607]), # bottom of the hat np.array([-0.404391, 0.184975, -0.001585]), # middle of the hat np.array([-0.289375, 0.034581, 0.062010]), # left of the hat np.array([-0.352060, 0.105737, 0.009359]), # center of the hat ] force_directions = [ np.array([1.0, 0.0, 0]), np.array([0.0, 1.0, 0.0]), np.array([1.0, 0.0, 1.0]), np.array([1.0, 1.0, 0.0]), np.array([1.0, 0.0, 1.0]), np.array([0.0, 1.0, 1.0]), np.array([1.0, 1.0, 1.0]), ] force_directions = 
np.array(force_directions) force_directions = force_directions / np.linalg.norm(force_directions, axis=1)[:, None] ================================================ FILE: projects/inference/configs/telephone.py ================================================ import numpy as np exp_name = "telephone" dataset_dir = "../../data/physics_dreamer/telephone/" result_dir = "output/telephone/results" model_list = ["../../models/physdreamer/telephone/model"] focus_point_list = [ np.array([-0.401468, 0.889287, -0.116852]), # botton of the background ] camera_cfg_list = [ { "type": "spiral", "focus_point": focus_point_list[0], "radius": 0.1, "up": np.array([0, 0, 1]), }, { "type": "interpolation", "start_frame": "frame_00001.png", "end_frame": "frame_00019.png", }, # real video viewpoint { "type": "interpolation", "start_frame": "frame_00190.png", }, # other selected viewpoint { "type": "interpolation", "start_frame": "frame_00037.png", }, # other selected viewpoint { "type": "interpolation", "start_frame": "frame_00090.png", }, ] simulate_cfg = { "substep": 256, "grid_size": 96, "init_young": 1e5, "downsample_scale": 0.1, # downsample the points to speed up the simulation } points_list = [ np.array([-0.417240, 0.907780, -0.379144]), # bottom of the lines. 
np.array([-0.374907, 0.796209, -0.178907]), # middle of the right lines np.array([-0.414156, 0.901207, -0.182275]), # middle of the left lines ] force_directions = [ np.array([1.0, 0.0, 0]), np.array([0.0, 1.0, 0.0]), np.array([1.0, 0.0, 1.0]), np.array([1.0, 1.0, 0.0]), np.array([1.0, 0.0, 1.0]), np.array([0.0, 1.0, 1.0]), np.array([1.0, 1.0, 1.0]), ] force_directions = np.array(force_directions) force_directions = force_directions / np.linalg.norm(force_directions, axis=1)[:, None] ================================================ FILE: projects/inference/demo.py ================================================ import argparse import os import numpy as np import torch from tqdm import tqdm import point_cloud_utils as pcu from accelerate.utils import ProjectConfiguration from accelerate.logging import get_logger from accelerate.utils import set_seed from accelerate import Accelerator, DistributedDataParallelKwargs import numpy as np import logging import argparse import torch import os from physdreamer.utils.config import create_config import numpy as np from physdreamer.gaussian_3d.scene import GaussianModel from physdreamer.data.datasets.multiview_dataset import MultiviewImageDataset from physdreamer.data.datasets.multiview_video_dataset import ( MultiviewVideoDataset, camera_dataset_collate_fn, ) from physdreamer.data.datasets.multiview_dataset import ( camera_dataset_collate_fn as camera_dataset_collate_fn_img, ) from typing import NamedTuple from physdreamer.utils.img_utils import compute_psnr, compute_ssim from physdreamer.warp_mpm.mpm_data_structure import ( MPMStateStruct, MPMModelStruct, ) from physdreamer.warp_mpm.mpm_solver_diff import MPMWARPDiff from physdreamer.warp_mpm.gaussian_sim_utils import get_volume import warp as wp from local_utils import ( cycle, create_spatial_fields, find_far_points, apply_grid_bc_w_freeze_pts, add_constant_force, downsample_with_kmeans_gpu, render_gaussian_seq_w_mask_with_disp, 
render_gaussian_seq_w_mask_cam_seq_with_force_with_disp, get_camera_trajectory, render_gaussian_seq_w_mask_with_disp_for_figure, ) from config_demo import DemoParams from physdreamer.utils.io_utils import save_video_mediapy logger = get_logger(__name__, log_level="INFO") def create_dataset(args): res = [576, 1024] video_dir_name = "videos" # dataset = MultiviewVideoDataset( # args.dataset_dir, # use_white_background=False, # resolution=res, # scale_x_angle=1.0, # video_dir_name=video_dir_name, # ) dataset = MultiviewImageDataset( args.dataset_dir, use_white_background=False, resolution=res, scale_x_angle=1.0, load_imgs=False, ) test_dataset = MultiviewImageDataset( args.dataset_dir, use_white_background=False, resolution=res, # use_index=[0], scale_x_angle=1.0, fitler_with_renderd=False, load_imgs=False, ) print("len of test dataset", len(test_dataset)) return dataset, test_dataset class Trainer: def __init__(self, args): self.args = args logging_dir = os.path.join(args.output_dir, "debug_demo") accelerator_project_config = ProjectConfiguration(logging_dir=logging_dir) ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) accelerator = Accelerator( mixed_precision="no", log_with="wandb", project_config=accelerator_project_config, kwargs_handlers=[ddp_kwargs], ) logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state, main_process_only=False) set_seed(args.seed + accelerator.process_index) demo_cfg = DemoParams(args.scene_name).get_cfg( args.demo_name, args.model_id, args.eval_ys, args.force_id, args.force_mag, args.velo_scaling, args.point_id, args.cam_id, args.apply_force, ) self.args.dataset_dir = demo_cfg["dataset_dir"] self.demo_cfg = demo_cfg # setup the dataset dataset, test_dataset = create_dataset(args) # will be used when synthesize camera trajectory self.test_dataset = test_dataset self.dataset = dataset dataset_dir = 
test_dataset.data_dir gaussian_path = os.path.join(dataset_dir, "point_cloud.ply") self.setup_render( args, gaussian_path, white_background=True, ) self.args.substep = demo_cfg["substep"] self.args.grid_size = demo_cfg["grid_size"] self.args.checkpoint_path = demo_cfg["model_path"] self.demo_cfg = demo_cfg self.num_frames = int(args.num_frames) dataloader = torch.utils.data.DataLoader( dataset, batch_size=1, shuffle=False, drop_last=False, num_workers=0, # collate_fn=camera_dataset_collate_fn, collate_fn=camera_dataset_collate_fn_img, ) dataloader = accelerator.prepare(dataloader) # why be used in self.compute_metric self.dataloader = cycle(dataloader) self.accelerator = accelerator # init traiable params E_nu_list = self.init_trainable_params() for p in E_nu_list: p.requires_grad = False self.E_nu_list = E_nu_list # init simulation enviroment self.setup_simulation(dataset_dir, grid_size=args.grid_size) if args.checkpoint_path == "None": args.checkpoint_path = None if args.checkpoint_path is not None: self.load(args.checkpoint_path) self.sim_fields, self.velo_fields = accelerator.prepare( self.sim_fields, self.velo_fields ) def init_trainable_params( self, ): # init young modulus and poisson ratio # from pre-optimized; gres32 step 128. 300 epoch. lr 10.0 psnr: 27.72028086735652. Stop at 100epoch young_numpy = np.array([self.demo_cfg["init_youngs"]]).astype(np.float32) young_modulus = torch.tensor(young_numpy, dtype=torch.float32).to( self.accelerator.device ) poisson_numpy = np.random.uniform(0.1, 0.4) poisson_ratio = torch.tensor(poisson_numpy, dtype=torch.float32).to( self.accelerator.device ) trainable_params = [young_modulus, poisson_ratio] print( "init young modulus: ", young_modulus.item(), "poisson ratio: ", poisson_ratio.item(), ) return trainable_params def setup_simulation(self, dataset_dir, grid_size=100): """ 1. load internal filled points. 2. pointcloud downsample with KMeans 3. 
Setup MPM simulation environment """ device = "cuda:{}".format(self.accelerator.process_index) xyzs = self.render_params.gaussians.get_xyz.detach().clone() sim_xyzs = xyzs[self.sim_mask_in_raw_gaussian, :] # scale, and shift pos_max = sim_xyzs.max() pos_min = sim_xyzs.min() scale = (pos_max - pos_min) * 1.8 shift = -pos_min + (pos_max - pos_min) * 0.25 self.scale, self.shift = scale, shift print("scale, shift", scale, shift) # load internal filled points. # if exists, we will use it to fill in the internal points, but not for rendering # we keep track of render_mask_in_sim_pts, to distinguish the orignal points from the internal filled points filled_in_points_path = os.path.join(dataset_dir, "internal_filled_points.ply") if os.path.exists(filled_in_points_path): fill_xyzs = pcu.load_mesh_v(filled_in_points_path) # [n, 3] fill_xyzs = fill_xyzs[ np.random.choice( fill_xyzs.shape[0], int(fill_xyzs.shape[0] * 1.0), replace=False ) ] fill_xyzs = torch.from_numpy(fill_xyzs).float().to("cuda") self.fill_xyzs = fill_xyzs print( "loaded {} internal filled points from: ".format(fill_xyzs.shape[0]), filled_in_points_path, ) render_mask_in_sim_pts = torch.cat( [ torch.ones_like(sim_xyzs[:, 0]).bool(), torch.zeros_like(fill_xyzs[:, 0]).bool(), ], dim=0, ).to(device) sim_xyzs = torch.cat([sim_xyzs, fill_xyzs], dim=0) self.render_mask = render_mask_in_sim_pts else: self.fill_xyzs = None self.render_mask = torch.ones_like(sim_xyzs[:, 0]).bool().to(device) sim_xyzs = (sim_xyzs + shift) / scale sim_aabb = torch.stack( [torch.min(sim_xyzs, dim=0)[0], torch.max(sim_xyzs, dim=0)[0]], dim=0 ) # This AABB is used to constraint the material fields and velocity fields. 
sim_aabb = ( sim_aabb - torch.mean(sim_aabb, dim=0, keepdim=True) ) * 1.2 + torch.mean(sim_aabb, dim=0, keepdim=True) print("simulation aabb: ", sim_aabb) # point cloud resample with kmeans if "downsample_scale" in self.demo_cfg: downsample_scale = self.demo_cfg["downsample_scale"] else: downsample_scale = args.downsample_scale if downsample_scale > 0 and downsample_scale < 1.0: print("Downsample with ratio: ", downsample_scale) num_cluster = int(sim_xyzs.shape[0] * downsample_scale) # WARNING: this is a GPU implementation, and will be OOM if the number of points is large # you might want to use a CPU implementation if the number of points is large # For CPU implementation: uncomment the following lines # from local_utils import downsample_with_kmeans # sim_xyzs = downsample_with_kmeans(sim_xyzs.detach().cpu().numpy(), num_cluster) # sim_xyzs = torch.from_numpy(sim_xyzs).float().to(device) sim_xyzs = downsample_with_kmeans_gpu(sim_xyzs, num_cluster) sim_gaussian_pos = self.render_params.gaussians.get_xyz.detach().clone()[ self.sim_mask_in_raw_gaussian, : ] sim_gaussian_pos = (sim_gaussian_pos + shift) / scale # record top k index for each point, to interpolate positions and rotations later cdist = torch.cdist(sim_gaussian_pos, sim_xyzs) * -1.0 _, top_k_index = torch.topk(cdist, self.args.top_k, dim=-1) self.top_k_index = top_k_index print("Downsampled to: ", sim_xyzs.shape[0], "by", downsample_scale) # Compute the volume of each particle points_volume = get_volume(sim_xyzs.detach().cpu().numpy()) num_particles = sim_xyzs.shape[0] sim_aabb = torch.stack( [torch.min(sim_xyzs, dim=0)[0], torch.max(sim_xyzs, dim=0)[0]], dim=0 ) sim_aabb = ( sim_aabb - torch.mean(sim_aabb, dim=0, keepdim=True) ) * 1.2 + torch.mean(sim_aabb, dim=0, keepdim=True) # Initialize MPM state and model wp.init() wp.config.mode = "debug" wp.config.verify_cuda = True mpm_state = MPMStateStruct() mpm_state.init(num_particles, device=device, requires_grad=False) self.particle_init_position = 
sim_xyzs.clone() mpm_state.from_torch( self.particle_init_position.clone(), torch.from_numpy(points_volume).float().to(device).clone(), None, device=device, requires_grad=False, n_grid=grid_size, grid_lim=1.0, ) mpm_model = MPMModelStruct() mpm_model.init(num_particles, device=device, requires_grad=False) # grid from [0.0 - 1.0] mpm_model.init_other_params(n_grid=grid_size, grid_lim=1.0, device=device) material_params = { # select from jel "material": "jelly", # "jelly", "metal", "sand", "foam", "snow", "plasticine", "neo-hookean" "g": [0.0, 0.0, 0.0], "density": 2000, # kg / m^3 "grid_v_damping_scale": 1.1, # no damping if > 1.0 } self.material_name = material_params["material"] mpm_solver = MPMWARPDiff( num_particles, n_grid=grid_size, grid_lim=1.0, device=device ) mpm_solver.set_parameters_dict(mpm_model, mpm_state, material_params) self.mpm_state, self.mpm_model, self.mpm_solver = ( mpm_state, mpm_model, mpm_solver, ) # setup boundary condition: moving_pts_path = os.path.join(dataset_dir, "moving_part_points.ply") assert os.path.exists( moving_pts_path ), "We need to segment out the moving part to initialize the boundary condition" moving_pts = pcu.load_mesh_v(moving_pts_path) moving_pts = torch.from_numpy(moving_pts).float().to(device) moving_pts = (moving_pts + shift) / scale freeze_mask = find_far_points( sim_xyzs, moving_pts, thres=0.5 / grid_size ).bool() freeze_pts = sim_xyzs[freeze_mask, :] grid_freeze_mask = apply_grid_bc_w_freeze_pts( grid_size, 1.0, freeze_pts, mpm_solver ) self.freeze_mask = freeze_mask num_freeze_pts = self.freeze_mask.sum() print( "num freeze pts in total", num_freeze_pts.item(), "num moving pts", num_particles - num_freeze_pts.item(), ) # init fields for simulation, e.g. density, external force, etc. 
# padd init density, youngs, density = ( torch.ones_like(self.particle_init_position[..., 0]) * material_params["density"] ) youngs_modulus = ( torch.ones_like(self.particle_init_position[..., 0]) * self.E_nu_list[0].detach() ) poisson_ratio = torch.ones_like(self.particle_init_position[..., 0]) * 0.3 self.density = density self.young_modulus = youngs_modulus self.poisson_ratio = poisson_ratio # set density, youngs, poisson mpm_state.reset_density( density.clone(), torch.ones_like(density).type(torch.int), device, update_mass=True, ) mpm_solver.set_E_nu_from_torch( mpm_model, youngs_modulus.clone(), poisson_ratio.clone(), device ) mpm_solver.prepare_mu_lam(mpm_model, mpm_state, device) self.sim_fields = create_spatial_fields(self.args, 1, sim_aabb) self.sim_fields.train() self.args.sim_res = 24 # self.velo_fields = create_velocity_model(self.args, sim_aabb) self.velo_fields = create_spatial_fields( self.args, 3, sim_aabb, add_entropy=False ) self.velo_fields.train() def add_constant_force(self, center_point, radius, force, dt, start_time, end_time): xyzs = self.particle_init_position.clone() * self.scale - self.shift device = "cuda:{}".format(self.accelerator.process_index) add_constant_force( self.mpm_solver, self.mpm_state, xyzs, center_point, radius, force, dt, start_time, end_time, device=device, ) def get_simulation_input(self, device): """ Outs: All padded density: [N] young_modulus: [N] poisson_ratio: [N] velocity: [N, 3] query_mask: [N] particle_F: [N, 3, 3] particle_C: [N, 3, 3] """ density, youngs_modulus, ret_poisson = self.get_material_params(device) initial_position_time0 = self.particle_init_position.clone() query_mask = torch.logical_not(self.freeze_mask) query_pts = initial_position_time0[query_mask, :] velocity = self.velo_fields(query_pts)[..., :3] # scaling lr is similar to scaling the learning rate of velocity fields. 
        # fixed output scale; equivalent to scaling the velocity-field lr
        velocity = velocity * 0.1

        # not padded yet
        ret_velocity = torch.zeros_like(initial_position_time0)
        ret_velocity[query_mask, :] = velocity

        # init F as identity matrix, and C as zero matrix
        I_mat = torch.eye(3, dtype=torch.float32).to(device)
        particle_F = torch.repeat_interleave(
            I_mat[None, ...], initial_position_time0.shape[0], dim=0
        )
        particle_C = torch.zeros_like(particle_F)
        return (
            density,
            youngs_modulus,
            ret_poisson,
            ret_velocity,
            query_mask,
            particle_F,
            particle_C,
        )

    def get_material_params(self, device):
        """Query the material field for per-particle simulation parameters.

        Outs:
            density: [N]
            young_modulus: [N]
            poisson_ratio: [N]
        """
        initial_position_time0 = self.particle_init_position.detach()
        # query the materials params of all particles
        query_pts = initial_position_time0
        sim_params = self.sim_fields(query_pts)
        # scale the output of the network, similar to scale the learning rate
        sim_params = sim_params * 1000

        # field output is a residual on top of the constant init Young's modulus
        youngs_modulus = self.young_modulus.detach().clone()
        youngs_modulus += sim_params[..., 0]
        # clamp youngs modulus to a plausible range
        youngs_modulus = torch.clamp(youngs_modulus, 1.0, 5e8)

        density = self.density.detach().clone()
        ret_poisson = self.poisson_ratio.detach().clone()
        return density, youngs_modulus, ret_poisson

    def load(self, checkpoint_dir):
        """Load velocity / material field weights from `checkpoint_dir`.

        Missing checkpoint files are skipped with a message.
        """
        name_list = [
            "velo_fields",
            "sim_fields",
        ]
        for i, model in enumerate([self.velo_fields, self.sim_fields]):
            model_name = name_list[i]
            model_path = os.path.join(checkpoint_dir, model_name + ".pt")
            if os.path.exists(model_path):
                print("=> loading: ", model_path)
                model.load_state_dict(torch.load(model_path))
            else:
                print("=> not found: ", model_path)

    def setup_render(self, args, gaussian_path, white_background=True):
        """
        1. Load 3D Gaussians in gaussian_path
        2. Prepare rendering params in self.render_params
        3. Load foreground points stored in the same directory as gaussian_path, with name "clean_object_points.ply"
            Only foreground points is used for simulation.
            We will track foreground points with mask: self.sim_mask_in_raw_gaussian
        """
        # setup gaussians
        # NOTE: the attributes below carry no annotations, so they are plain
        # class attributes (defaults), not NamedTuple fields; RenderPipe() takes
        # no constructor arguments.
        class RenderPipe(NamedTuple):
            convert_SHs_python = False
            compute_cov3D_python = False
            debug = False

        class RenderParams(NamedTuple):
            render_pipe: RenderPipe
            bg_color: torch.Tensor
            gaussians: GaussianModel
            camera_list: list

        gaussians = GaussianModel(3)
        camera_list = self.dataset.test_camera_list
        gaussians.load_ply(gaussian_path)
        gaussians.detach_grad()
        print(
            "load gaussians from: {}".format(gaussian_path),
            "... num gaussians: ",
            gaussians._xyz.shape[0],
        )
        bg_color = [1, 1, 1] if white_background else [0, 0, 0]
        background = torch.tensor(bg_color, dtype=torch.float32, device="cuda")

        render_pipe = RenderPipe()
        render_params = RenderParams(
            render_pipe=render_pipe,
            bg_color=background,
            gaussians=gaussians,
            camera_list=camera_list,
        )
        self.render_params = render_params

        # segment foreground objects. Foreground points is stored in "clean_object_points.ply",
        # only foreground points is used for simulation
        # we will track foreground points with mask: self.sim_mask_in_raw_gaussian
        gaussian_dir = os.path.dirname(gaussian_path)
        clean_points_path = os.path.join(gaussian_dir, "clean_object_points.ply")
        assert os.path.exists(
            clean_points_path
        ), "We need to segment out the forground points to initialize the simulation"
        clean_xyzs = pcu.load_mesh_v(clean_points_path)
        clean_xyzs = torch.from_numpy(clean_xyzs).float().to("cuda")
        self.clean_xyzs = clean_xyzs
        print(
            "loaded {} clean points from: ".format(clean_xyzs.shape[0]),
            clean_points_path,
        )

        # gaussians far from every clean point are background (not simulated)
        not_sim_maks = find_far_points(gaussians._xyz, clean_xyzs, thres=0.01).bool()
        sim_mask_in_raw_gaussian = torch.logical_not(not_sim_maks)  # [N]
        self.sim_mask_in_raw_gaussian = sim_mask_in_raw_gaussian

    @torch.no_grad()
    def demo(
        self,
        velo_scaling=5.0,
        num_sec=3.0,
        eval_ys=1.0,
        static_camera=False,
        apply_force=False,
        save_name="demo",
    ):
        """Run the interactive demo: simulate, render, and save an mp4."""
        result_dir = self.demo_cfg["result_dir"]
        # demo config overrides the keyword defaults when present
        if "eval_ys" in self.demo_cfg:
            eval_ys = self.demo_cfg["eval_ys"]
        if "velo_scaling" in self.demo_cfg:
            velo_scaling = self.demo_cfg["velo_scaling"]
        save_name = self.demo_cfg["name"]
        if save_name.startswith("baseline"):
            # baseline experiments only report metrics, no demo video
            self.compute_metric(save_name, result_dir)
            return

        # avoid re-run for experiment with the same name
        os.makedirs(result_dir, exist_ok=True)
        pos_path = os.path.join(result_dir, save_name + "_pos.npy")
        if os.path.exists(pos_path):
            pos_array = np.load(pos_path)
        else:
            pos_array = None

        device = "cuda:0"
        data = next(self.dataloader)
        cam = data["cam"][0]

        substep = self.args.substep  # 1e-4
        youngs_modulus = None
        self.sim_fields.eval()
        self.velo_fields.eval()
        (
            density,
            youngs_modulus_,
            poisson,
            init_velocity,
            query_mask,
            particle_F,
            particle_C,
        ) = self.get_simulation_input(device)
        poisson = self.E_nu_list[1].detach().clone()  # override poisson

        if eval_ys < 10:
            youngs_modulus = youngs_modulus_
        else:
            # assign eval_ys to all particles
            youngs_modulus = torch.ones_like(youngs_modulus_) * eval_ys

        # step-1 Setup simulation parameters. External force, or initial velocity.
        # if --apply_force, we will apply a constant force to points close to the force center
        # otherwise, we will load the initial velocity from pretrained models, and scale it by velo_scaling.
        # timestep setup: 30 fps output; `substep` MPM substeps per frame
        delta_time = 1.0 / 30  # 30 fps
        substep_size = delta_time / substep
        num_substeps = int(substep)

        init_xyzs = self.particle_init_position.clone()
        init_velocity[query_mask, :] = init_velocity[query_mask, :] * velo_scaling

        if apply_force:
            # force mode: zero the learned velocity and instead push particles
            # near the configured force center for a fixed duration
            init_velocity = torch.zeros_like(init_velocity)
            center_point = (
                torch.from_numpy(self.demo_cfg["center_point"]).to(device).float()
            )
            force = torch.from_numpy(self.demo_cfg["force"]).to(device).float()
            force_duration = self.demo_cfg["force_duration"]  # sec
            force_duration_steps = int(force_duration / delta_time)
            # apply force to points within the radius of the center point
            force_radius = self.demo_cfg["force_radius"]
            self.add_constant_force(
                center_point, force_radius, force, delta_time, 0.0, force_duration
            )

            # prepare to render force in simulated videos:
            # find the closest point to the force center, and will use it to render the force
            xyzs = self.render_params.gaussians.get_xyz.detach().clone()
            dist = torch.norm(xyzs - center_point.unsqueeze(dim=0), dim=-1)
            closest_idx = torch.argmin(dist)
            closest_xyz = xyzs[closest_idx, :]
            render_force = force / force.norm() * 0.1
            do_render_force = True
        else:
            do_render_force = False

        # step-3: simulation or load the simulated sequence computed before
        # with the same scene_name and demo_name
        if pos_array is None or save_name == "debug":
            self.mpm_state.reset_density(
                density.clone(), query_mask, device, update_mass=True
            )
            self.mpm_solver.set_E_nu_from_torch(
                self.mpm_model, youngs_modulus.clone(), poisson.clone(), device
            )
            self.mpm_solver.prepare_mu_lam(self.mpm_model, self.mpm_state, device)
            self.mpm_state.continue_from_torch(
                init_xyzs,
                init_velocity,
                particle_F,
                particle_C,
                device=device,
                requires_grad=False,
            )

            # record drive points sequence (in the original world frame)
            render_pos_list = [(init_xyzs.clone() * self.scale) - self.shift]
            prev_state = self.mpm_state
            for i in tqdm(range(int((30) * num_sec))):
                # iterate over substeps for each frame
                for substep_local in range(num_substeps):
                    next_state = prev_state.partial_clone(requires_grad=False)
                    self.mpm_solver.p2g2p_differentiable(
                        self.mpm_model,
                        prev_state,
                        next_state,
                        substep_size,
                        device=device,
                    )
                    prev_state = next_state
                pos = wp.to_torch(next_state.particle_x).clone()
                # undo scaling and shifting
                pos = (pos * self.scale) - self.shift
                render_pos_list.append(pos)

            # save the sequence of drive points
            numpy_pos = torch.stack(render_pos_list, dim=0).detach().cpu().numpy()
            np.save(pos_path, numpy_pos)
        else:
            # reuse the cached particle trajectory from a previous run
            render_pos_list = []
            for i in range(pos_array.shape[0]):
                pos = pos_array[i, ...]
                render_pos_list.append(torch.from_numpy(pos).to(device))

        num_pos = len(render_pos_list)
        init_pos = render_pos_list[0].clone()
        pos_diff_list = [_ - init_pos for _ in render_pos_list]

        if not static_camera:
            interpolated_cameras = get_camera_trajectory(
                cam, num_pos, self.demo_cfg["camera_cfg"], self.test_dataset
            )
        else:
            interpolated_cameras = [cam] * num_pos

        if not do_render_force:
            video_array, moving_part_video = (
                render_gaussian_seq_w_mask_with_disp_for_figure(
                    interpolated_cameras,
                    self.render_params,
                    init_pos,
                    self.top_k_index,
                    pos_diff_list,
                    self.sim_mask_in_raw_gaussian,
                )
            )
            # [T, 3, H, W] float in [0, 1] -> [T, H, W, 3] uint8
            video_numpy = video_array.detach().cpu().numpy() * 255
            video_numpy = np.clip(video_numpy, 0, 255).astype(np.uint8)
            video_numpy = np.transpose(video_numpy, [0, 2, 3, 1])
            moving_part_video = moving_part_video.detach().cpu().numpy() * 255
            moving_part_video = np.clip(moving_part_video, 0, 255).astype(np.uint8)
            moving_part_video = np.transpose(moving_part_video, [0, 2, 3, 1])
        else:
            # render with a force arrow overlaid for the first
            # force_duration_steps frames
            video_numpy = render_gaussian_seq_w_mask_cam_seq_with_force_with_disp(
                interpolated_cameras,
                self.render_params,
                init_pos,
                self.top_k_index,
                pos_diff_list,
                self.sim_mask_in_raw_gaussian,
                closest_idx,
                render_force,
                force_duration_steps,
            )
            video_numpy = np.transpose(video_numpy, [0, 2, 3, 1])

        if not static_camera:
            save_name = (
                save_name
                + "_movingcamera"
                + "_camid_{}".format(self.demo_cfg["cam_id"])
            )
        save_name = save_name + "_" + self.demo_cfg["name"]
        save_path = os.path.join(result_dir, save_name + ".mp4")
        print("save video to ", save_path)
        save_video_mediapy(video_numpy, save_path, fps=30)
        # save_path = save_path.replace(".mp4", "_moving_part.mp4")
        # save_video_mediapy(moving_part_video, save_path, fps=30)

    def compute_metric(self, exp_name, result_dir):
        """Simulate with a static camera, save the video, and report PSNR/SSIM
        against the ground-truth clip."""
        data = next(self.dataloader)
        cam = data["cam"][0]

        # step-2 simulation part
        substep = self.args.substep  # 1e-4
        self.sim_fields.eval()
        self.velo_fields.eval()
        device = "cuda:{}".format(self.accelerator.process_index)
        (
            density,
            youngs_modulus,
            poisson,
            init_velocity,
            query_mask,
            particle_F,
            particle_C,
        ) = self.get_simulation_input(device)
        poisson = self.E_nu_list[1].detach().clone()  # override poisson

        # delta_time = 1.0 / (self.num_frames - 1)
        delta_time = 1.0 / 30  # 30 fps
        substep_size = delta_time / substep
        num_substeps = int(delta_time / substep_size)

        init_xyzs = self.particle_init_position.clone()
        # no velo_scaling here (unlike demo); kept as-is
        init_velocity[query_mask, :] = init_velocity[query_mask, :]

        self.mpm_state.reset_density(
            density.clone(), query_mask, device, update_mass=True
        )
        self.mpm_solver.set_E_nu_from_torch(
            self.mpm_model, youngs_modulus.clone(), poisson.clone(), device
        )
        self.mpm_solver.prepare_mu_lam(self.mpm_model, self.mpm_state, device)
        self.mpm_state.continue_from_torch(
            init_xyzs,
            init_velocity,
            particle_F,
            particle_C,
            device=device,
            requires_grad=False,
        )

        pos_list = [(init_xyzs.clone() * self.scale) - self.shift]
        prev_state = self.mpm_state
        for i in tqdm(range(self.args.num_frames - 1)):
            for substep_local in range(num_substeps):
                next_state = prev_state.partial_clone(requires_grad=False)
                self.mpm_solver.p2g2p_differentiable(
                    self.mpm_model,
                    prev_state,
                    next_state,
                    substep_size,
                    device=device,
                )
                prev_state = next_state
            pos = wp.to_torch(next_state.particle_x).clone()
            # pos = self.mpm_solver.export_particle_x_to_torch().clone()
            pos = (pos * self.scale) - self.shift
            pos_list.append(pos)

        # setup the camera trajectories (copy the static camera for n frames)
        init_pos = pos_list[0].clone()
pos_diff_list = [_ - init_pos for _ in pos_list] interpolated_cameras = [cam] * len(pos_list) video_array = render_gaussian_seq_w_mask_with_disp( interpolated_cameras, self.render_params, init_pos, self.top_k_index, pos_diff_list, self.sim_mask_in_raw_gaussian, ) video_numpy = video_array.detach().cpu().numpy() * 255 video_numpy = np.clip(video_numpy, 0, 255).astype(np.uint8) video_numpy = np.transpose(video_numpy, [0, 2, 3, 1]) os.makedirs(result_dir, exist_ok=True) save_path = os.path.join( result_dir, exp_name + "_jelly_densi2k_video_substep_{}_grid_{}".format( substep, self.args.grid_size ) + ".mp4", ) save_path = save_path.replace(".gif", ".mp4") save_video_mediapy(video_numpy, save_path, fps=25) gt_videos = data["video_clip"][0, 0 : self.num_frames, ...] ssim = compute_ssim(video_array, gt_videos) psnr = compute_psnr(video_array, gt_videos) print("psnr for each frame: ", psnr) mean_psnr = psnr.mean().item() print("mean psnr: ", mean_psnr, "mean ssim: ", ssim.item()) def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--model", type=str, default="se3_field") parser.add_argument("--feat_dim", type=int, default=64) parser.add_argument("--num_decoder_layers", type=int, default=3) parser.add_argument("--decoder_hidden_size", type=int, default=64) # resolution of velocity fields parser.add_argument("--spatial_res", type=int, default=32) parser.add_argument("--zero_init", type=bool, default=True) parser.add_argument("--num_frames", type=str, default=14) # resolution of material fields parser.add_argument("--sim_res", type=int, default=8) parser.add_argument("--sim_output_dim", type=int, default=1) parser.add_argument("--downsample_scale", type=float, default=0.1) parser.add_argument("--top_k", type=int, default=8) # Logging and checkpointing parser.add_argument("--output_dir", type=str, default="../../output/inverse_sim") parser.add_argument("--seed", type=int, default=0) # demo parameters. 
related to parameters specified in configs/{scene_name}.py parser.add_argument("--scene_name", type=str, default="carnation") parser.add_argument("--demo_name", type=str, default="inference_demo") parser.add_argument("--model_id", type=int, default=0) # if eval_ys > 10. Then all the youngs modulus is set to eval_ys homogeneously parser.add_argument("--eval_ys", type=float, default=1.0) parser.add_argument("--force_id", type=int, default=1) parser.add_argument("--force_mag", type=float, default=1.0) parser.add_argument("--velo_scaling", type=float, default=5.0) parser.add_argument("--point_id", type=int, default=0) parser.add_argument("--apply_force", action="store_true", default=False) parser.add_argument("--cam_id", type=int, default=0) parser.add_argument("--static_camera", action="store_true", default=False) args, extra_args = parser.parse_known_args() return args if __name__ == "__main__": args = parse_args() trainer = Trainer(args) trainer.demo( velo_scaling=args.velo_scaling, eval_ys=args.eval_ys, static_camera=args.static_camera, apply_force=args.apply_force, save_name=args.demo_name, ) ================================================ FILE: projects/inference/local_utils.py ================================================ import os import torch from jaxtyping import Float, Int, Shaped from torch import Tensor from time import time from omegaconf import OmegaConf from physdreamer.fields.se3_field import TemporalKplanesSE3fields from physdreamer.fields.triplane_field import TriplaneFields, TriplaneFieldsWithEntropy from physdreamer.gaussian_3d.gaussian_renderer.render import ( render_gaussian, render_arrow_in_screen, ) from physdreamer.gaussian_3d.gaussian_renderer.flow_depth_render import ( render_flow_depth_w_gaussian, ) import cv2 import numpy as np from sklearn.cluster import KMeans from time import time from physdreamer.gaussian_3d.utils.rigid_body_utils import ( get_rigid_transform, matrix_to_quaternion, quaternion_multiply, ) def cycle(dl: 
torch.utils.data.DataLoader): while True: for data in dl: yield data def load_motion_model(model, checkpoint_path): model_path = os.path.join(checkpoint_path, "model.pt") model.load_state_dict(torch.load(model_path)) print("load model from: ", model_path) return model def create_spatial_fields( args, output_dim, aabb: Float[Tensor, "2 3"], add_entropy=True ): sp_res = args.sim_res resolutions = [sp_res, sp_res, sp_res] reduce = "sum" model = TriplaneFields( aabb, resolutions, feat_dim=32, init_a=0.1, init_b=0.5, reduce=reduce, num_decoder_layers=2, decoder_hidden_size=32, output_dim=output_dim, zero_init=args.zero_init, ) if args.zero_init: print("=> zero init the last layer for Spatial MLP") return model def create_motion_model( args, aabb: Float[Tensor, "2 3"], num_frames=None, ): assert args.model in ["se3_field"] sp_res = args.spatial_res if num_frames is None: num_frames = args.num_frames resolutions = [sp_res, sp_res, sp_res, (num_frames) // 2 + 1] # resolutions = [64, 64, 64, num_frames // 2 + 1] reduce = "sum" model = TemporalKplanesSE3fields( aabb, resolutions, feat_dim=args.feat_dim, init_a=0.1, init_b=0.5, reduce=reduce, num_decoder_layers=args.num_decoder_layers, decoder_hidden_size=args.decoder_hidden_size, zero_init=args.zero_init, ) if args.zero_init: print("=> zero init the last layer for MLP") return model def create_velocity_model( args, aabb: Float[Tensor, "2 3"], ): from physdreamer.fields.offset_field import TemporalKplanesOffsetfields sp_res = args.sim_res resolutions = [sp_res, sp_res, sp_res, (args.num_frames) // 2 + 1] reduce = "sum" model = TemporalKplanesOffsetfields( aabb, resolutions, feat_dim=32, init_a=0.1, init_b=0.5, reduce=reduce, num_decoder_layers=2, decoder_hidden_size=32, zero_init=args.zero_init, ) if args.zero_init: print("=> zero init the last layer for velocity MLP") return model def create_svd_model(model_name="svd_full", ckpt_path=None): state = dict() cfg_path_dict = { "svd_full": "svd_configs/svd_full_decoder.yaml", } 
config = cfg_path_dict[model_name] config = OmegaConf.load(config) if ckpt_path is not None: # overwrite config. config.model.params.ckpt_path = ckpt_path s_time = time() # model will automatically load when create from physdreamer.utils.svd_helpper import load_model_from_config model, msg = load_model_from_config(config, None) state["config"] = config print(f"Loading svd model takes {time() - s_time} seconds") return model, state class LinearStepAnneal(object): # def __init__(self, total_iters, start_state=[0.02, 0.98], end_state=[0.50, 0.98]): def __init__( self, total_iters, start_state=[0.02, 0.98], end_state=[0.02, 0.98], plateau_iters=-1, warmup_step=300, ): self.total_iters = total_iters if plateau_iters < 0: plateau_iters = int(total_iters * 0.2) if warmup_step <= 0: warmup_step = 0 self.total_iters = max(total_iters - plateau_iters - warmup_step, 10) self.start_state = start_state self.end_state = end_state self.warmup_step = warmup_step def compute_state(self, cur_iter): if self.warmup_step > 0: cur_iter = max(0, cur_iter - self.warmup_step) if cur_iter >= self.total_iters: return self.end_state ret = [] for s, e in zip(self.start_state, self.end_state): ret.append(s + (e - s) * cur_iter / self.total_iters) return ret def setup_boundary_condition( xyzs_over_time: torch.Tensor, mpm_solver, mpm_state, num_filled=0 ): init_velocity = xyzs_over_time[1] - xyzs_over_time[0] init_velocity_mag = torch.norm(init_velocity, dim=-1) # 10% of the velocity velocity_thres = torch.quantile(init_velocity_mag, 0.1, dim=0) # [n_particles]. 
1 for freeze, 0 for moving freeze_mask = init_velocity_mag < velocity_thres freeze_mask = freeze_mask.type(torch.int) if num_filled > 0: freeze_mask = torch.cat( [freeze_mask, freeze_mask.new_zeros(num_filled).type(torch.int)], dim=0 ) num_freeze_pts = freeze_mask.sum() print("num freeze pts from static points", num_freeze_pts.item()) free_velocity = torch.zeros_like(init_velocity[0]) # [3] in device mpm_solver.enforce_particle_velocity_by_mask( mpm_state, freeze_mask, free_velocity, start_time=0, end_time=100000 ) return freeze_mask def setup_plannar_boundary_condition( xyzs_over_time: torch.Tensor, mpm_solver, mpm_state, gaussian_xyz, plane_mean, plane_normal, thres=0.2, ): """ plane_mean and plane_normal are in original coordinate, not being normalized Args: xyzs_over_time: [T, N, 3] gaussian_xyz: [N, 3] torch.Tensor plane_mean: [3] plane_normal: [3] thres: float """ plane_normal = plane_normal / torch.norm(plane_normal) # [n_particles] plane_dist = torch.abs( torch.sum( (gaussian_xyz - plane_mean.unsqueeze(0)) * plane_normal.unsqueeze(0), dim=-1 ) ) # [n_particles] freeze_mask = plane_dist < thres freeze_mask = freeze_mask.type(torch.int) num_freeze_pts = freeze_mask.sum() print("num freeze pts from plannar boundary", num_freeze_pts.item()) free_velocity = xyzs_over_time.new_zeros(3) # print("free velocity", free_velocity.shape, freeze_mask.shape) mpm_solver.enforce_particle_velocity_by_mask( mpm_state, freeze_mask, free_velocity, start_time=0, end_time=100000 ) return freeze_mask def find_far_points(xyzs, selected_points, thres=0.05): """ Args: xyzs: [N, 3] selected_points: [M, 3] Outs: freeze_mask: [N], 1 for points that are far away, 0 for points that are close dtype=torch.int """ chunk_size = 10000 freeze_mask_list = [] for i in range(0, xyzs.shape[0], chunk_size): end_index = min(i + chunk_size, xyzs.shape[0]) xyzs_chunk = xyzs[i:end_index] # [M, N] cdist = torch.cdist(xyzs_chunk, selected_points) min_dist, _ = torch.min(cdist, dim=-1) freeze_mask = 
min_dist > thres freeze_mask = freeze_mask.type(torch.int) freeze_mask_list.append(freeze_mask) freeze_mask = torch.cat(freeze_mask_list, dim=0) # 1 for points that are far away, 0 for points that are close return freeze_mask def setup_boundary_condition_with_points( xyzs, selected_points, mpm_solver, mpm_state, thres=0.05 ): """ Args: xyzs: [N, 3] selected_points: [M, 3] """ freeze_mask = find_far_points(xyzs, selected_points, thres=thres) num_freeze_pts = freeze_mask.sum() print("num freeze pts from static points", num_freeze_pts.item()) free_velocity = torch.zeros_like(xyzs[0]) # [3] in device mpm_solver.enforce_particle_velocity_by_mask( mpm_state, freeze_mask, free_velocity, start_time=0, end_time=1000000 ) return freeze_mask def setup_bottom_boundary_condition(xyzs, mpm_solver, mpm_state, percentile=0.05): """ Args: xyzs: [N, 3] selected_points: [M, 3] """ max_z, min_z = torch.max(xyzs[:, 2]), torch.min(xyzs[:, 2]) thres = min_z + (max_z - min_z) * percentile freeze_mask = xyzs[:, 2] < thres freeze_mask = freeze_mask.type(torch.int) num_freeze_pts = freeze_mask.sum() print("num freeze pts from bottom points", num_freeze_pts.item()) free_velocity = torch.zeros_like(xyzs[0]) # [3] in device mpm_solver.enforce_particle_velocity_by_mask( mpm_state, freeze_mask, free_velocity, start_time=0, end_time=1000000 ) return freeze_mask def render_single_view_video( cam, render_params, motion_model, time_stamps, rand_bg=False, render_flow=False, query_mask=None, ): """ Args: cam: motion_model: Callable function, f(x, t) => translation, rotation time_stamps: [T] query_mask: Tensor of [N], 0 for freeze points, 1 for moving points Outs: ret_video: [T, 3, H, W] value in [0, 1] """ if rand_bg: bg_color = torch.rand(3, device="cuda") else: bg_color = render_params.bg_color ret_img_list = [] for time_stamp in time_stamps: if not render_flow: new_gaussians = render_params.gaussians.apply_se3_fields( motion_model, time_stamp ) if query_mask is not None: new_gaussians._xyz = 
            # NOTE(review): this expression is the continuation of a function that
            # starts before this chunk — it blends the displaced gaussians with the
            # originals via a per-point mask, then renders one frame per timestep.
            new_gaussians._xyz * query_mask.unsqueeze(-1)
            + render_params.gaussians._xyz * (1 - query_mask.unsqueeze(-1))
            new_gaussians._rotation = (
                new_gaussians._rotation * query_mask.unsqueeze(-1)
                + render_params.gaussians._rotation * (1 - query_mask.unsqueeze(-1))
            )
            # [3, H, W]
            img = render_gaussian(
                cam,
                new_gaussians,
                render_params.render_pipe,
                bg_color,
            )[
                "render"
            ]  # value in [0, 1]
        else:
            # Query the motion model with (xyz, t) to get per-point rotation and
            # displacement, then render with the flow/depth renderer.
            inp_time = (
                torch.ones_like(render_params.gaussians._xyz[:, 0:1]) * time_stamp
            )
            inp = torch.cat([render_params.gaussians._xyz, inp_time], dim=-1)
            # [bs, 3, 3]. [bs, 3]
            R, point_disp = motion_model(inp)

            img = render_flow_depth_w_gaussian(
                cam,
                render_params.gaussians,
                render_params.render_pipe,
                point_disp,
                bg_color,
            )["render"]

        ret_img_list.append(img[None, ...])

    ret_video = torch.cat(ret_img_list, dim=0)  # [T, 3, H, W]
    return ret_video


def render_gaussian_seq(cam, render_params, gaussian_pos_list, gaussian_cov_list):
    """Render one frame per entry of gaussian_pos_list from a fixed camera.

    Temporarily overwrites gaussians._xyz for each frame and restores the
    original positions afterwards.  Returns a [T, 3, H, W] tensor in [0, 1].
    """
    ret_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    for i in range(len(gaussian_pos_list)):
        xyz = gaussian_pos_list[i]
        gaussians._xyz = xyz
        # TODO, how to deal with cov
        img = render_gaussian(
            cam,
            gaussians,
            render_params.render_pipe,
            background,
        )["render"]
        ret_img_list.append(img[None, ...])
    gaussians._xyz = old_xyz  # set back
    # [T, C, H, W], in [0, 1]
    rendered_video = torch.cat(ret_img_list, dim=0)
    return rendered_video


def render_gaussian_seq_w_mask(
    cam, render_params, gaussian_pos_list, gaussian_cov_list, update_mask
):
    """Like render_gaussian_seq, but only the gaussians selected by update_mask
    move; optionally also overrides their precomputed covariance per frame.
    """
    ret_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    old_cov = gaussians.get_covariance().clone()
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    for i in range(len(gaussian_pos_list)):
        xyz = gaussian_pos_list[i]
        # in-place update of the moving subset; restored after the loop
        gaussians._xyz[update_mask, ...] = xyz
        if gaussian_cov_list is not None:
            cov = gaussian_cov_list[i]
            old_cov[update_mask, ...] = cov
            cov3D_precomp = old_cov
        else:
            cov3D_precomp = None
        img = render_gaussian(
            cam,
            gaussians,
            render_params.render_pipe,
            background,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        ret_img_list.append(img[None, ...])
    gaussians._xyz = old_xyz  # set back
    # [T, C, H, W], in [0, 1]
    rendered_video = torch.cat(ret_img_list, dim=0)
    return rendered_video


def render_gaussian_seq_w_mask_with_disp(
    cam, render_params, orign_points, top_k_index, disp_list, update_mask
):
    """
    Render a video by interpolating per-frame displacements (and rigid
    rotations) from driving points onto the masked gaussians.

    Args:
        cam: Camera or list of Camera
        orign_points: [m, 3]
        disp_list: List[m, 3]
        top_k_index: [n, top_k]
    """
    ret_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    old_rotation = gaussians._rotation.clone()
    query_pts = old_xyz[update_mask, ...]
    query_rotation = old_rotation[update_mask, ...]
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    for i in range(len(disp_list)):
        if isinstance(cam, list):
            render_cam = cam[i]
        else:
            render_cam = cam
        disp = disp_list[i]
        new_xyz, new_rotation = interpolate_points_w_R(
            query_pts, query_rotation, orign_points, disp, top_k_index
        )
        gaussians._xyz[update_mask, ...] = new_xyz
        gaussians._rotation[update_mask, ...] = new_rotation
        cov3D_precomp = None
        img = render_gaussian(
            render_cam,
            gaussians,
            render_params.render_pipe,
            background,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        ret_img_list.append(img[None, ...])
    gaussians._xyz = old_xyz  # set back
    gaussians._rotation = old_rotation
    # [T, C, H, W], in [0, 1]
    rendered_video = torch.cat(ret_img_list, dim=0)
    return rendered_video


def render_gaussian_seq_w_mask_with_disp_for_figure(
    cam, render_params, orign_points, top_k_index, disp_list, update_mask
):
    """
    Same as render_gaussian_seq_w_mask_with_disp, but additionally renders the
    moving subset alone on a black background (for paper figures).

    Args:
        cam: Camera or list of Camera
        orign_points: [m, 3]
        disp_list: List[m, 3]
        top_k_index: [n, top_k]
    """
    ret_img_list = []
    moving_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    old_rotation = gaussians._rotation.clone()
    query_pts = old_xyz[update_mask, ...]
    query_rotation = old_rotation[update_mask, ...]
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    background_black = torch.tensor([0, 0, 0], dtype=torch.float32, device="cuda")
    for i in range(len(disp_list)):
        if isinstance(cam, list):
            render_cam = cam[i]
        else:
            render_cam = cam
        disp = disp_list[i]
        new_xyz, new_rotation = interpolate_points_w_R(
            query_pts, query_rotation, orign_points, disp, top_k_index
        )
        gaussians._xyz[update_mask, ...] = new_xyz
        gaussians._rotation[update_mask, ...] = new_rotation
        cov3D_precomp = None
        img = render_gaussian(
            render_cam,
            gaussians,
            render_params.render_pipe,
            background,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        # second pass: only the moving gaussians, on black
        masked_gaussians = gaussians.apply_mask(update_mask)
        moving_img = render_gaussian(
            render_cam,
            masked_gaussians,
            render_params.render_pipe,
            background_black,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        ret_img_list.append(img[None, ...])
        moving_img_list.append(moving_img[None, ...])
    gaussians._xyz = old_xyz  # set back
    gaussians._rotation = old_rotation
    # [T, C, H, W], in [0, 1]
    rendered_video = torch.cat(ret_img_list, dim=0)
    moving_part_video = torch.cat(moving_img_list, dim=0)
    return rendered_video, moving_part_video


def render_gaussian_seq_w_mask_cam_seq(
    cam_list, render_params, gaussian_pos_list, gaussian_cov_list, update_mask
):
    """Like render_gaussian_seq_w_mask, but with one camera per frame."""
    ret_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    old_cov = gaussians.get_covariance().clone()
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    for i in range(len(gaussian_pos_list)):
        xyz = gaussian_pos_list[i]
        gaussians._xyz[update_mask, ...] = xyz
        if gaussian_cov_list is not None:
            cov = gaussian_cov_list[i]
            old_cov[update_mask, ...] = cov
            cov3D_precomp = old_cov
        else:
            cov3D_precomp = None
        img = render_gaussian(
            cam_list[i],
            gaussians,
            render_params.render_pipe,
            background,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        ret_img_list.append(img[None, ...])
    gaussians._xyz = old_xyz  # set back
    # [T, C, H, W], in [0, 1]
    rendered_video = torch.cat(ret_img_list, dim=0)
    return rendered_video


def apply_grid_bc_w_freeze_pts(grid_size, grid_lim, freeze_pts, mpm_solver):
    """Freeze the MPM grid cells that contain any of freeze_pts.

    Marks every grid cell occupied by at least one freeze point, then tells the
    solver to zero the velocity on those cells.  Returns the boolean grid mask.
    """
    device = freeze_pts.device
    grid_pts_cnt = torch.zeros(
        (grid_size, grid_size, grid_size), dtype=torch.int32, device=device
    )
    dx = grid_lim / grid_size
    inv_dx = 1.0 / dx
    freeze_pts = (freeze_pts * inv_dx).long()
    # NOTE(review): per-point Python loop over a (possibly GPU) tensor is slow;
    # an index_put_ with accumulate=True would do this in one call.
    for x, y, z in freeze_pts:
        grid_pts_cnt[x, y, z] += 1

    freeze_grid_mask = grid_pts_cnt >= 1
    freeze_grid_mask_int = freeze_grid_mask.type(torch.int32)
    number_freeze_grid = freeze_grid_mask_int.sum().item()
    print("number of freeze grid", number_freeze_grid)
    mpm_solver.enforce_grid_velocity_by_mask(freeze_grid_mask_int)

    # add debug section:
    return freeze_grid_mask


def add_constant_force(
    mpm_sovler,
    mpm_state,
    xyzs,
    center_point,
    radius,
    force,
    dt,
    start_time,
    end_time,
    device,
):
    """
    Apply a constant impulse to all particles within `radius` of `center_point`
    during [start_time, end_time].

    Args:
        xyzs: [N, 3]
        center_point: [3]
        radius: float
        force: [3]
    """
    # compute distance from xyzs to center_point
    # [N]
    dist = torch.norm(xyzs - center_point.unsqueeze(0), dim=-1)
    apply_force_mask = dist < radius
    apply_force_mask = apply_force_mask.type(torch.int)

    print(apply_force_mask.shape, apply_force_mask.sum().item(), "apply force mask")

    mpm_sovler.add_impulse_on_particles_with_mask(
        mpm_state,
        force,
        dt,
        apply_force_mask,
        start_time=start_time,
        end_time=end_time,
        device=device,
    )


@torch.no_grad()
def render_force_2d(cam, render_params, center_point, force):
    """Render the scene once and overlay the applied force as a 2D arrow.

    Returns a uint8 [H, W, 3] image with a circle at the force origin and an
    arrow along the screen-space force direction.
    """
    force_in_2d_scale = 80  # unit as pixel
    two_points = torch.stack([center_point, center_point + force], dim=0)

    gaussians = render_params.gaussians
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    # [3, H, W]
    img = render_gaussian(
        cam,
        gaussians,
        render_params.render_pipe,
        background,
    )["render"]

    img = img.detach().contiguous()
    img = img.cpu().numpy().transpose(1, 2, 0)
    img = img * 255
    img = img.astype(np.uint8).copy()

    # two_points. [2, 3]
    # arrow_2d: [2, 2]
    arrow_2d = render_arrow_in_screen(cam, two_points)
    arrow_2d = arrow_2d.cpu().numpy()

    start, vec_2d = arrow_2d[0], arrow_2d[1] - arrow_2d[0]
    vec_2d = vec_2d / np.linalg.norm(vec_2d)
    # NOTE(review): no-op assignment; the commented offset suggests a past
    # principal-point shift experiment.
    start = start  # + np.array([540.0, 288.0])  # [W, H] / 2

    # debug here.
    #   1. unit in pixel?
    #   2. use cv2 to add arrow?
    # draw a circle at the arrow start in img
    # img = img.transpose(2, 0, 1)
    img = cv2.circle(img, (int(start[0]), int(start[1])), 40, (255, 255, 255), 8)

    # draw arrow in img
    end = start + vec_2d * force_in_2d_scale
    end = end.astype(np.int32)
    start = start.astype(np.int32)
    img = cv2.arrowedLine(img, (start[0], start[1]), (end[0], end[1]), (0, 255, 255), 8)
    return img


def render_gaussian_seq_w_mask_cam_seq_with_force(
    cam_list,
    render_params,
    gaussian_pos_list,
    gaussian_cov_list,
    update_mask,
    pts_index,
    force,
    force_steps,
):
    """Per-frame-camera masked rendering, overlaying the force arrow for the
    first `force_steps` frames.  Returns a uint8 numpy video [T, C, H, W].
    """
    force_in_2d_scale = 80  # unit as pixel
    ret_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    old_cov = gaussians.get_covariance().clone()
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    for i in range(len(gaussian_pos_list)):
        xyz = gaussian_pos_list[i]
        gaussians._xyz[update_mask, ...] = xyz
        if gaussian_cov_list is not None:
            cov = gaussian_cov_list[i]
            old_cov[update_mask, ...] = cov
            cov3D_precomp = old_cov
        else:
            cov3D_precomp = None
        img = render_gaussian(
            cam_list[i],
            gaussians,
            render_params.render_pipe,
            background,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        # to [H, W, 3]
        img = img.detach().contiguous().cpu().numpy().transpose(1, 2, 0)
        img = np.clip((img * 255), 0, 255).astype(np.uint8).copy()

        if i < force_steps:
            center_point = gaussians._xyz[pts_index]
            two_points = torch.stack([center_point, center_point + force], dim=0)
            arrow_2d = render_arrow_in_screen(cam_list[i], two_points)
            arrow_2d = arrow_2d.cpu().numpy()

            start, vec_2d = arrow_2d[0], arrow_2d[1] - arrow_2d[0]
            vec_2d = vec_2d / np.linalg.norm(vec_2d)
            start = start  # + np.array([540.0, 288.0])

            img = cv2.circle(
                img, (int(start[0]), int(start[1])), 40, (255, 255, 255), 8
            )
            # draw arrow in img
            end = start + vec_2d * force_in_2d_scale
            end = end.astype(np.int32)
            start = start.astype(np.int32)
            img = cv2.arrowedLine(
                img, (start[0], start[1]), (end[0], end[1]), (0, 255, 255), 8
            )
        img = img.transpose(2, 0, 1)

        ret_img_list.append(img[None, ...])
    gaussians._xyz = old_xyz  # set back
    # [T, C, H, W], in [0, 1]
    rendered_video = np.concatenate(ret_img_list, axis=0)
    return rendered_video


def render_gaussian_seq_w_mask_cam_seq_with_force_with_disp(
    cam_list,
    render_params,
    orign_points,
    top_k_index,
    disp_list,
    update_mask,
    pts_index,
    force,
    force_steps,
):
    """Displacement-driven variant of the force-overlay renderer: positions and
    rotations are interpolated from driving-point displacements per frame.
    """
    force_in_2d_scale = 80  # unit as pixel
    ret_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    old_rotation = gaussians._rotation.clone()
    query_pts = old_xyz[update_mask, ...]
    query_rotation = old_rotation[update_mask, ...]
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    for i in range(len(disp_list)):
        disp = disp_list[i]
        new_xyz, new_rotation = interpolate_points_w_R(
            query_pts, query_rotation, orign_points, disp, top_k_index
        )
        gaussians._xyz[update_mask, ...] = new_xyz
        gaussians._rotation[update_mask, ...] = new_rotation
        cov3D_precomp = None
        img = render_gaussian(
            cam_list[i],
            gaussians,
            render_params.render_pipe,
            background,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        # to [H, W, 3]
        img = img.detach().contiguous().cpu().numpy().transpose(1, 2, 0)
        img = np.clip((img * 255), 0, 255).astype(np.uint8).copy()

        if i < force_steps:
            center_point = gaussians._xyz[pts_index]
            two_points = torch.stack([center_point, center_point + force], dim=0)
            arrow_2d = render_arrow_in_screen(cam_list[i], two_points)
            arrow_2d = arrow_2d.cpu().numpy()

            start, vec_2d = arrow_2d[0], arrow_2d[1] - arrow_2d[0]
            vec_2d = vec_2d / np.linalg.norm(vec_2d)
            start = start  # + np.array([540.0, 288.0])

            img = cv2.circle(
                img, (int(start[0]), int(start[1])), 40, (255, 255, 255), 8
            )
            # draw arrow in img
            end = start + vec_2d * force_in_2d_scale
            end = end.astype(np.int32)
            start = start.astype(np.int32)
            img = cv2.arrowedLine(
                img, (start[0], start[1]), (end[0], end[1]), (0, 255, 255), 8
            )
        img = img.transpose(2, 0, 1)

        ret_img_list.append(img[None, ...])
    gaussians._xyz = old_xyz  # set back
    gaussians._rotation = old_rotation
    # [T, C, H, W], in [0, 1]
    rendered_video = np.concatenate(ret_img_list, axis=0)
    return rendered_video


def downsample_with_kmeans(points_array: np.ndarray, num_points: int):
    """
    CPU (scikit-learn) k-means downsampling; returns the cluster centers.

    Args:
        points_array: [N, 3]
        num_points: int
    Outs:
        downsampled_points: [num_points, 3]
    """
    # NOTE(review): "staring" in the message below is a typo for "starting"
    # (left as-is: runtime string).
    print(
        "=> staring downsample with kmeans from ",
        points_array.shape[0],
        " points to ",
        num_points,
        " points",
    )
    s_time = time()
    kmeans = KMeans(n_clusters=num_points, random_state=0).fit(points_array)
    cluster_centers = kmeans.cluster_centers_
    e_time = time()
    print("=> downsample with kmeans takes ", e_time - s_time, " seconds")
    return cluster_centers


@torch.no_grad()
def downsample_with_kmeans_gpu(points_array: torch.Tensor, num_points: int):
    """GPU k-means downsampling via the `kmeans_gpu` package.

    Returns the [num_points, 3] centroids of points_array.
    """
    from kmeans_gpu import KMeans

    kmeans = KMeans(
        n_clusters=num_points,
        max_iter=100,
        tolerance=1e-4,
        distance="euclidean",
        sub_sampling=None,
        max_neighbors=15,
    )
    # dummy per-point features required by the kmeans_gpu interface
    features = torch.ones(1, 1, points_array.shape[0], device=points_array.device)
    points_array = points_array.unsqueeze(0)
    # Forward
    print(
        "=> staring downsample with kmeans from ",
        points_array.shape[1],
        " points to ",
        num_points,
        " points",
    )
    s_time = time()
    centroids, features = kmeans(points_array, features)
    ret_points = centroids.squeeze(0)
    e_time = time()
    print("=> downsample with kmeans takes ", e_time - s_time, " seconds")
    # [np_subsample, 3]
    return ret_points


@torch.no_grad()
def downsample_with_kmeans_gpu_with_chunk(points_array: torch.Tensor, num_points: int):
    # split the points_array into chunks, and then do kmeans on each chunk
    # to save memory.  Points are first sorted by coordinate sum so that each
    # chunk is spatially coherent.
    from kmeans_gpu import KMeans

    points_array_sum = points_array.sum(dim=1)
    arg_idx = torch.argsort(points_array_sum, descending=True)
    points_array = points_array[arg_idx, :]

    features = torch.ones(1, 1, points_array.shape[0], device=points_array.device)
    points_array = points_array.unsqueeze(0)
    # Forward
    print(
        "=> staring downsample with kmeans from ",
        points_array.shape[1],
        " points to ",
        num_points,
        " points",
        points_array.shape,
    )
    s_time = time()

    num_raw_points = points_array.shape[1]
    chunk_size = 150000
    num_chunks = num_raw_points // chunk_size + 1
    ret_list = []
    for i in range(num_chunks):
        start = i * chunk_size
        end = min((i + 1) * chunk_size, num_raw_points)
        points_chunk = points_array[:, start:end, :]
        features_chunk = features[:, :, start:end]
        num_target_points = min(chunk_size, num_points // num_chunks)
        kmeans = KMeans(
            n_clusters=num_target_points,
            max_iter=100,
            tolerance=1e-4,
            distance="euclidean",
            sub_sampling=None,
            max_neighbors=15,
        )
        centroids, _ = kmeans(points_chunk, features_chunk)
        ret_list.append(centroids.squeeze(0))
    ret_points = torch.cat(ret_list, dim=0)
    e_time = time()
    print("=> downsample with kmeans takes ", e_time - s_time, " seconds")
    # [np_subsample, 3]
    return ret_points


def interpolate_points(query_points, drive_displacement, top_k_index):
    """
    Translate each query point by the mean displacement of its top-k driving
    points.

    Args:
        query_points: [n, 3]
        drive_displacement: [m, 3]
        top_k_index: [n, top_k] < m
    """
    top_k_disp = drive_displacement[top_k_index]
    t = top_k_disp.mean(dim=1)
    ret_points = query_points + t
    return ret_points


def interpolate_points_w_R(
    query_points, query_rotation, drive_origin_pts, drive_displacement, top_k_index
):
    """
    Args:
        query_points: [n, 3]
        drive_origin_pts: [m, 3]
        drive_displacement: [m, 3]
        top_k_index: [n, top_k] < m

    Or directly call:
        apply_discrete_offset_filds_with_R(self, origin_points, offsets, topk=6):
            Args:
                origin_points: (N_r, 3)
                offsets: (N_r, 3)  in rendering
    """
    # [n, topk, 3]
    top_k_disp = drive_displacement[top_k_index]
    source_points = drive_origin_pts[top_k_index]
    # best-fit rigid transform of each point's top-k neighborhood
    R, t = get_rigid_transform(source_points, source_points + top_k_disp)
    avg_offsets = top_k_disp.mean(dim=1)
    ret_points = query_points + avg_offsets
    new_rotation = quaternion_multiply(matrix_to_quaternion(R), query_rotation)
    return ret_points, new_rotation


def create_camera_path(
    cam,
    radius: float,
    focus_pt: np.ndarray = np.array([0, 0, 0]),
    up: np.ndarray = np.array([0, 0, 1]),
    n_frames: int = 60,
    n_rots: int = 1,
    y_scale: float = 1.0,
):
    """Build a spiral of n_frames cameras around `cam`, looking at focus_pt.

    NOTE(review): mutable np.array defaults are shared across calls; safe only
    because they are never mutated here.
    """
    R, T = cam.R, cam.T
    # R, T = R.cpu().numpy(), T.cpu().numpy()
    Rt = np.zeros((4, 4))
    Rt[:3, :3] = R.transpose()
    Rt[:3, 3] = T
    Rt[3, 3] = 1.0
    C2W = np.linalg.inv(Rt)
    # flip y/z axes: convert to the OpenGL camera convention
    C2W[:3, 1:3] *= -1

    import copy
    from physdreamer.utils.camera_utils import generate_spiral_path
    from physdreamer.data.cameras import Camera

    lookat_pt = focus_pt
    render_poses = generate_spiral_path(
        C2W, radius, lookat_pt, up, n_frames, n_rots, y_scale
    )
    FoVy, FoVx = cam.FoVy, cam.FoVx
    height, width = cam.image_height, cam.image_width
    ret_cam_list = []
    for i in range(n_frames):
        c2w_opengl = render_poses[i]
        c2w = copy.deepcopy(c2w_opengl)
        c2w[:3, 1:3] *= -1

        # get the world-to-camera transform and set R, T
        w2c = np.linalg.inv(c2w)
        R = np.transpose(
            w2c[:3, :3]
        )  # R is stored transposed due to 'glm' in CUDA code
        T = w2c[:3, 3]
        cam = Camera(
            R=R,
            T=T,
            FoVy=FoVy,
            FoVx=FoVx,
            img_path=None,
            img_hw=(height, width),
            timestamp=None,
            data_device="cuda",
        )
        ret_cam_list.append(cam)
    return ret_cam_list


def get_camera_trajectory(cam, num_pos, camera_cfg: dict, dataset):
    """Return num_pos cameras, either a spiral path around `cam` or an
    interpolation between two dataset frames, per camera_cfg["type"].
    """
    if camera_cfg["type"] == "spiral":
        interpolated_cameras = create_camera_path(
            cam,
            radius=camera_cfg["radius"],
            focus_pt=camera_cfg["focus_point"],
            up=camera_cfg["up"],
            n_frames=num_pos,
        )
    elif camera_cfg["type"] == "interpolation":
        if "start_frame" in camera_cfg and "end_frame" in camera_cfg:
            interpolated_cameras = dataset.interpolate_camera(
                camera_cfg["start_frame"], camera_cfg["end_frame"], num_pos
            )
        else:
            # degenerate case: interpolate a frame with itself (static camera)
            interpolated_cameras = dataset.interpolate_camera(
                camera_cfg["start_frame"], camera_cfg["start_frame"], num_pos
            )
    print(
        "number of simulated frames: ",
        num_pos,
        "num camera viewpoints: ",
        len(interpolated_cameras),
    )
    return interpolated_cameras


================================================
FILE: projects/inference/run.sh
================================================
# python3 demo.py --scene_name carnation --apply_force --force_id 1 --point_id 0 --force_mag 2.0 --cam_id 0
# python3 demo.py --scene_name hat --apply_force --force_id 0 --point_id 0 --force_mag 3.0 --cam_id 0
python3 demo.py --scene_name telephone --apply_force --force_id 0 --point_id 0 --force_mag 0.1 --cam_id 0
python3 demo.py --scene_name alocasia --apply_force --force_id 0 --point_id 0 --force_mag 3.0 --cam_id 0


================================================
FILE: projects/uncleaned_train/.gitignore
================================================
img_data/
tmp/
./data/
dataset/
models/
model
output/
outputs/
*.sh
exp_motion/*.sh
__pycache__
*__pycache__/
*/__pycache__/
*/wandb/*
wandb
*/*.pyc
*.sh.log*
*.gif
*.mp4
*.pt
*.ipynb


================================================
FILE: projects/uncleaned_train/README.md
================================================
This folder contains the original uncleaned training code.
This folder can be viewed as an independent folder, it did not use code in physdreamer/ and projects/inference

`exp_motion/train` contains code for velocity and material training.

Velocity train and material train is slightly different:

1. How many frames is used for training.
2. How many frames the backprop needs to be passed.
3. Velocity train typically use smaller spatial resolution(grid_size) and temporal resolution(num of substeps).

Two major difference for this code with the inference code is that:

1. All the helper functions here are all installed in a folder called "motionrep". The inference code uses "physdreamer". the physdreamer/ and motionrep/ folder should share most of the code
2. The config.yaml file has different contents and format


================================================
FILE: projects/uncleaned_train/exp_motion/train/config.yml
================================================
dataset_dir:

# optimization
warmup_step: 10
max_grad_norm: 10.0

rand_bg: False

velo_dir: [
    "../../data/physics_dreamer/alocasia_nerfstudio/mul_videos/velopretrain_models/frame_00037_mb-8_fps-30_8",
]


================================================
FILE: projects/uncleaned_train/exp_motion/train/config_demo.py
================================================
import numpy as np
from model_config import (
    model_list,
    camera_cfg_list,
    points_list,
    force_directions,
    simulate_cfg,
    dataset_dir,
    result_dir,
    exp_name,
)


class DemoParams(object):
    """Holds named demo presets and builds a demo config dict on request."""

    def __init__(self):
        # preset demo configurations, keyed by demo name
        self.demo_dict = {
            "baseline": {
                "model_path": model_list[0],
                "substep": 768,
                "grid_size": 64,
                "name": "baseline",
                "camera_cfg": camera_cfg_list[0],
                "cam_id": 0,
            },
            "demo_dummy": {
                "model_path": model_list[0],
                "center_point": points_list[0],
                "force": np.array([0.15, 0, 0]),
                "camera_cfg": camera_cfg_list[0],
                "force_duration": 0.75,
                "force_radius": 0.1,
                "substep": 256,
                "grid_size": 96,
                "total_time": 5,
                "name": "alocasia_sv_gres96_substep256_force_top_of_flower",
            },
        }

    def get_cfg(
        self,
        demo_name=None,
        model_id: int = 0,
        eval_ys: float = 1.0,
        force_id: int = 0,
        force_mag: float = 1.0,
        velo_scaling: float = 3.0,
        point_id: int = 0,
        cam_id: int = 0,
        apply_force: bool = False,
    ):
        """Return a config dict: a stored preset when demo_name matches,
        otherwise one assembled from model_config lists and the arguments.
        A descriptive "name" field is derived for output naming.
        """
        if demo_name == "None":
            demo_name = None
        if (demo_name is not None) and (demo_name in self.demo_dict):
            cfg = self.demo_dict[demo_name]
        else:
            cfg = {}
            cfg["model_path"] = model_list[model_id]
            cfg["center_point"] = points_list[point_id]
            cfg["force"] = force_directions[force_id] * force_mag
            cfg["camera_cfg"] = camera_cfg_list[cam_id]
            cfg["cam_id"] = cam_id
            cfg["force_duration"] = 0.75
            cfg["force_radius"] = 0.1
            cfg["substep"] = simulate_cfg["substep"]
            cfg["grid_size"] = simulate_cfg["grid_size"]
            cfg["total_time"] = 5

        # NOTE(review): reconstructed indentation — these are assumed to apply
        # to both branches above; confirm against the original file.
        cfg["eval_ys"] = eval_ys
        cfg["velo_scaling"] = velo_scaling

        if demo_name is None:
            name = ""
        else:
            name = demo_name + "_"
        name = (
            name + f"{exp_name}_sv_gres{cfg['grid_size']}_substep{cfg['substep']}"
        )
        if eval_ys > 10:
            name += f"_eval_ys_{eval_ys}"
        else:
            name += f"_model_{model_id}"
        if apply_force:
            name += f"_force_{force_id}_mag_{force_mag}_point_{point_id}"
        else:
            name += f"_no_force_velo_{velo_scaling}"
        cfg["name"] = name

        cfg["dataset_dir"] = dataset_dir
        cfg["result_dir"] = result_dir
        return cfg


================================================
FILE: projects/uncleaned_train/exp_motion/train/convert_gaussian_to_mesh.py
================================================
import os
from random import gauss
from fire import Fire
from motionrep.gaussian_3d.scene import GaussianModel
import numpy as np
import torch


def convert_gaussian_to_mesh(gaussian_path, thresh=0.1, save_path=None):
    """Load a gaussian .ply and export a decimated mesh next to it (or to
    save_path) using the gaussians' density field at the given threshold.
    """
    if save_path is None:
        dir_path = os.path.dirname(gaussian_path)
        save_path = os.path.join(dir_path, "gaussian_to_mesh_thres_{}.obj".format(thresh))
    gaussian_path = os.path.join(gaussian_path)

    gaussians = GaussianModel(3)
    gaussians.load_ply(gaussian_path)
    gaussians.detach_grad()
    print(
        "load gaussians from: {}".format(gaussian_path),
        "... num gaussians: ",
        gaussians._xyz.shape[0],
    )

    mesh = gaussians.extract_mesh(
        save_path, density_thresh=thresh, resolution=128, decimate_target=1e5
    )
    mesh.write(save_path)


def internal_filling(gaussian_path, thresh=2.0, save_path=None, resolution=256, num_pts=4):
    """Sample extra points inside the object: voxelize the gaussians' density
    field, find occupied voxels with at most one existing gaussian, and drop
    num_pts random points into each such voxel.  Saves the added points as a
    .ply next to save_path.
    """
    if save_path is None:
        dir_path = os.path.dirname(gaussian_path)
        save_path = os.path.join(dir_path, "gaussian_internal_fill.ply")

    gaussians = GaussianModel(3)
    gaussians.load_ply(gaussian_path)
    gaussians.detach_grad()
    print(
        "load gaussians from: {}".format(gaussian_path),
        "... num gaussians: ",
        gaussians._xyz.shape[0],
    )

    # [res, res, res]
    # torch.linspace(-1, 1, resolution) for the coords
    #   x[0] => -1, x[resolution-1] = 1
    #   x[i] = -1 + i * 2 / (resolution - 1)
    #   index_x = (x[i] + 1) / 2 * (resolution - 1)
    occ = (
        gaussians.extract_fields(resolution=resolution, num_blocks=16, relax_ratio=1.5)
        .detach()
        .cpu()
        .numpy()
    )
    xyzs = gaussians._xyz.detach().cpu().numpy()
    center = gaussians.center.detach().cpu().numpy()
    scale = gaussians.scale  # float
    xyzs = (xyzs - center) * scale  # [-1, 1]?

    percentile = [95, 97, 99][1]
    # from IPython import embed
    # embed()
    thres_ = np.percentile(occ, percentile)
    print("density threshold: {:.5f} -- in percentile: {:.1f} ".format(thres_, percentile))
    occ_large_thres = occ > thresh

    # get the xyz of the occupied voxels
    # xyz = np.argwhere(occ)
    # normalize to [-1, 1]
    # xyz = xyz / (resolution - 1) * 2 - 1

    voxel_counts = np.zeros((resolution, resolution, resolution))
    points_xyzindex = ((xyzs + 1) / 2 * (resolution - 1)).astype(np.uint32)
    for x, y, z in points_xyzindex:
        voxel_counts[x, y, z] += 1

    add_points = np.logical_and(occ_large_thres, voxel_counts <= 1)
    add_xyz = np.argwhere(add_points).astype(np.float32)
    add_xyz = add_xyz / (resolution - 1) * 2 - 1

    # [x,y,z]_min of the unit cell. randomly add points in the unit cell
    cell_width = 2.0 / (resolution - 1)
    # copy add_xyz "num_pts" times
    add_xyz = np.repeat(add_xyz, num_pts, axis=0)
    random_offset_within_cell = np.random.uniform(-cell_width / 2, cell_width / 2, size=add_xyz.shape)
    add_xyz += random_offset_within_cell
    all_xyz = np.concatenate([xyzs, add_xyz], axis=0)
    print("added points: ", add_xyz.shape[0])

    # save to ply
    import point_cloud_utils as pcu

    # pcu.save_mesh_vf(save_path, all_xyz, np.zeros((0, 3), dtype=np.int32))
    add_path = os.path.join(os.path.dirname(save_path), "extra_filled_points_thresh_{}.ply".format(thresh))
    pcu.save_mesh_v(add_path, add_xyz)


if __name__ == "__main__":
    Fire(convert_gaussian_to_mesh)
    # Fire(internal_filling)


================================================
FILE: projects/uncleaned_train/exp_motion/train/fast_train_velocity.py
================================================
import argparse
import os
import numpy as np
import torch
from tqdm import tqdm
from torch import Tensor
from jaxtyping import Float, Int, Shaped
from typing import List
import point_cloud_utils as pcu

from accelerate.utils import ProjectConfiguration
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from accelerate import Accelerator, DistributedDataParallelKwargs
import numpy as np
import logging
import argparse
import shutil
import wandb
import torch
import os
from motionrep.utils.config import create_config
from motionrep.utils.optimizer import get_linear_schedule_with_warmup
from time import time
from omegaconf import OmegaConf
import numpy as np

# from motionrep.utils.torch_utils import get_sync_time
from einops import rearrange, repeat

from motionrep.gaussian_3d.gaussian_renderer.feat_render import render_feat_gaussian
from motionrep.gaussian_3d.scene import GaussianModel
from motionrep.data.datasets.multiview_dataset import MultiviewImageDataset
from motionrep.data.datasets.multiview_video_dataset import (
    MultiviewVideoDataset,
    camera_dataset_collate_fn,
)
from motionrep.data.datasets.multiview_dataset import ( camera_dataset_collate_fn as camera_dataset_collate_fn_img, ) from typing import NamedTuple import torch.nn.functional as F from motionrep.utils.img_utils import compute_psnr, compute_ssim from thirdparty_code.warp_mpm.mpm_data_structure import ( MPMStateStruct, MPMModelStruct, get_float_array_product, ) from thirdparty_code.warp_mpm.mpm_solver_diff import MPMWARPDiff from thirdparty_code.warp_mpm.warp_utils import from_torch_safe from thirdparty_code.warp_mpm.gaussian_sim_utils import get_volume import warp as wp import random from local_utils import ( cycle, create_spatial_fields, find_far_points, LinearStepAnneal, apply_grid_bc_w_freeze_pts, render_gaussian_seq_w_mask_with_disp, downsample_with_kmeans_gpu, ) from interface import MPMDifferentiableSimulation logger = get_logger(__name__, log_level="INFO") def create_dataset(args): assert args.dataset_res in ["middle", "small", "large"] if args.dataset_res == "middle": res = [320, 576] elif args.dataset_res == "small": res = [192, 320] elif args.dataset_res == "large": res = [576, 1024] else: raise NotImplementedError video_dir_name = "videos_2" dataset = MultiviewVideoDataset( args.dataset_dir, use_white_background=False, resolution=res, scale_x_angle=1.0, video_dir_name=video_dir_name, ) test_dataset = MultiviewImageDataset( args.dataset_dir, use_white_background=False, resolution=res, # use_index=list(range(0, 30, 4)), # use_index=[0], scale_x_angle=1.0, fitler_with_renderd=True, load_imgs=False, ) print("len of test dataset", len(test_dataset)) return dataset, test_dataset class Trainer: def __init__(self, args): self.args = args self.ssim = args.ssim args.warmup_step = int(args.warmup_step * args.gradient_accumulation_steps) args.train_iters = int(args.train_iters * args.gradient_accumulation_steps) os.environ["WANDB__SERVICE_WAIT"] = "600" args.wandb_name += ( "decay_{}_substep_{}_{}_lr_{}_tv_{}_iters_{}_sw_{}_cw_{}".format( args.loss_decay, 
args.substep, args.model, args.lr, args.tv_loss_weight, args.train_iters, args.start_window_size, args.compute_window, ) ) logging_dir = os.path.join(args.output_dir, args.wandb_name) accelerator_project_config = ProjectConfiguration(logging_dir=logging_dir) ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) accelerator = Accelerator( gradient_accumulation_steps=1, # args.gradient_accumulation_steps, mixed_precision="no", log_with="wandb", project_config=accelerator_project_config, kwargs_handlers=[ddp_kwargs], ) self.gradient_accumulation_steps = args.gradient_accumulation_steps logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state, main_process_only=False) set_seed(args.seed + accelerator.process_index) print("process index", accelerator.process_index) if accelerator.is_main_process: output_path = os.path.join(logging_dir, f"seed{args.seed}") os.makedirs(output_path, exist_ok=True) self.output_path = output_path self.rand_bg = args.rand_bg # setup the dataset dataset, test_dataset = create_dataset(args) self.test_dataset = test_dataset dataset_dir = test_dataset.data_dir self.dataset = dataset gaussian_path = os.path.join(dataset_dir, "point_cloud.ply") aabb = self.setup_eval( args, gaussian_path, white_background=True, ) self.aabb = aabb self.num_frames = int(args.num_frames) self.window_size_schduler = LinearStepAnneal( args.train_iters, start_state=[args.start_window_size], end_state=[13], plateau_iters=0, warmup_step=300, ) test_dataloader = torch.utils.data.DataLoader( test_dataset, batch_size=args.batch_size, shuffle=False, drop_last=True, num_workers=0, collate_fn=camera_dataset_collate_fn_img, ) # why prepare here again? 
test_dataloader = accelerator.prepare(test_dataloader) self.test_dataloader = cycle(test_dataloader) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, shuffle=False, drop_last=False, num_workers=0, collate_fn=camera_dataset_collate_fn, ) # why prepare here again? dataloader = accelerator.prepare(dataloader) self.dataloader = cycle(dataloader) self.train_iters = args.train_iters self.accelerator = accelerator # init traiable params E_nu_list = self.init_trainable_params() for p in E_nu_list: p.requires_grad = True self.E_nu_list = E_nu_list self.setup_simulation(dataset_dir, grid_size=args.grid_size) if args.checkpoint_path == "None": args.checkpoint_path = None if args.checkpoint_path is not None: self.load(args.checkpoint_path) trainable_params = list(self.sim_fields.parameters()) + self.E_nu_list optim_list = [ {"params": self.E_nu_list, "lr": args.lr * 1e-10}, { "params": self.sim_fields.parameters(), "lr": args.lr, "weight_decay": 1e-4, }, # {"params": self.velo_fields.parameters(), "lr": args.lr * 1e-3, "weight_decay": 1e-4}, ] self.freeze_velo = True self.velo_optimizer = None else: trainable_params = list(self.sim_fields.parameters()) + self.E_nu_list optim_list = [ {"params": self.E_nu_list, "lr": args.lr * 1e-10}, { "params": self.sim_fields.parameters(), "lr": args.lr * 1e-10, "weight_decay": 1e-4, }, ] self.freeze_velo = False self.window_size_schduler.warmup_step = 800 velo_optim = [ { "params": self.velo_fields.parameters(), "lr": args.lr * 0.1, "weight_decay": 1e-4, }, ] self.velo_optimizer = torch.optim.AdamW( velo_optim, lr=args.lr, weight_decay=0.0, ) self.velo_scheduler = get_linear_schedule_with_warmup( optimizer=self.velo_optimizer, num_warmup_steps=args.warmup_step, num_training_steps=args.train_iters, ) self.velo_optimizer, self.velo_scheduler = accelerator.prepare( self.velo_optimizer, self.velo_scheduler ) self.optimizer = torch.optim.AdamW( optim_list, lr=args.lr, weight_decay=0.0, ) self.trainable_params = 
trainable_params self.scheduler = get_linear_schedule_with_warmup( optimizer=self.optimizer, num_warmup_steps=args.warmup_step, num_training_steps=args.train_iters, ) self.sim_fields, self.optimizer, self.scheduler = accelerator.prepare( self.sim_fields, self.optimizer, self.scheduler ) self.velo_fields = accelerator.prepare(self.velo_fields) # setup train info self.step = 0 self.batch_size = args.batch_size self.tv_loss_weight = args.tv_loss_weight self.log_iters = args.log_iters self.wandb_iters = args.wandb_iters self.max_grad_norm = args.max_grad_norm self.use_wandb = args.use_wandb if self.accelerator.is_main_process: if args.use_wandb: run = wandb.init( config=dict(args), dir=self.output_path, **{ "mode": "online", "entity": args.wandb_entity, "project": args.wandb_project, }, ) wandb.run.log_code(".") wandb.run.name = args.wandb_name print(f"run dir: {run.dir}") self.wandb_folder = run.dir os.makedirs(self.wandb_folder, exist_ok=True) def init_trainable_params( self, ): # init young modulus and poisson ratio young_numpy = np.exp(np.random.uniform(np.log(1e-3), np.log(1e3))).astype( np.float32 ) young_numpy = 1e6 * 1.0 young_modulus = torch.tensor(young_numpy, dtype=torch.float32).to( self.accelerator.device ) poisson_numpy = np.random.uniform(0.1, 0.4) poisson_ratio = torch.tensor(poisson_numpy, dtype=torch.float32).to( self.accelerator.device ) trainable_params = [young_modulus, poisson_ratio] print( "init young modulus: ", young_modulus.item(), "poisson ratio: ", poisson_ratio.item(), ) return trainable_params def setup_simulation(self, dataset_dir, grid_size=100): device = "cuda:{}".format(self.accelerator.process_index) xyzs = self.render_params.gaussians.get_xyz.detach().clone() sim_xyzs = xyzs[self.sim_mask_in_raw_gaussian, :] sim_cov = ( self.render_params.gaussians.get_covariance()[ self.sim_mask_in_raw_gaussian, : ] .detach() .clone() ) # scale, and shift pos_max = sim_xyzs.max() pos_min = sim_xyzs.min() scale = (pos_max - pos_min) * 1.8 shift = 
def setup_simulation(self, dataset_dir, grid_size=100):
    """Build the MPM simulation state from the Gaussian scene.

    Normalizes the simulated Gaussian centers into the unit MPM grid,
    optionally augments them with pre-computed internal fill points,
    k-means-downsamples the particle set, initializes the warp MPM
    state/model/solver, applies grid boundary conditions around the
    non-moving region, and creates the trainable spatial (material) and
    velocity fields.

    Args:
        dataset_dir: directory that may contain ``internal_filled_points.ply``,
            ``moving_part_points.ply`` (required) and ``stem_points.ply``.
        grid_size: MPM grid resolution per axis.
    """
    device = "cuda:{}".format(self.accelerator.process_index)
    xyzs = self.render_params.gaussians.get_xyz.detach().clone()
    sim_xyzs = xyzs[self.sim_mask_in_raw_gaussian, :]
    sim_cov = (
        self.render_params.gaussians.get_covariance()[
            self.sim_mask_in_raw_gaussian, :
        ]
        .detach()
        .clone()
    )
    # Normalization that maps the simulated region into the unit grid:
    # scale = 1.8x the scalar extent, shift recenters with a 0.25 margin.
    pos_max = sim_xyzs.max()
    pos_min = sim_xyzs.min()
    scale = (pos_max - pos_min) * 1.8
    shift = -pos_min + (pos_max - pos_min) * 0.25
    self.scale, self.shift = scale, shift
    print("scale, shift", scale, shift)
    # Optional interior fill points (subsampled to 25%) so the volume is
    # not hollow; they are simulated but excluded from rendering.
    filled_in_points_path = os.path.join(dataset_dir, "internal_filled_points.ply")
    if os.path.exists(filled_in_points_path):
        fill_xyzs = pcu.load_mesh_v(filled_in_points_path)  # [n, 3]
        fill_xyzs = fill_xyzs[
            np.random.choice(
                fill_xyzs.shape[0], int(fill_xyzs.shape[0] * 0.25), replace=False
            )
        ]
        fill_xyzs = torch.from_numpy(fill_xyzs).float().to("cuda")
        self.fill_xyzs = fill_xyzs
        print(
            "loaded {} internal filled points from: ".format(fill_xyzs.shape[0]),
            filled_in_points_path,
        )
    else:
        self.fill_xyzs = None
    if self.fill_xyzs is not None:
        # True for original Gaussian particles, False for fill points.
        render_mask_in_sim_pts = torch.cat(
            [
                torch.ones_like(sim_xyzs[:, 0]).bool(),
                torch.zeros_like(fill_xyzs[:, 0]).bool(),
            ],
            dim=0,
        ).to(device)
        sim_xyzs = torch.cat([sim_xyzs, fill_xyzs], dim=0)
        sim_cov = torch.cat(
            [sim_cov, sim_cov.new_ones((fill_xyzs.shape[0], sim_cov.shape[-1]))],
            dim=0,
        )
        self.render_mask = render_mask_in_sim_pts
    else:
        self.render_mask = torch.ones_like(sim_xyzs[:, 0]).bool().to(device)
    # Map into normalized simulation coordinates and build a 20%-inflated AABB.
    sim_xyzs = (sim_xyzs + shift) / scale
    sim_aabb = torch.stack(
        [torch.min(sim_xyzs, dim=0)[0], torch.max(sim_xyzs, dim=0)[0]], dim=0
    )
    sim_aabb = (
        sim_aabb - torch.mean(sim_aabb, dim=0, keepdim=True)
    ) * 1.2 + torch.mean(sim_aabb, dim=0, keepdim=True)
    print("simulation aabb: ", sim_aabb)
    # Downsample particles with k-means; each Gaussian is later driven by its
    # top-k nearest simulation particles (indices stored in self.top_k_index).
    downsample_scale = self.args.downsample_scale
    num_cluster = int(sim_xyzs.shape[0] * downsample_scale)
    sim_xyzs = downsample_with_kmeans_gpu(sim_xyzs, num_cluster)
    sim_gaussian_pos = self.render_params.gaussians.get_xyz.detach().clone()[
        self.sim_mask_in_raw_gaussian, :
    ]
    sim_gaussian_pos = (sim_gaussian_pos + shift) / scale
    # Negate distances so topk picks the nearest particles.
    cdist = torch.cdist(sim_gaussian_pos, sim_xyzs) * -1.0
    _, top_k_index = torch.topk(cdist, self.args.top_k, dim=-1)
    self.top_k_index = top_k_index
    print("Downsampled to: ", sim_xyzs.shape[0], "by", downsample_scale)
    # compute volue for each point.
    points_volume = get_volume(sim_xyzs.detach().cpu().numpy())
    num_particles = sim_xyzs.shape[0]
    wp.init()
    # Debug/verify modes make warp slower but catch CUDA errors early.
    wp.config.mode = "debug"
    wp.config.verify_cuda = True
    mpm_state = MPMStateStruct()
    mpm_state.init(num_particles, device=device, requires_grad=True)
    self.particle_init_position = sim_xyzs.clone()
    mpm_state.from_torch(
        self.particle_init_position.clone(),
        torch.from_numpy(points_volume).float().to(device).clone(),
        None,  # set cov to None, since it is not used.
        device=device,
        requires_grad=True,
        n_grid=grid_size,
        grid_lim=1.0,
    )
    mpm_model = MPMModelStruct()
    mpm_model.init(num_particles, device=device, requires_grad=True)
    mpm_model.init_other_params(n_grid=grid_size, grid_lim=1.0, device=device)
    material_params = {
        "material": "jelly",  # "jelly", "metal", "sand", "foam", "snow", "plasticine", "neo-hookean"
        "g": [0.0, 0.0, 0.0],
        "density": 2000,  # kg / m^3
        "grid_v_damping_scale": 1.1,  # 0.999,
    }
    self.v_damping = material_params["grid_v_damping_scale"]
    self.material_name = material_params["material"]
    mpm_solver = MPMWARPDiff(
        num_particles, n_grid=grid_size, grid_lim=1.0, device=device
    )
    mpm_solver.set_parameters_dict(mpm_model, mpm_state, material_params)
    self.mpm_state, self.mpm_model, self.mpm_solver = (
        mpm_state,
        mpm_model,
        mpm_solver,
    )
    # Boundary condition: freeze all particles far from the annotated moving
    # part, via a grid-level mask (not per-particle BCs).
    moving_pts_path = os.path.join(dataset_dir, "moving_part_points.ply")
    if os.path.exists(moving_pts_path):
        moving_pts = pcu.load_mesh_v(moving_pts_path)
        moving_pts = torch.from_numpy(moving_pts).float().to(device)
        moving_pts = (moving_pts + shift) / scale
        freeze_mask = find_far_points(
            sim_xyzs, moving_pts, thres=0.25 / grid_size
        ).bool()
        freeze_pts = sim_xyzs[freeze_mask, :]
        grid_freeze_mask = apply_grid_bc_w_freeze_pts(
            grid_size, 1.0, freeze_pts, mpm_solver
        )
        self.freeze_mask = freeze_mask
        # does not prefer boundary condition on particle
        # freeze_mask_select = setup_boundary_condition_with_points(sim_xyzs, moving_pts,
        #     self.mpm_solver, self.mpm_state, thres=0.5 / grid_size)
        # self.freeze_mask = freeze_mask_select.bool()
    else:
        raise NotImplementedError
    num_freeze_pts = self.freeze_mask.sum()
    print(
        "num freeze pts in total",
        num_freeze_pts.item(),
        "num moving pts",
        num_particles - num_freeze_pts.item(),
    )
    # init fields for simulation, e.g. density, external force, etc.
    # padd init density, youngs,
    density = (
        torch.ones_like(self.particle_init_position[..., 0])
        * material_params["density"]
    )
    youngs_modulus = (
        torch.ones_like(self.particle_init_position[..., 0])
        * self.E_nu_list[0].detach()
    )
    poisson_ratio = torch.ones_like(self.particle_init_position[..., 0]) * 0.3
    # load stem for higher density
    stem_pts_path = os.path.join(dataset_dir, "stem_points.ply")
    if os.path.exists(stem_pts_path):
        stem_pts = pcu.load_mesh_v(stem_pts_path)
        stem_pts = torch.from_numpy(stem_pts).float().to(device)
        stem_pts = (stem_pts + shift) / scale
        no_stem_mask = find_far_points(
            sim_xyzs, stem_pts, thres=2.0 / grid_size
        ).bool()
        stem_mask = torch.logical_not(no_stem_mask)
        # NOTE(review): 2000 equals the base density above, so this currently
        # has no effect — presumably a leftover from a different base density.
        density[stem_mask] = 2000
        print("num stem pts", stem_mask.sum().item())
    self.density = density
    self.young_modulus = youngs_modulus
    self.poisson_ratio = poisson_ratio
    # set density, youngs, poisson
    mpm_state.reset_density(
        density.clone(),
        torch.ones_like(density).type(torch.int),
        device,
        update_mass=True,
    )
    mpm_solver.set_E_nu_from_torch(
        mpm_model, youngs_modulus.clone(), poisson_ratio.clone(), device
    )
    mpm_solver.prepare_mu_lam(mpm_model, mpm_state, device)
    # Trainable fields: 1-channel material field and 3-channel velocity field.
    self.sim_fields = create_spatial_fields(self.args, 1, sim_aabb)
    self.sim_fields.train()
    self.args.sim_res = 24
    # self.velo_fields = create_velocity_model(self.args, sim_aabb)
    self.velo_fields = create_spatial_fields(
        self.args, 3, sim_aabb, add_entropy=False
    )
    self.velo_fields.train()
def set_simulation_state(
    self,
    init_xyzs,
    init_velocity,
    device,
    requires_grad=False,
    use_precompute_F=False,
    use_density=True,
):
    """Reset the warp MPM state to a fresh initial condition.

    Args:
        init_xyzs: [n, 3] particle positions (normalized sim coordinates).
        init_velocity: [n, 3] initial particle velocities.
        device: warp/torch device string, e.g. ``"cuda:0"``.
        requires_grad: currently unused — ``reset_state`` is always called
            with ``requires_grad=True``.
        use_precompute_F: if True, reset positions to the undeformed rest
            state and recover deformation gradient F and affine C from
            ``init_xyzs`` instead of starting from identity.
        use_density: unused in this method (kept for interface parity).
    """
    initial_position_time0 = self.particle_init_position.clone()
    if use_precompute_F:
        # Rest-state positions + F/C reconstructed from the deformed pose.
        self.mpm_state.reset_state(
            initial_position_time0,
            None,
            init_velocity.clone(),
            device=device,
            requires_grad=True,
        )
        init_xyzs_wp = from_torch_safe(
            init_xyzs.clone().detach().contiguous(),
            dtype=wp.vec3,
            requires_grad=True,
        )
        self.mpm_solver.restart_and_compute_F_C(
            self.mpm_model, self.mpm_state, init_xyzs_wp, device=device
        )
    else:
        self.mpm_state.reset_state(
            init_xyzs.clone(),
            None,
            init_velocity.clone(),
            device=device,
            requires_grad=True,
        )

def get_density_velocity(self, time_stamp: float, device, requires_grad=True):
    """Query the trainable fields at the non-frozen particle locations.

    Args:
        time_stamp: unused here — fields are queried only spatially.
        device: unused; tensors inherit the fields' device.
        requires_grad: unused; gradients flow through the field queries.

    Returns:
        tuple: ``(density, young_padded, velocity, query_mask)`` where
        ``density`` and ``young_padded`` are full-length [n] tensors (frozen
        particles keep their initial values), ``velocity`` is [m, 3] for the
        m non-frozen particles only, and ``query_mask`` is the [n] boolean
        mask selecting those particles.
    """
    initial_position_time0 = self.particle_init_position.clone()
    # Only non-frozen particles have learnable material/velocity.
    query_mask = torch.logical_not(self.freeze_mask)
    query_pts = initial_position_time0[query_mask, :]
    sim_params = self.sim_fields(query_pts)
    # density = sim_params[..., 0]  # 0.1
    young_modulus = sim_params[..., 0]
    # young_modulus = torch.exp(sim_params[..., 0]) + init_young
    young_modulus = torch.clamp(young_modulus, 1e-3, 1e8)
    # young_padded = torch.ones_like(initial_position_time0[..., 0]) * init_young
    # Field output is a residual added on top of the initial Young's modulus.
    young_padded = self.young_modulus.detach().clone()
    young_padded[query_mask] = young_padded[query_mask] + young_modulus * 1
    density = self.density.detach().clone()
    velocity = self.velo_fields(query_pts)[..., :3]
    # scaling.
    velocity = velocity * 0.1
    return density, young_padded, velocity, query_mask
def train_one_step(self):
    """Run one optimization step over a growing window of video frames.

    For each frame index inside the current window, queries the material and
    velocity fields, runs the differentiable MPM simulation up to that frame,
    renders the displaced Gaussians, and accumulates a photometric
    (L2 + SSIM) loss plus smoothness regularizers. Gradients are applied via
    the material/velocity optimizers subject to gradient accumulation.
    """
    self.sim_fields.train()
    self.velo_fields.train()
    accelerator = self.accelerator
    device = "cuda:{}".format(accelerator.process_index)
    data = next(self.dataloader)
    cam = data["cam"][0]
    # Frame timestamps in (0, 1]; frame 0 is the undeformed reference.
    time_stamps = np.linspace(0, 1, self.num_frames).astype(np.float32)[1:]
    gt_videos = data["video_clip"][0, 1 : self.num_frames, ...]
    window_size = int(self.window_size_schduler.compute_state(self.step)[0])
    print("window size", window_size)
    # Velocity is only optimized while the window is small; once the window
    # passes this threshold the velocity field is frozen.
    stop_velo_opt_thres = 4
    do_velo_opt = not self.freeze_velo
    if not do_velo_opt:
        stop_velo_opt_thres = (
            0  # stop velocity optimization if we are loading from checkpoint
        )
    if window_size >= stop_velo_opt_thres:
        self.velo_fields.eval()
        do_velo_opt = False
    rendered_video_list = []
    log_loss_dict = {
        "loss": [],
        "l2_loss": [],
        "psnr": [],
        "ssim": [],
    }
    init_xyzs = self.particle_init_position.clone()
    num_particles = init_xyzs.shape[0]
    # delta_time = 1.0 / (self.num_frames - 1)
    delta_time = 1.0 / 30
    substep_size = delta_time / self.args.substep
    num_substeps = int(delta_time / substep_size)
    # Only the last `compute_window` frames of the window contribute loss.
    start_time_idx = max(0, window_size - self.args.compute_window)
    for time_idx in range(start_time_idx, window_size):
        # time_stamp = time_stamps[time_idx]
        time_stamp = time_stamps[0]  # fix to begining.. Start at the begining
        density, youngs_padded, init_velocity, query_mask = (
            self.get_density_velocity(time_stamp, device)
        )
        if not do_velo_opt:
            init_velocity = init_velocity.detach()
        # Frozen particles keep zero initial velocity.
        padded_velocity = torch.zeros_like(init_xyzs)
        padded_velocity[query_mask, :] = init_velocity
        gt_frame = gt_videos[[time_idx]]
        # Truncate backprop: only the last `grad_window` frames' substeps
        # carry gradients (unless velocity is being optimized).
        extra_no_grad_step = max(
            0, (time_idx - self.args.grad_window + 1) * num_substeps
        )
        if do_velo_opt:
            extra_no_grad_step = 0
        num_step_with_grad = num_substeps * (time_idx + 1) - extra_no_grad_step
        particle_pos = MPMDifferentiableSimulation.apply(
            self.mpm_solver,
            self.mpm_state,
            self.mpm_model,
            0,
            substep_size,
            num_step_with_grad,
            init_xyzs,
            padded_velocity,
            youngs_padded,
            self.E_nu_list[1],
            density,
            query_mask,
            None,
            device,
            True,
            extra_no_grad_step,
        )
        # Back to world coordinates; render displacement relative to rest pose.
        gaussian_pos = particle_pos * self.scale - self.shift
        undeformed_gaussian_pos = (
            self.particle_init_position * self.scale - self.shift
        )
        disp_offset = gaussian_pos - undeformed_gaussian_pos.detach()
        # gaussian_pos.requires_grad = True
        simulated_video = render_gaussian_seq_w_mask_with_disp(
            cam,
            self.render_params,
            undeformed_gaussian_pos.detach(),
            self.top_k_index,
            [disp_offset],
            self.sim_mask_in_raw_gaussian,
        )
        rendered_video_list.append(simulated_video.detach())
        l2_loss = 0.5 * F.mse_loss(simulated_video, gt_frame, reduction="mean")
        ssim_loss = compute_ssim(simulated_video, gt_frame)
        loss = l2_loss * (1.0 - self.ssim) + (1.0 - ssim_loss) * self.ssim
        sm_velo_loss = self.velo_fields.compute_smoothess_loss()
        if time_idx > 2 or window_size > stop_velo_opt_thres:
            sm_velo_loss = sm_velo_loss.detach()
        sm_spatial_loss = self.sim_fields.compute_smoothess_loss()
        sm_loss = sm_velo_loss + sm_spatial_loss
        loss = (
            loss * (self.args.loss_decay**time_idx) + sm_loss * self.tv_loss_weight
        )
        loss = loss / self.args.compute_window
        # Backward per frame; gradients accumulate across the window.
        loss.backward()
        with torch.no_grad():
            psnr = compute_psnr(simulated_video, gt_frame).mean()
            log_loss_dict["loss"].append(loss.item())
            log_loss_dict["l2_loss"].append(l2_loss.item())
            log_loss_dict["psnr"].append(psnr.item())
            log_loss_dict["ssim"].append(ssim_loss.item())
    # subtep-4: pass gradients to mpm solver
    nu_grad_norm = self.E_nu_list[1].grad.norm(2).item()
    spatial_grad_norm = 0
    for p in self.sim_fields.parameters():
        if p.grad is not None:
            spatial_grad_norm += p.grad.norm(2).item()
    velo_grad_norm = 0
    for p in self.velo_fields.parameters():
        if p.grad is not None:
            velo_grad_norm += p.grad.norm(2).item()
    renderd_video = torch.cat(rendered_video_list, dim=0)
    renderd_video = torch.clamp(renderd_video, 0.0, 1.0)
    visual_video = (renderd_video.detach().cpu().numpy() * 255.0).astype(np.uint8)
    gt_video = (gt_videos.detach().cpu().numpy() * 255.0).astype(np.uint8)
    # Optimizer step gated by gradient accumulation / logging boundaries.
    if (
        self.step % self.gradient_accumulation_steps == 0
        or self.step == (self.train_iters - 1)
        or (self.step % self.log_iters == self.log_iters - 1)
    ):
        torch.nn.utils.clip_grad_norm_(
            self.trainable_params,
            self.max_grad_norm,
            error_if_nonfinite=False,
        )  # error if nonfinite is false
        self.optimizer.step()
        self.optimizer.zero_grad()
        if do_velo_opt:
            assert self.velo_optimizer is not None
            torch.nn.utils.clip_grad_norm_(
                self.velo_fields.parameters(),
                self.max_grad_norm * 10,
                error_if_nonfinite=False,
            )  # error if nonfinite is false
            self.velo_optimizer.step()
            self.velo_optimizer.zero_grad()
            self.velo_scheduler.step()
        # Keep the global material scalars in a physically plausible range.
        with torch.no_grad():
            self.E_nu_list[0].data.clamp_(1e-3, 2000)
            self.E_nu_list[1].data.clamp_(1e-2, 0.449)
    self.scheduler.step()
    for k, v in log_loss_dict.items():
        log_loss_dict[k] = np.mean(v)
    print(log_loss_dict)
    print(
        "nu: ",
        self.E_nu_list[1].item(),
        nu_grad_norm,
        spatial_grad_norm,
        velo_grad_norm,
        "young_mean, max:",
        youngs_padded.mean().item(),
        youngs_padded.max().item(),
        do_velo_opt,
    )
    if accelerator.is_main_process and (self.step % self.wandb_iters == 0):
        with torch.no_grad():
            wandb_dict = {
                "nu_grad_norm": nu_grad_norm,
                "spatial_grad_norm": spatial_grad_norm,
                "velo_grad_norm": velo_grad_norm,
                "nu": self.E_nu_list[1].item(),
                # "mean_density": density.mean().item(),
                "mean_E": youngs_padded.mean().item(),
                "max_E": youngs_padded.max().item(),
                "min_E": youngs_padded.min().item(),
                "smoothness_loss": sm_loss.item(),
                "window_size": window_size,
                "velo_mean": init_velocity.mean().item(),
                "velo_max": init_velocity.max().item(),
            }
            # Full-rollout PSNR: frame 0 of the inference video is the rest
            # pose, so it is skipped when comparing against gt.
            simulated_video = self.inference(cam)
            sim_video_torch = (
                torch.from_numpy(simulated_video).float().to(device) / 255.0
            )
            gt_video_torch = torch.from_numpy(gt_video).float().to(device) / 255.0
            full_psnr = compute_psnr(sim_video_torch[1:], gt_video_torch)
            first_psnr = full_psnr[:6].mean().item()
            last_psnr = full_psnr[-6:].mean().item()
            full_psnr = full_psnr.mean().item()
            wandb_dict["full_psnr"] = full_psnr
            wandb_dict["first_psnr"] = first_psnr
            wandb_dict["last_psnr"] = last_psnr
            wandb_dict.update(log_loss_dict)
            if self.step % int(5 * self.wandb_iters) == 0:
                wandb_dict["rendered_video"] = wandb.Video(
                    visual_video, fps=visual_video.shape[0]
                )
                wandb_dict["gt_video"] = wandb.Video(
                    gt_video,
                    fps=gt_video.shape[0],
                )
                wandb_dict["inference_video"] = wandb.Video(
                    simulated_video,
                    fps=simulated_video.shape[0],
                )
                simulated_video = self.inference(
                    cam, num_sec=3, substep=self.args.substep
                )
                wandb_dict["inference_video_t3"] = wandb.Video(
                    simulated_video,
                    fps=simulated_video.shape[0] // 3,
                )
                simulated_video = self.inference(
                    cam, velo_scaling=5.0, num_sec=3, substep=self.args.substep
                )
                wandb_dict["inference_video_v5_t3"] = wandb.Video(
                    simulated_video,
                    fps=simulated_video.shape[0] // 3,
                )
            if self.use_wandb:
                wandb.log(wandb_dict, step=self.step)
    self.accelerator.wait_for_everyone()
def train(self):
    """Main training loop; saves a checkpoint every ``log_iters`` steps."""
    # might remove tqdm when multiple node
    for index in tqdm(range(self.step, self.train_iters), desc="Training progress"):
        self.train_one_step()
        if self.step % self.log_iters == self.log_iters - 1:
            if self.accelerator.is_main_process:
                self.save()
            # self.test()
            # self.accelerator.wait_for_everyone()
        self.step += 1
    if self.accelerator.is_main_process:
        self.save()

@torch.no_grad()
def inference(
    self, cam, velo_scaling=1.0, num_sec=1, nu=None, young_scaling=1.0, substep=20
):
    """Roll out the simulation without gradients and render a video.

    Args:
        cam: camera (or camera trajectory) passed to the renderer.
        velo_scaling: multiplier on the learned initial velocity field.
        num_sec: number of ``num_frames - 1`` frame groups to simulate.
        nu: Poisson's ratio override; defaults to the trained value.
        young_scaling: multiplier on the learned Young's modulus field.
        substep: number of MPM substeps per simulation delta_time.

    Returns:
        np.ndarray: uint8 video array of rendered frames (rest pose first).
    """
    self.sim_fields.eval()
    self.velo_fields.eval()
    device = "cuda:{}".format(self.accelerator.process_index)
    time_stamps = np.linspace(0, 1, self.num_frames).astype(np.float32)[1:]
    time_idx = 0
    time_stamp = time_stamps[time_idx]
    density, youngs_padded, init_velocity, query_mask = self.get_density_velocity(
        time_stamp, device
    )
    youngs_padded = youngs_padded * young_scaling
    init_xyzs = self.particle_init_position
    padded_velocity = torch.zeros_like(init_xyzs)
    padded_velocity[query_mask, :] = init_velocity * velo_scaling
    num_particles = init_xyzs.shape[0]
    delta_time = 1.0 / (self.num_frames - 1)
    delta_time = 1.0 / 30
    substep_size = delta_time / substep
    num_substeps = int(delta_time / substep_size)
    # reset state
    self.set_simulation_state(
        init_xyzs,
        padded_velocity,
        device,
        requires_grad=True,
        use_precompute_F=False,
        use_density=False,
    )
    if nu is None:
        # FIX: the unused unpacking of E (self.E_nu_list[0]) was removed;
        # E is always taken from the (scaled) learned field below.
        nu = self.E_nu_list[1].item()
    E_wp = from_torch_safe(youngs_padded, dtype=wp.float32, requires_grad=False)
    self.mpm_solver.set_E_nu(self.mpm_model, E_wp, nu, device=device)
    self.mpm_solver.prepare_mu_lam(self.mpm_model, self.mpm_state, device=device)
    # mass = density * volume, computed per particle on the warp side.
    wp.launch(
        kernel=get_float_array_product,
        dim=num_particles,
        inputs=[
            self.mpm_state.particle_density,
            self.mpm_state.particle_vol,
            self.mpm_state.particle_mass,
        ],
        device=device,
    )
    pos_list = [self.particle_init_position.clone() * self.scale - self.shift]
    for i in tqdm(range((self.num_frames - 1) * num_sec)):
        # FIX: the loop variable used to shadow the `substep` parameter, and
        # p2g2p was hard-coded to "cuda:0" — broken for non-zero ranks.
        for substep_idx in range(num_substeps):
            self.mpm_solver.p2g2p(
                self.mpm_model,
                self.mpm_state,
                substep_idx,
                substep_size,
                device=device,
            )
        pos = wp.to_torch(self.mpm_state.particle_x).clone()
        pos = (pos * self.scale) - self.shift
        pos_list.append(pos)
    init_pos = pos_list[0].clone()
    pos_diff_list = [frame_pos - init_pos for frame_pos in pos_list]
    video_array = render_gaussian_seq_w_mask_with_disp(
        cam,
        self.render_params,
        init_pos,
        self.top_k_index,
        pos_diff_list,
        self.sim_mask_in_raw_gaussian,
    )
    video_numpy = video_array.detach().cpu().numpy() * 255
    video_numpy = np.clip(video_numpy, 0, 255).astype(np.uint8)
    return video_numpy

def save(self):
    """Save the velocity/material field weights under a step-numbered dir."""
    output_path = os.path.join(
        self.output_path, f"checkpoint_model_{self.step:06d}"
    )
    os.makedirs(output_path, exist_ok=True)
    models = {
        "velo_fields": self.accelerator.unwrap_model(
            self.velo_fields, keep_fp32_wrapper=True
        ),
        "sim_fields": self.accelerator.unwrap_model(
            self.sim_fields, keep_fp32_wrapper=True
        ),
    }
    for model_name, model in models.items():
        model_path = os.path.join(output_path, model_name + ".pt")
        torch.save(model.state_dict(), model_path)

def load(self, checkpoint_dir):
    """Load field weights from ``checkpoint_dir``.

    ``sim_fields`` is only restored in eval mode (``args.run_eval``); during
    training it is re-learned from scratch.
    """
    for model_name, model in (
        ("velo_fields", self.velo_fields),
        ("sim_fields", self.sim_fields),
    ):
        if model_name == "sim_fields" and (not self.args.run_eval):
            continue
        model_path = os.path.join(checkpoint_dir, model_name + ".pt")
        # FIX: map_location="cpu" so checkpoints saved on another device
        # load cleanly; load_state_dict copies onto the module's device.
        model.load_state_dict(torch.load(model_path, map_location="cpu"))
        print("=> loaded: ", model_path)
def setup_eval(self, args, gaussian_path, white_background=True):
    """Load the Gaussian scene for evaluation and derive the scene AABB.

    Args:
        args: unused here; kept for interface parity with training setup.
        gaussian_path: path to the Gaussian .ply checkpoint.
        white_background: render background color (white or black).

    Returns:
        torch.Tensor: [2, 3] axis-aligned bounding box of the Gaussian
        centers, inflated by 10% around its center.

    Side effects: sets ``self.render_params``, ``self.clean_xyzs`` and
    ``self.sim_mask_in_raw_gaussian``.
    """
    # Lightweight render-config containers (attributes act as defaults).
    class RenderPipe(NamedTuple):
        convert_SHs_python = False
        compute_cov3D_python = False
        debug = False

    class RenderParams(NamedTuple):
        render_pipe: RenderPipe
        bg_color: bool
        gaussians: GaussianModel
        camera_list: list

    gaussians = GaussianModel(3)
    camera_list = self.dataset.test_camera_list
    gaussians.load_ply(gaussian_path)
    gaussians.detach_grad()
    print(
        "load gaussians from: {}".format(gaussian_path),
        "... num gaussians: ",
        gaussians._xyz.shape[0],
    )
    bg_color = [1, 1, 1] if white_background else [0, 0, 0]
    background = torch.tensor(bg_color, dtype=torch.float32, device="cuda")
    render_pipe = RenderPipe()
    render_params = RenderParams(
        render_pipe=render_pipe,
        bg_color=background,
        gaussians=gaussians,
        camera_list=camera_list,
    )
    self.render_params = render_params
    # get_gaussian scene box
    scaler = 1.1
    points = gaussians._xyz
    min_xyz = torch.min(points, dim=0)[0]
    max_xyz = torch.max(points, dim=0)[0]
    center = (min_xyz + max_xyz) / 2
    scaled_min_xyz = (min_xyz - center) * scaler + center
    scaled_max_xyz = (max_xyz - center) * scaler + center
    aabb = torch.stack([scaled_min_xyz, scaled_max_xyz], dim=0)
    # Simulated subset: Gaussians near the cleaned object point cloud; if no
    # cleaned cloud exists, every Gaussian is simulated.
    gaussian_dir = os.path.dirname(gaussian_path)
    clean_points_path = os.path.join(gaussian_dir, "clean_object_points.ply")
    if os.path.exists(clean_points_path):
        clean_xyzs = pcu.load_mesh_v(clean_points_path)
        clean_xyzs = torch.from_numpy(clean_xyzs).float().to("cuda")
        self.clean_xyzs = clean_xyzs
        print(
            "loaded {} clean points from: ".format(clean_xyzs.shape[0]),
            clean_points_path,
        )
        # we can use tight threshold here
        not_sim_maks = find_far_points(
            gaussians._xyz, clean_xyzs, thres=0.01
        ).bool()
        sim_mask_in_raw_gaussian = torch.logical_not(not_sim_maks)  # [N]
        self.sim_mask_in_raw_gaussian = sim_mask_in_raw_gaussian
    else:
        self.clean_xyzs = None
        self.sim_mask_in_raw_gaussian = torch.ones_like(
            gaussians._xyz[:, 0]
        ).bool()
    return aabb
def eval(self):
    """Render one evaluation rollout with hand-picked material overrides
    and save it as a GIF in the current directory."""
    data = next(self.dataloader)
    cam = data["cam"][0]
    # Manually chosen overrides for the qualitative evaluation rollout.
    nu = 0.1
    young_scaling = 5000.0
    substep = 800  # 1e-4
    video_numpy = self.inference(
        cam,
        velo_scaling=5.0,
        num_sec=3,
        nu=nu,
        young_scaling=young_scaling,
        substep=substep,
    )
    # [T, C, H, W] -> [T, H, W, C] for imageio.
    video_numpy = np.transpose(video_numpy, [0, 2, 3, 1])
    from motionrep.utils.io_utils import save_video_imageio, save_gif_imageio

    # output_dir = os.path.join(self.output_path, "simulation")
    output_dir = "./"
    save_path = os.path.join(
        output_dir,
        "eval_fill2k_video_nu_{}_ys_{}_substep_{}_grid_{}".format(
            nu, young_scaling, substep, self.args.grid_size
        )
        + ".gif",
    )
    print("save video to ", save_path)
    # save_video_imageio(save_path, video_numpy, fps=12)
    save_gif_imageio(save_path, video_numpy, fps=12)


def parse_args():
    """Parse CLI arguments and merge them into the YAML config.

    Returns:
        The config object produced by ``create_config`` from the ``--config``
        file plus CLI overrides; ``LOCAL_RANK`` from the environment takes
        precedence over ``--local_rank``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="config.yml")
    # dataset params
    parser.add_argument(
        "--dataset_dir",
        type=str,
        default="../../data/physics_dreamer/alocasia_nerfstudio",
    )
    parser.add_argument(
        "--dataset_res",
        type=str,
        default="large",  # ["middle", "small", "large"]
    )
    parser.add_argument("--model", type=str, default="se3_field")
    parser.add_argument("--feat_dim", type=int, default=64)
    parser.add_argument("--num_decoder_layers", type=int, default=3)
    parser.add_argument("--decoder_hidden_size", type=int, default=64)
    parser.add_argument("--spatial_res", type=int, default=32)
    # NOTE(review): argparse `type=bool` treats any non-empty string
    # (including "False") as True — confirm how the config layer handles
    # this before passing --zero_init on the command line.
    parser.add_argument("--zero_init", type=bool, default=True)
    parser.add_argument("--entropy_cls", type=int, default=0)
    # FIX: was `type=str` with an int default — a CLI override produced a
    # string that crashes np.linspace(0, 1, self.num_frames) downstream.
    parser.add_argument("--num_frames", type=int, default=14)
    parser.add_argument("--grid_size", type=int, default=32)
    parser.add_argument("--sim_res", type=int, default=24)
    parser.add_argument("--sim_output_dim", type=int, default=1)
    parser.add_argument("--substep", type=int, default=96)
    parser.add_argument("--loss_decay", type=float, default=1.0)
    parser.add_argument("--start_window_size", type=int, default=2)
    parser.add_argument("--compute_window", type=int, default=2)
    parser.add_argument("--grad_window", type=int, default=14)
    parser.add_argument("--downsample_scale", type=float, default=0.1)
    parser.add_argument("--top_k", type=int, default=8)
    # loss parameters
    parser.add_argument("--tv_loss_weight", type=float, default=1e-2)
    parser.add_argument("--ssim", type=float, default=0.5)
    # Logging and checkpointing
    parser.add_argument("--output_dir", type=str, default="../../output/inverse_sim")
    parser.add_argument("--log_iters", type=int, default=100)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default=None,
        help="path to load checkpoint from",
    )
    # training parameters
    parser.add_argument("--train_iters", type=int, default=300)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--lr", type=float, default=1e-2)
    parser.add_argument("--max_grad_norm", type=float, default=1.0)
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
    )
    # wandb parameters
    parser.add_argument("--use_wandb", action="store_true", default=False)
    parser.add_argument("--wandb_entity", type=str, default="mit-cv")
    parser.add_argument("--wandb_project", type=str, default="inverse_sim")
    parser.add_argument("--wandb_iters", type=int, default=20)
    parser.add_argument("--wandb_name", type=str, required=True)
    parser.add_argument("--run_eval", action="store_true", default=False)
    # distributed training args
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    args, extra_args = parser.parse_known_args()
    cfg = create_config(args.config, args, extra_args)
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank
        print(args.local_rank, "local rank")
    return cfg


if __name__ == "__main__":
    args = parse_args()
    # torch.backends.cuda.matmul.allow_tf32 = True
    trainer = Trainer(args)
    if args.run_eval:
        trainer.eval()
    else:
        # trainer.debug()
        trainer.train()
class MPMDifferentiableSimulation(autograd.Function):
    """Differentiable MPM rollout bridging torch autograd and a warp Tape.

    ``forward`` records ``num_substeps`` p2g2p substeps on a warp tape;
    ``backward`` injects the incoming position gradient via a surrogate
    position loss and replays the tape to obtain gradients w.r.t. initial
    velocity, E, nu, and per-particle density.
    """

    @staticmethod
    def forward(
        ctx: autograd.function.FunctionCtx,
        mpm_solver: MPMWARPDiff,
        mpm_state: MPMStateStruct,
        mpm_model: MPMModelStruct,
        substep: int,
        substep_size: float,
        num_substeps: int,
        init_pos: Float[Tensor, "n 3"],
        init_velocity: Float[Tensor, "n 3"],
        E: Float[Tensor, "n"] | Float[Tensor, "1"],
        nu: Float[Tensor, "n"] | Float[Tensor, "1"],
        particle_density: Optional[Float[Tensor, "n"] | Float[Tensor, "1"]] = None,
        density_change_mask: Optional[Int[Tensor, "n"]] = None,
        static_pos: Optional[Float[Tensor, "n 3"]] = None,
        device: str = "cuda:0",
        requires_grad: bool = True,
        extra_no_grad_steps: int = 0,
    ) -> Float[Tensor, "n 3"]:
        """Simulate ``extra_no_grad_steps`` untracked substeps followed by
        ``num_substeps`` tape-recorded substeps; returns final positions.

        Args:
            density_change_mask: [n] 0 or 1. 1 means the density of this
                particle can change.
            static_pos: if given, the state is reset to these (rest)
                positions and F/C are recovered from ``init_pos``.
        """
        num_particles = init_pos.shape[0]
        if static_pos is None:
            mpm_state.reset_state(
                init_pos.clone(),
                None,
                init_velocity,  # .clone(),
                device=device,
                requires_grad=requires_grad,
            )
        else:
            mpm_state.reset_state(
                static_pos.clone(),
                None,
                init_velocity,  # .clone(),
                device=device,
                requires_grad=requires_grad,
            )
            # NOTE(review): branch placement reconstructed from formatting —
            # mirrors the use_precompute_F path in set_simulation_state,
            # where F/C recovery pairs with a reset to rest positions.
            init_xyzs_wp = from_torch_safe(
                init_pos.clone().detach().contiguous(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )
            mpm_solver.restart_and_compute_F_C(
                mpm_model, mpm_state, init_xyzs_wp, device=device
            )
        # Scalar E/nu are broadcast by the solver; gradients are then
        # averaged back over particles in backward (aggregating_* flags).
        if E.ndim == 0:
            E_inp = E.item()  # float
            ctx.aggregating_E = True
        else:
            E_inp = from_torch_safe(E, dtype=wp.float32, requires_grad=requires_grad)
            ctx.aggregating_E = False
        if nu.ndim == 0:
            nu_inp = nu.item()  # float
            ctx.aggregating_nu = True
        else:
            nu_inp = from_torch_safe(nu, dtype=wp.float32, requires_grad=requires_grad)
            ctx.aggregating_nu = False
        mpm_solver.set_E_nu(mpm_model, E_inp, nu_inp, device=device)
        mpm_state.reset_density(
            tensor_density=particle_density,
            selection_mask=density_change_mask,
            device=device,
            requires_grad=requires_grad,
        )
        prev_state = mpm_state
        # Optional grad-free prefix: truncates backprop through early substeps.
        if extra_no_grad_steps > 0:
            with torch.no_grad():
                wp.launch(
                    kernel=get_float_array_product,
                    dim=num_particles,
                    inputs=[
                        mpm_state.particle_density,
                        mpm_state.particle_vol,
                        mpm_state.particle_mass,
                    ],
                    device=device,
                )
                mpm_solver.prepare_mu_lam(mpm_model, mpm_state, device=device)
                for i in range(extra_no_grad_steps):
                    next_state = prev_state.partial_clone(requires_grad=requires_grad)
                    mpm_solver.p2g2p_differentiable(
                        mpm_model, prev_state, next_state, substep_size, device=device
                    )
                    prev_state = next_state
        else:
            prev_state = mpm_state
        wp_tape = MyTape()
        cond_tape: CondTape = CondTape(wp_tape, requires_grad)
        next_state_list = []
        # Tape-recorded section: mass product, mu/lam, then the substeps.
        with cond_tape:
            wp.launch(
                kernel=get_float_array_product,
                dim=num_particles,
                inputs=[
                    prev_state.particle_density,
                    prev_state.particle_vol,
                    prev_state.particle_mass,
                ],
                device=device,
            )
            mpm_solver.prepare_mu_lam(mpm_model, prev_state, device=device)
            for substep_local in range(num_substeps):
                next_state = prev_state.partial_clone(requires_grad=requires_grad)
                mpm_solver.p2g2p_differentiable(
                    mpm_model, prev_state, next_state, substep_size, device=device
                )
                # next_state = mpm_solver.p2g2p_differentiable(mpm_model, prev_state, substep_size, device=device)
                next_state_list.append(next_state)
                prev_state = next_state
        ctx.mpm_solver = mpm_solver
        ctx.mpm_state = mpm_state
        ctx.mpm_model = mpm_model
        ctx.tape = cond_tape.tape
        ctx.device = device
        ctx.num_particles = num_particles
        ctx.next_state_list = next_state_list
        ctx.save_for_backward(density_change_mask)
        last_state = next_state_list[-1]
        particle_pos = wp.to_torch(last_state.particle_x).detach().clone()
        return particle_pos

    @staticmethod
    def backward(ctx, out_pos_grad: Float[Tensor, "n 3"]):
        """Backpropagate through the recorded tape.

        The incoming gradient is injected by a surrogate quadratic position
        loss whose gradient w.r.t. the final positions equals
        ``out_pos_grad``; ``tape.backward`` then propagates to the inputs.
        """
        num_particles = ctx.num_particles
        tape, device = ctx.tape, ctx.device
        mpm_solver, mpm_state, mpm_model = ctx.mpm_solver, ctx.mpm_state, ctx.mpm_model
        last_state = ctx.next_state_list[-1]
        density_change_mask = ctx.saved_tensors[0]
        grad_pos_wp = from_torch_safe(out_pos_grad, dtype=wp.vec3, requires_grad=False)
        target_pos_detach = wp.clone(
            last_state.particle_x, device=device, requires_grad=False
        )
        with tape:
            loss_wp = torch.zeros(1, device=device)
            loss_wp = wp.from_torch(loss_wp, requires_grad=True)
            wp.launch(
                compute_posloss_with_grad,
                dim=num_particles,
                inputs=[
                    last_state,
                    target_pos_detach,
                    grad_pos_wp,
                    0.5,
                    loss_wp,
                ],
                device=device,
            )
        tape.backward(loss_wp)
        # Positions are fed in detached; no gradient is returned for init_pos.
        pos_grad = None
        if mpm_state.particle_v.grad is None:
            velo_grad = None
        else:
            velo_grad = wp.to_torch(mpm_state.particle_v.grad).detach().clone()
        # print("debug back", velo_grad)
        # grad for E, nu. TODO: add spatially varying E, nu later
        # Scalar inputs: average per-particle gradients back to one value.
        if ctx.aggregating_E:
            E_grad = wp.from_torch(torch.zeros(1, device=device), requires_grad=False)
            wp.launch(
                aggregate_grad,
                dim=num_particles,
                inputs=[
                    E_grad,
                    mpm_model.E.grad,
                ],
                device=device,
            )
            E_grad = wp.to_torch(E_grad)[0] / num_particles
        else:
            E_grad = wp.to_torch(mpm_model.E.grad).detach().clone()
        if ctx.aggregating_nu:
            nu_grad = wp.from_torch(torch.zeros(1, device=device), requires_grad=False)
            wp.launch(
                aggregate_grad,
                dim=num_particles,
                inputs=[nu_grad, mpm_model.nu.grad],
                device=device,
            )
            nu_grad = wp.to_torch(nu_grad)[0] / num_particles
        else:
            nu_grad = wp.to_torch(mpm_model.nu.grad).detach().clone()
        # grad for density: only the masked (changeable) entries are returned.
        if mpm_state.particle_density.grad is None:
            density_grad = None
        else:
            density_grad = wp.to_torch(mpm_state.particle_density.grad).detach()
            density_grad = density_grad[density_change_mask.type(torch.bool)]
        density_mask_grad = None
        static_pos_grad = None
        # from IPython import embed; embed()
        tape.zero()
        # print(density_grad.abs().sum(), velo_grad.abs().sum(), E_grad.abs().item(), nu_grad.abs().item(), "in sim func")
        # One entry per forward input (16): solver/state/model/substep/
        # substep_size/num_substeps get None; trailing Nones cover device,
        # requires_grad, extra_no_grad_steps.
        return (
            None,
            None,
            None,
            None,
            None,
            None,
            pos_grad,
            velo_grad,
            E_grad,
            nu_grad,
            density_grad,
            density_mask_grad,
            static_pos_grad,
            None,
            None,
            None,
        )
""" @staticmethod @torch.no_grad() def forward( ctx: autograd.function.FunctionCtx, mpm_solver: MPMWARPDiff, mpm_state: MPMStateStruct, mpm_model: MPMModelStruct, substep_size: float, num_substeps: int, particle_x: Float[Tensor, "n 3"], particle_v: Float[Tensor, "n 3"], particle_F: Float[Tensor, "n 3 3"], particle_C: Float[Tensor, "n 3 3"], E: Float[Tensor, "n"] | Float[Tensor, "1"], nu: Float[Tensor, "n"] | Float[Tensor, "1"], particle_density: Optional[Float[Tensor, "n"] | Float[Tensor, "1"]]=None, query_mask: Optional[Int[Tensor, "n"]] = None, device: str="cuda:0", requires_grad: bool=True, extra_no_grad_steps: int=0, ) -> Tuple[Float[Tensor, "n 3"], Float[Tensor, "n 3"], Float[Tensor, "n 9"], Float[Tensor, "n 9"]]: """ Args: query_mask: [n] 0 or 1. 1 means the density or young's modulus, or poisson'ratio of this particle can change. """ # initialization work is done before calling forward! num_particles = particle_x.shape[0] mpm_state.continue_from_torch( particle_x, particle_v, particle_F, particle_C, device=device, requires_grad=True ) # set x, v, F, C. if E.ndim == 0: E_inp = E.item() # float ctx.aggregating_E = True else: E_inp = from_torch_safe(E, dtype=wp.float32, requires_grad=True) ctx.aggregating_E = False if nu.ndim == 0: nu_inp = nu.item() # float ctx.aggregating_nu = True else: nu_inp = from_torch_safe(nu, dtype=wp.float32, requires_grad=True) ctx.aggregating_nu = False mpm_solver.set_E_nu(mpm_model, E_inp, nu_inp, device=device) mpm_solver.prepare_mu_lam(mpm_model, mpm_state, device=device) mpm_state.reset_density( tensor_density=particle_density, selection_mask=query_mask, device=device, requires_grad=True, update_mass=True) prev_state = mpm_state if extra_no_grad_steps > 0: with torch.no_grad(): for i in range(extra_no_grad_steps): next_state = prev_state.partial_clone(requires_grad=True) mpm_solver.p2g2p_differentiable(mpm_model, prev_state, next_state, substep_size, device=device) prev_state = next_state # following steps will be checkpointed. 
then replayed in backward ctx.prev_state = prev_state for substep_local in range(num_substeps): next_state = prev_state.partial_clone(requires_grad=True) mpm_solver.p2g2p_differentiable(mpm_model, prev_state, next_state, substep_size, device=device) prev_state = next_state ctx.mpm_solver = mpm_solver ctx.mpm_state = mpm_state # state at the begining of this function; TODO: drop it? ctx.mpm_model = mpm_model ctx.device = device ctx.num_particles = num_particles ctx.num_substeps = num_substeps ctx.substep_size = substep_size ctx.save_for_backward(E, nu, particle_density, query_mask) last_state = next_state particle_pos = wp.to_torch(last_state.particle_x).detach().clone() particle_velo = wp.to_torch(last_state.particle_v).detach().clone() particle_F = wp.to_torch(last_state.particle_F_trial).detach().clone() particle_C = wp.to_torch(last_state.particle_C).detach().clone() return particle_pos, particle_velo, particle_F, particle_C @staticmethod def backward(ctx, out_pos_grad: Float[Tensor, "n 3"], out_velo_grad: Float[Tensor, "n 3"], out_F_grad: Float[Tensor, "n 9"], out_C_grad: Float[Tensor, "n 9"]): num_particles = ctx.num_particles device = ctx.device mpm_solver, mpm_model = ctx.mpm_solver, ctx.mpm_model prev_state = ctx.prev_state starting_state = ctx.prev_state E, nu, particle_density, query_mask = ctx.saved_tensors num_substeps, substep_size = ctx.num_substeps, ctx.substep_size # rolling back # setting initial param first: if E.ndim == 0: E_inp = E.item() # float ctx.aggregating_E = True else: E_inp = from_torch_safe(E, dtype=wp.float32, requires_grad=True) ctx.aggregating_E = False if nu.ndim == 0: nu_inp = nu.item() # float ctx.aggregating_nu = True else: nu_inp = from_torch_safe(nu, dtype=wp.float32, requires_grad=True) ctx.aggregating_nu = False mpm_solver.set_E_nu(mpm_model, E_inp, nu_inp, device=device) starting_state.reset_density( tensor_density=particle_density, selection_mask=query_mask, device=device, requires_grad=True) next_state_list = [] with 
wp.ScopedDevice(device): tape = MyTape() # handle it later grad_pos_wp = from_torch_safe(out_pos_grad, dtype=wp.vec3, requires_grad=False) if out_velo_grad is not None: grad_velo_wp = from_torch_safe(out_velo_grad, dtype=wp.vec3, requires_grad=False) else: grad_velo_wp = None if out_F_grad is not None: grad_F_wp = from_torch_safe(out_F_grad, dtype=wp.mat33, requires_grad=False) else: grad_F_wp = None if out_C_grad is not None: grad_C_wp = from_torch_safe(out_C_grad, dtype=wp.mat33, requires_grad=False) else: grad_C_wp = None with tape: wp.launch( kernel=get_float_array_product, dim=num_particles, inputs=[ prev_state.particle_density, prev_state.particle_vol, prev_state.particle_mass, ], device=device, ) mpm_solver.prepare_mu_lam(mpm_model, prev_state, device=device) for substep_local in range(num_substeps): next_state = prev_state.partial_clone(requires_grad=True) mpm_solver.p2g2p_differentiable(mpm_model, prev_state, next_state, substep_size, device=device) # next_state = mpm_solver.p2g2p_differentiable(mpm_model, prev_state, substep_size, device=device) next_state_list.append(next_state) prev_state = next_state # simulation done. 
Compute loss: loss_wp = torch.zeros(1, device=device) loss_wp = wp.from_torch(loss_wp, requires_grad=True) target_pos_detach = wp.clone(next_state.particle_x, device=device, requires_grad=False) wp.launch( compute_posloss_with_grad, dim=num_particles, inputs=[ next_state, target_pos_detach, grad_pos_wp, 0.5, loss_wp, ], device=device, ) if grad_velo_wp is not None: target_velo_detach = wp.clone(next_state.particle_v, device=device, requires_grad=False) wp.launch( compute_veloloss_with_grad, dim=num_particles, inputs=[ next_state, target_velo_detach, grad_velo_wp, 0.5, loss_wp, ], device=device, ) if grad_F_wp is not None: target_F_detach = wp.clone(next_state.particle_F_trial, device=device, requires_grad=False) wp.launch( compute_Floss_with_grad, dim=num_particles, inputs=[ next_state, target_F_detach, grad_F_wp, 0.5, loss_wp, ], device=device, ) if grad_C_wp is not None: target_C_detach = wp.clone(next_state.particle_C, device=device, requires_grad=False) wp.launch( compute_Closs_with_grad, dim=num_particles, inputs=[ next_state, target_C_detach, grad_C_wp, 0.5, loss_wp, ], device=device,) # wp.synchronize_device(device) tape.backward(loss_wp) # from IPython import embed; embed() pos_grad = wp.to_torch(starting_state.particle_x.grad).detach().clone() velo_grad = wp.to_torch(starting_state.particle_v.grad).detach().clone() F_grad = wp.to_torch(starting_state.particle_F_trial.grad).detach().clone() C_grad = wp.to_torch(starting_state.particle_C.grad).detach().clone() # print("debug back", velo_grad) # grad for E, nu. 
TODO: add spatially varying E, nu later if ctx.aggregating_E: E_grad = wp.from_torch(torch.zeros(1, device=device), requires_grad=False) wp.launch( aggregate_grad, dim=num_particles, inputs=[ E_grad, mpm_model.E.grad, ], device=device, ) E_grad = wp.to_torch(E_grad)[0] / num_particles else: E_grad = wp.to_torch(mpm_model.E.grad).detach().clone() if ctx.aggregating_nu: nu_grad = wp.from_torch(torch.zeros(1, device=device), requires_grad=False) wp.launch( aggregate_grad, dim=num_particles, inputs=[nu_grad, mpm_model.nu.grad], device=device, ) nu_grad = wp.to_torch(nu_grad)[0] / num_particles else: nu_grad = wp.to_torch(mpm_model.nu.grad).detach().clone() # grad for density if starting_state.particle_density.grad is None: density_grad = None else: density_grad = wp.to_torch(starting_state.particle_density.grad).detach() density_mask_grad = None static_pos_grad = None tape.zero() # print(density_grad.abs().sum(), velo_grad.abs().sum(), E_grad.abs().item(), nu_grad.abs().item(), "in sim func") # from IPython import embed; embed() return (None, None, None, None, None, pos_grad, velo_grad, F_grad, C_grad, E_grad, nu_grad, density_grad, density_mask_grad, None, None, None) class MPMDifferentiableSimulationClean(autograd.Function): """ Current version does not support grad for density. Please set vol, mass before calling this function. 
""" @staticmethod @torch.no_grad() def forward( ctx: autograd.function.FunctionCtx, mpm_solver: MPMWARPDiff, mpm_state: MPMStateStruct, mpm_model: MPMModelStruct, substep_size: float, num_substeps: int, particle_x: Float[Tensor, "n 3"], particle_v: Float[Tensor, "n 3"], particle_F: Float[Tensor, "n 3 3"], particle_C: Float[Tensor, "n 3 3"], E: Float[Tensor, "n"] | Float[Tensor, "1"], nu: Float[Tensor, "n"] | Float[Tensor, "1"], particle_density: Optional[Float[Tensor, "n"] | Float[Tensor, "1"]]=None, query_mask: Optional[Int[Tensor, "n"]] = None, device: str="cuda:0", requires_grad: bool=True, extra_no_grad_steps: int=0, ) -> Tuple[Float[Tensor, "n 3"], Float[Tensor, "n 3"], Float[Tensor, "n 9"], Float[Tensor, "n 9"], Float[Tensor, "n 6"]]: """ Args: query_mask: [n] 0 or 1. 1 means the density or young's modulus, or poisson'ratio of this particle can change. """ # initialization work is done before calling forward! num_particles = particle_x.shape[0] mpm_state.continue_from_torch( particle_x, particle_v, particle_F, particle_C, device=device, requires_grad=True ) # set x, v, F, C. 
if E.ndim == 0: E_inp = E.item() # float ctx.aggregating_E = True else: E_inp = from_torch_safe(E, dtype=wp.float32, requires_grad=True) ctx.aggregating_E = False if nu.ndim == 0: nu_inp = nu.item() # float ctx.aggregating_nu = True else: nu_inp = from_torch_safe(nu, dtype=wp.float32, requires_grad=True) ctx.aggregating_nu = False mpm_solver.set_E_nu(mpm_model, E_inp, nu_inp, device=device) mpm_solver.prepare_mu_lam(mpm_model, mpm_state, device=device) mpm_state.reset_density( tensor_density=particle_density, selection_mask=query_mask, device=device, requires_grad=True, update_mass=True) prev_state = mpm_state if extra_no_grad_steps > 0: with torch.no_grad(): for i in range(extra_no_grad_steps): next_state = prev_state.partial_clone(requires_grad=True) mpm_solver.p2g2p_differentiable(mpm_model, prev_state, next_state, substep_size, device=device) prev_state = next_state # following steps will be checkpointed. then replayed in backward ctx.prev_state = prev_state wp_tape = MyTape() cond_tape: CondTape = CondTape(wp_tape, requires_grad) next_state_list = [] with cond_tape: wp.launch( kernel=get_float_array_product, dim=num_particles, inputs=[ prev_state.particle_density, prev_state.particle_vol, prev_state.particle_mass, ], device=device, ) mpm_solver.prepare_mu_lam(mpm_model, prev_state, device=device) for substep_local in range(num_substeps): next_state = prev_state.partial_clone(requires_grad=True) mpm_solver.p2g2p_differentiable(mpm_model, prev_state, next_state, substep_size, device=device) next_state_list.append(next_state) prev_state = next_state ctx.mpm_solver = mpm_solver ctx.mpm_model = mpm_model ctx.next_state_list = next_state_list ctx.device = device ctx.num_particles = num_particles ctx.tape = cond_tape.tape ctx.save_for_backward(query_mask) last_state = next_state particle_pos = wp.to_torch(last_state.particle_x).detach().clone() particle_velo = wp.to_torch(last_state.particle_v).detach().clone() particle_F = 
wp.to_torch(last_state.particle_F_trial).detach().clone() particle_C = wp.to_torch(last_state.particle_C).detach().clone() # [N * 6, ] particle_cov = wp.to_torch(last_state.particle_cov).detach().clone() particle_cov = particle_cov.view(-1, 6) return particle_pos, particle_velo, particle_F, particle_C, particle_cov @staticmethod def backward(ctx, out_pos_grad: Float[Tensor, "n 3"], out_velo_grad: Float[Tensor, "n 3"], out_F_grad: Float[Tensor, "n 9"], out_C_grad: Float[Tensor, "n 9"], out_cov_grad: Float[Tensor, "n 6"]): num_particles = ctx.num_particles device = ctx.device mpm_solver, mpm_model = ctx.mpm_solver, ctx.mpm_model tape = ctx.tape starting_state = ctx.prev_state next_state_list = ctx.next_state_list next_state = next_state_list[-1] query_mask = ctx.saved_tensors with wp.ScopedDevice(device): grad_pos_wp = from_torch_safe(out_pos_grad, dtype=wp.vec3, requires_grad=False) with tape: loss_wp = torch.zeros(1, device=device) loss_wp = wp.from_torch(loss_wp, requires_grad=True) target_pos_detach = wp.clone(next_state.particle_x, device=device, requires_grad=False) wp.launch( compute_posloss_with_grad, dim=num_particles, inputs=[ next_state, target_pos_detach, grad_pos_wp, 0.5, loss_wp, ], device=device, ) # wp.synchronize_device(device) tape.backward(loss_wp) # from IPython import embed; embed() pos_grad = wp.to_torch(starting_state.particle_x.grad).detach().clone() velo_grad = wp.to_torch(starting_state.particle_v.grad).detach().clone() F_grad = wp.to_torch(starting_state.particle_F_trial.grad).detach().clone() C_grad = wp.to_torch(starting_state.particle_C.grad).detach().clone() # print("debug back", velo_grad) # grad for E, nu. 
TODO: add spatially varying E, nu later if ctx.aggregating_E: E_grad = wp.from_torch(torch.zeros(1, device=device), requires_grad=False) wp.launch( aggregate_grad, dim=num_particles, inputs=[ E_grad, mpm_model.E.grad, ], device=device, ) E_grad = wp.to_torch(E_grad)[0] / num_particles else: E_grad = wp.to_torch(mpm_model.E.grad).detach().clone() if ctx.aggregating_nu: nu_grad = wp.from_torch(torch.zeros(1, device=device), requires_grad=False) wp.launch( aggregate_grad, dim=num_particles, inputs=[nu_grad, mpm_model.nu.grad], device=device, ) nu_grad = wp.to_torch(nu_grad)[0] / num_particles else: nu_grad = wp.to_torch(mpm_model.nu.grad).detach().clone() # grad for density if starting_state.particle_density.grad is None: density_grad = None else: density_grad = wp.to_torch(starting_state.particle_density.grad).detach() density_mask_grad = None tape.zero() # print(density_grad.abs().sum(), velo_grad.abs().sum(), E_grad.abs().item(), nu_grad.abs().item(), "in sim func") # from IPython import embed; embed() return (None, None, None, None, None, pos_grad, velo_grad, F_grad, C_grad, E_grad, nu_grad, density_grad, density_mask_grad, None, None, None) ================================================ FILE: projects/uncleaned_train/exp_motion/train/local_utils.py ================================================ import os import torch from jaxtyping import Float, Int, Shaped from torch import Tensor from time import time from omegaconf import OmegaConf from motionrep.fields.se3_field import TemporalKplanesSE3fields from motionrep.fields.triplane_field import TriplaneFields, TriplaneFieldsWithEntropy from motionrep.utils.svd_helpper import load_model_from_config from motionrep.gaussian_3d.gaussian_renderer.render import ( render_gaussian, render_arrow_in_screen, ) from motionrep.gaussian_3d.gaussian_renderer.flow_depth_render import ( render_flow_depth_w_gaussian, ) import cv2 import numpy as np from sklearn.cluster import KMeans from time import time from 
motionrep.gaussian_3d.utils.rigid_body_utils import (
    get_rigid_transform,
    matrix_to_quaternion,
    quaternion_multiply,
)


def cycle(dl: torch.utils.data.DataLoader):
    # Infinite generator over a DataLoader: restarts the loader when exhausted.
    while True:
        for data in dl:
            yield data


def load_motion_model(model, checkpoint_path):
    # Load `model.pt` from a checkpoint directory into `model` (in place).
    model_path = os.path.join(checkpoint_path, "model.pt")
    model.load_state_dict(torch.load(model_path))
    print("load model from: ", model_path)
    return model


def create_spatial_fields(
    args, output_dim, aabb: Float[Tensor, "2 3"], add_entropy=True
):
    # Build a triplane spatial field at args.sim_res resolution; the entropy
    # variant is used when args.entropy_cls > 0 and add_entropy is set.
    sp_res = args.sim_res
    resolutions = [sp_res, sp_res, sp_res]
    reduce = "sum"
    if args.entropy_cls > 0 and add_entropy:
        model = TriplaneFieldsWithEntropy(
            aabb,
            resolutions,
            feat_dim=32,
            init_a=0.1,
            init_b=0.5,
            reduce=reduce,
            num_decoder_layers=2,
            decoder_hidden_size=32,
            output_dim=output_dim,
            zero_init=args.zero_init,
            num_cls=args.entropy_cls,
        )
    else:
        model = TriplaneFields(
            aabb,
            resolutions,
            feat_dim=32,
            init_a=0.1,
            init_b=0.5,
            reduce=reduce,
            num_decoder_layers=2,
            decoder_hidden_size=32,
            output_dim=output_dim,
            zero_init=args.zero_init,
        )
    if args.zero_init:
        print("=> zero init the last layer for Spatial MLP")
    return model


def create_motion_model(
    args,
    aabb: Float[Tensor, "2 3"],
    num_frames=None,
):
    # Build the temporal K-planes SE(3) motion field (only model supported).
    assert args.model in ["se3_field"]
    sp_res = args.spatial_res
    if num_frames is None:
        num_frames = args.num_frames
    # temporal resolution is half the frame count (+1), i.e. Nyquist-style
    resolutions = [sp_res, sp_res, sp_res, (num_frames) // 2 + 1]
    # resolutions = [64, 64, 64, num_frames // 2 + 1]
    reduce = "sum"
    model = TemporalKplanesSE3fields(
        aabb,
        resolutions,
        feat_dim=args.feat_dim,
        init_a=0.1,
        init_b=0.5,
        reduce=reduce,
        num_decoder_layers=args.num_decoder_layers,
        decoder_hidden_size=args.decoder_hidden_size,
        zero_init=args.zero_init,
    )
    if args.zero_init:
        print("=> zero init the last layer for MLP")
    return model


def create_velocity_model(
    args,
    aabb: Float[Tensor, "2 3"],
):
    # Build a temporal K-planes offset field used as a velocity field.
    from motionrep.fields.offset_field import TemporalKplanesOffsetfields

    sp_res = args.sim_res
    resolutions = [sp_res, sp_res, sp_res, (args.num_frames) // 2 + 1]
    reduce = "sum"
    model = TemporalKplanesOffsetfields(
        aabb,
        resolutions,
        feat_dim=32,
        init_a=0.1,
        init_b=0.5,
        reduce=reduce,
        num_decoder_layers=2,
        decoder_hidden_size=32,
        zero_init=args.zero_init,
    )
    if args.zero_init:
        print("=> zero init the last layer for velocity MLP")
    return model


def create_svd_model(model_name="svd_full", ckpt_path=None):
    # Instantiate the SVD (video diffusion) model from its yaml config,
    # optionally overriding the checkpoint path.  Returns (model, state-dict).
    state = dict()
    cfg_path_dict = {
        "svd_full": "svd_configs/svd_full_decoder.yaml",
    }
    config = cfg_path_dict[model_name]
    config = OmegaConf.load(config)
    if ckpt_path is not None:
        # overwrite config.
        config.model.params.ckpt_path = ckpt_path
    s_time = time()
    # model will automatically load when create
    model, msg = load_model_from_config(config, None)
    state["config"] = config
    print(f"Loading svd model takes {time() - s_time} seconds")
    return model, state


class LinearStepAnneal(object):
    # Linearly interpolates each entry of start_state -> end_state over the
    # iterations, after a warmup and excluding a trailing plateau.
    # NOTE(review): mutable list default arguments are shared across instances;
    # they are only read here, but worth confirming no caller mutates them.
    # def __init__(self, total_iters, start_state=[0.02, 0.98], end_state=[0.50, 0.98]):
    def __init__(
        self,
        total_iters,
        start_state=[0.02, 0.98],
        end_state=[0.02, 0.98],
        plateau_iters=-1,
        warmup_step=300,
    ):
        self.total_iters = total_iters
        if plateau_iters < 0:
            # default plateau: last 20% of the schedule
            plateau_iters = int(total_iters * 0.2)
        if warmup_step <= 0:
            warmup_step = 0
        # overwrite with the effective annealing length (min 10 iters)
        self.total_iters = max(total_iters - plateau_iters - warmup_step, 10)
        self.start_state = start_state
        self.end_state = end_state
        self.warmup_step = warmup_step

    def compute_state(self, cur_iter):
        # Returns the interpolated state list for the given iteration.
        if self.warmup_step > 0:
            cur_iter = max(0, cur_iter - self.warmup_step)
        if cur_iter >= self.total_iters:
            return self.end_state
        ret = []
        for s, e in zip(self.start_state, self.end_state):
            ret.append(s + (e - s) * cur_iter / self.total_iters)
        return ret


def setup_boundary_condition(
    xyzs_over_time: torch.Tensor, mpm_solver, mpm_state, num_filled=0
):
    # Freeze the 10% slowest particles (by first-frame displacement magnitude)
    # by pinning their velocity to zero in the solver.  Returns the int mask.
    init_velocity = xyzs_over_time[1] - xyzs_over_time[0]
    init_velocity_mag = torch.norm(init_velocity, dim=-1)
    # 10% of the velocity
    velocity_thres = torch.quantile(init_velocity_mag, 0.1, dim=0)
    # [n_particles]. 1 for freeze, 0 for moving
    freeze_mask = init_velocity_mag < velocity_thres
    freeze_mask = freeze_mask.type(torch.int)
    if num_filled > 0:
        # appended filler particles are never frozen
        freeze_mask = torch.cat(
            [freeze_mask, freeze_mask.new_zeros(num_filled).type(torch.int)], dim=0
        )
    num_freeze_pts = freeze_mask.sum()
    print("num freeze pts from static points", num_freeze_pts.item())
    free_velocity = torch.zeros_like(init_velocity[0])  # [3] in device
    mpm_solver.enforce_particle_velocity_by_mask(
        mpm_state, freeze_mask, free_velocity, start_time=0, end_time=100000
    )
    return freeze_mask


def setup_plannar_boundary_condition(
    xyzs_over_time: torch.Tensor,
    mpm_solver,
    mpm_state,
    gaussian_xyz,
    plane_mean,
    plane_normal,
    thres=0.2,
):
    """
    Freeze all particles within `thres` distance of a plane.

    plane_mean and plane_normal are in original coordinate, not being normalized
    Args:
        xyzs_over_time: [T, N, 3]
        gaussian_xyz: [N, 3] torch.Tensor
        plane_mean: [3]
        plane_normal: [3]
        thres: float
    """
    plane_normal = plane_normal / torch.norm(plane_normal)
    # [n_particles] unsigned point-to-plane distance
    plane_dist = torch.abs(
        torch.sum(
            (gaussian_xyz - plane_mean.unsqueeze(0)) * plane_normal.unsqueeze(0), dim=-1
        )
    )
    # [n_particles]
    freeze_mask = plane_dist < thres
    freeze_mask = freeze_mask.type(torch.int)
    num_freeze_pts = freeze_mask.sum()
    print("num freeze pts from plannar boundary", num_freeze_pts.item())
    free_velocity = xyzs_over_time.new_zeros(3)
    # print("free velocity", free_velocity.shape, freeze_mask.shape)
    mpm_solver.enforce_particle_velocity_by_mask(
        mpm_state, freeze_mask, free_velocity, start_time=0, end_time=100000
    )
    return freeze_mask


def find_far_points(xyzs, selected_points, thres=0.05):
    """
    Args:
        xyzs: [N, 3]
        selected_points: [M, 3]
    Outs:
        freeze_mask: [N], 1 for points that are far away, 0 for points that are close
            dtype=torch.int
    """
    # chunked cdist to bound peak memory on large point sets
    chunk_size = 10000
    freeze_mask_list = []
    for i in range(0, xyzs.shape[0], chunk_size):
        end_index = min(i + chunk_size, xyzs.shape[0])
        xyzs_chunk = xyzs[i:end_index]
        # [M, N]
        cdist = torch.cdist(xyzs_chunk, selected_points)
        min_dist, _ = torch.min(cdist, dim=-1)
        freeze_mask = min_dist > thres
        freeze_mask = freeze_mask.type(torch.int)
        freeze_mask_list.append(freeze_mask)
    freeze_mask = torch.cat(freeze_mask_list, dim=0)
    # 1 for points that are far away, 0 for points that are close
    return freeze_mask


def setup_boundary_condition_with_points(
    xyzs, selected_points, mpm_solver, mpm_state, thres=0.05
):
    """
    Freeze every particle farther than `thres` from all selected points.

    Args:
        xyzs: [N, 3]
        selected_points: [M, 3]
    """
    freeze_mask = find_far_points(xyzs, selected_points, thres=thres)
    num_freeze_pts = freeze_mask.sum()
    print("num freeze pts from static points", num_freeze_pts.item())
    free_velocity = torch.zeros_like(xyzs[0])  # [3] in device
    mpm_solver.enforce_particle_velocity_by_mask(
        mpm_state, freeze_mask, free_velocity, start_time=0, end_time=1000000
    )
    return freeze_mask


def setup_bottom_boundary_condition(xyzs, mpm_solver, mpm_state, percentile=0.05):
    """
    Freeze the bottom `percentile` slab of particles along z.

    Args:
        xyzs: [N, 3]
        selected_points: [M, 3]
    """
    max_z, min_z = torch.max(xyzs[:, 2]), torch.min(xyzs[:, 2])
    thres = min_z + (max_z - min_z) * percentile
    freeze_mask = xyzs[:, 2] < thres
    freeze_mask = freeze_mask.type(torch.int)
    num_freeze_pts = freeze_mask.sum()
    print("num freeze pts from bottom points", num_freeze_pts.item())
    free_velocity = torch.zeros_like(xyzs[0])  # [3] in device
    mpm_solver.enforce_particle_velocity_by_mask(
        mpm_state, freeze_mask, free_velocity, start_time=0, end_time=1000000
    )
    return freeze_mask


def render_single_view_video(
    cam,
    render_params,
    motion_model,
    time_stamps,
    rand_bg=False,
    render_flow=False,
    query_mask=None,
):
    """
    Args:
        cam:
        motion_model: Callable function, f(x, t) => translation, rotation
        time_stamps: [T]
        query_mask: Tensor of [N], 0 for freeze points, 1 for moving points
    Outs:
        ret_video: [T, 3, H, W] value in [0, 1]
    """
    if rand_bg:
        bg_color = torch.rand(3, device="cuda")
    else:
        bg_color = render_params.bg_color
    ret_img_list = []
    for time_stamp in time_stamps:
        if not render_flow:
            new_gaussians = render_params.gaussians.apply_se3_fields(
                motion_model, time_stamp
            )
            if query_mask is not None:
                # blend: masked-out (frozen) gaussians keep their original pose
                new_gaussians._xyz = new_gaussians._xyz * query_mask.unsqueeze(
                    -1
                ) + render_params.gaussians._xyz * (1 - query_mask.unsqueeze(-1))
                new_gaussians._rotation = (
                    new_gaussians._rotation * query_mask.unsqueeze(-1)
                    + render_params.gaussians._rotation * (1 - query_mask.unsqueeze(-1))
                )
            # [3, H, W]
            img = render_gaussian(
                cam,
                new_gaussians,
                render_params.render_pipe,
                bg_color,
            )[
                "render"
            ]  # value in [0, 1]
        else:
            inp_time = (
                torch.ones_like(render_params.gaussians._xyz[:, 0:1]) * time_stamp
            )
            inp = torch.cat([render_params.gaussians._xyz, inp_time], dim=-1)
            # [bs, 3, 3]. [bs, 3]
            R, point_disp = motion_model(inp)
            img = render_flow_depth_w_gaussian(
                cam,
                render_params.gaussians,
                render_params.render_pipe,
                point_disp,
                bg_color,
            )["render"]
        ret_img_list.append(img[None, ...])
    ret_video = torch.cat(ret_img_list, dim=0)
    # [T, 3, H, W]
    return ret_video


def render_gaussian_seq(cam, render_params, gaussian_pos_list, gaussian_cov_list):
    # Render one frame per position set from a fixed camera; gaussians._xyz is
    # temporarily overwritten and restored afterwards.
    ret_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    for i in range(len(gaussian_pos_list)):
        xyz = gaussian_pos_list[i]
        gaussians._xyz = xyz
        # TODO, how to deal with cov
        img = render_gaussian(
            cam,
            gaussians,
            render_params.render_pipe,
            background,
        )["render"]
        ret_img_list.append(img[None, ...])
    gaussians._xyz = old_xyz  # set back
    # [T, C, H, W], in [0, 1]
    rendered_video = torch.cat(ret_img_list, dim=0)
    return rendered_video


def render_gaussian_seq_w_mask(
    cam, render_params, gaussian_pos_list, gaussian_cov_list, update_mask
):
    # Like render_gaussian_seq, but only the `update_mask` subset of gaussians
    # moves; optionally also substitutes their covariance per frame.
    ret_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    old_cov = gaussians.get_covariance().clone()
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    for i in range(len(gaussian_pos_list)):
        xyz = gaussian_pos_list[i]
        gaussians._xyz[update_mask, ...] = xyz
        if gaussian_cov_list is not None:
            cov = gaussian_cov_list[i]
            old_cov[update_mask, ...] = cov
            cov3D_precomp = old_cov
        else:
            cov3D_precomp = None
        img = render_gaussian(
            cam,
            gaussians,
            render_params.render_pipe,
            background,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        ret_img_list.append(img[None, ...])
    gaussians._xyz = old_xyz  # set back
    # [T, C, H, W], in [0, 1]
    rendered_video = torch.cat(ret_img_list, dim=0)
    return rendered_video


def render_gaussian_seq_w_mask_with_disp(
    cam, render_params, orign_points, top_k_index, disp_list, update_mask
):
    """
    Render a sequence where masked gaussians are driven by per-frame
    displacements of a sparse driver point set (rigid interpolation).

    Args:
        cam: Camera or list of Camera
        orign_points: [m, 3]
        disp_list: List[m, 3]
        top_k_index: [n, top_k]
    """
    ret_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    old_rotation = gaussians._rotation.clone()
    query_pts = old_xyz[update_mask, ...]
    query_rotation = old_rotation[update_mask, ...]
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    for i in range(len(disp_list)):
        if isinstance(cam, list):
            render_cam = cam[i]
        else:
            render_cam = cam
        disp = disp_list[i]
        new_xyz, new_rotation = interpolate_points_w_R(
            query_pts, query_rotation, orign_points, disp, top_k_index
        )
        gaussians._xyz[update_mask, ...] = new_xyz
        gaussians._rotation[update_mask, ...] = new_rotation
        cov3D_precomp = None
        img = render_gaussian(
            render_cam,
            gaussians,
            render_params.render_pipe,
            background,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        ret_img_list.append(img[None, ...])
    gaussians._xyz = old_xyz  # set back
    gaussians._rotation = old_rotation
    # [T, C, H, W], in [0, 1]
    rendered_video = torch.cat(ret_img_list, dim=0)
    return rendered_video


def render_gaussian_seq_w_mask_with_disp_for_figure(
    cam, render_params, orign_points, top_k_index, disp_list, update_mask
):
    """
    Same as render_gaussian_seq_w_mask_with_disp, but additionally renders the
    moving subset alone on a black background (for paper figures).

    Args:
        cam: Camera or list of Camera
        orign_points: [m, 3]
        disp_list: List[m, 3]
        top_k_index: [n, top_k]
    """
    ret_img_list = []
    moving_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    old_rotation = gaussians._rotation.clone()
    query_pts = old_xyz[update_mask, ...]
    query_rotation = old_rotation[update_mask, ...]
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    background_black = torch.tensor([0, 0, 0], dtype=torch.float32, device="cuda")
    for i in range(len(disp_list)):
        if isinstance(cam, list):
            render_cam = cam[i]
        else:
            render_cam = cam
        disp = disp_list[i]
        new_xyz, new_rotation = interpolate_points_w_R(
            query_pts, query_rotation, orign_points, disp, top_k_index
        )
        gaussians._xyz[update_mask, ...] = new_xyz
        gaussians._rotation[update_mask, ...] = new_rotation
        cov3D_precomp = None
        img = render_gaussian(
            render_cam,
            gaussians,
            render_params.render_pipe,
            background,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        # second pass: only the moving gaussians, on black
        masked_gaussians = gaussians.apply_mask(update_mask)
        moving_img = render_gaussian(
            render_cam,
            masked_gaussians,
            render_params.render_pipe,
            background_black,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        ret_img_list.append(img[None, ...])
        moving_img_list.append(moving_img[None, ...])
    gaussians._xyz = old_xyz  # set back
    gaussians._rotation = old_rotation
    # [T, C, H, W], in [0, 1]
    rendered_video = torch.cat(ret_img_list, dim=0)
    moving_part_video = torch.cat(moving_img_list, dim=0)
    return rendered_video, moving_part_video


def render_gaussian_seq_w_mask_cam_seq(
    cam_list, render_params, gaussian_pos_list, gaussian_cov_list, update_mask
):
    # Like render_gaussian_seq_w_mask, but with one camera per frame.
    ret_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    old_cov = gaussians.get_covariance().clone()
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    for i in range(len(gaussian_pos_list)):
        xyz = gaussian_pos_list[i]
        gaussians._xyz[update_mask, ...] = xyz
        if gaussian_cov_list is not None:
            cov = gaussian_cov_list[i]
            old_cov[update_mask, ...] = cov
            cov3D_precomp = old_cov
        else:
            cov3D_precomp = None
        img = render_gaussian(
            cam_list[i],
            gaussians,
            render_params.render_pipe,
            background,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        ret_img_list.append(img[None, ...])
    gaussians._xyz = old_xyz  # set back
    # [T, C, H, W], in [0, 1]
    rendered_video = torch.cat(ret_img_list, dim=0)
    return rendered_video


def apply_grid_bc_w_freeze_pts(grid_size, grid_lim, freeze_pts, mpm_solver):
    # Mark every grid cell containing at least one frozen point and pin its
    # velocity in the solver.  Returns the boolean grid mask.
    # NOTE(review): the per-point Python loop is O(N) host-side and slow for
    # large point sets; also assumes freeze_pts lie in [0, grid_lim) — verify.
    device = freeze_pts.device
    grid_pts_cnt = torch.zeros(
        (grid_size, grid_size, grid_size), dtype=torch.int32, device=device
    )
    dx = grid_lim / grid_size
    inv_dx = 1.0 / dx
    freeze_pts = (freeze_pts * inv_dx).long()
    for x, y, z in freeze_pts:
        grid_pts_cnt[x, y, z] += 1
    freeze_grid_mask = grid_pts_cnt >= 1
    freeze_grid_mask_int = freeze_grid_mask.type(torch.int32)
    number_freeze_grid = freeze_grid_mask_int.sum().item()
    print("number of freeze grid", number_freeze_grid)
    mpm_solver.enforce_grid_velocity_by_mask(freeze_grid_mask_int)
    # add debug section:
    return freeze_grid_mask


def add_constant_force(
    mpm_sovler,
    mpm_state,
    xyzs,
    center_point,
    radius,
    force,
    dt,
    start_time,
    end_time,
    device,
):
    """
    Apply an impulse to all particles within `radius` of `center_point`
    during [start_time, end_time].

    Args:
        xyzs: [N, 3]
        center_point: [3]
        radius: float
        force: [3]
    """
    # compute distance from xyzs to center_point
    # [N]
    dist = torch.norm(xyzs - center_point.unsqueeze(0), dim=-1)
    apply_force_mask = dist < radius
    apply_force_mask = apply_force_mask.type(torch.int)
    print(apply_force_mask.shape, apply_force_mask.sum().item(), "apply force mask")
    mpm_sovler.add_impulse_on_particles_with_mask(
        mpm_state,
        force,
        dt,
        apply_force_mask,
        start_time=start_time,
        end_time=end_time,
        device=device,
    )


@torch.no_grad()
def render_force_2d(cam, render_params, center_point, force):
    # Render the scene and overlay a circle + arrow showing the applied force
    # projected into screen space.  Returns a uint8 HxWx3 image.
    force_in_2d_scale = 80  # unit as pixel
    two_points = torch.stack([center_point, center_point + force], dim=0)
    gaussians = render_params.gaussians
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    # [3, H, W]
    img = render_gaussian(
        cam,
        gaussians,
        render_params.render_pipe,
        background,
    )["render"]
    img = img.detach().contiguous()
    img = img.cpu().numpy().transpose(1, 2, 0)
    img = img * 255
    img = img.astype(np.uint8).copy()
    # two_points. [2, 3]
    # arrow_2d: [2, 2]
    arrow_2d = render_arrow_in_screen(cam, two_points)
    arrow_2d = arrow_2d.cpu().numpy()
    start, vec_2d = arrow_2d[0], arrow_2d[1] - arrow_2d[0]
    vec_2d = vec_2d / np.linalg.norm(vec_2d)
    start = start  # + np.array([540.0, 288.0])  # [W, H] / 2
    # debug here.
    # 1. unit in pixel?
    # 2. use cv2 to add arrow?
    # draw cirrcle at start in img
    # img = img.transpose(2, 0, 1)
    img = cv2.circle(img, (int(start[0]), int(start[1])), 40, (255, 255, 255), 8)
    # draw arrow in img
    end = start + vec_2d * force_in_2d_scale
    end = end.astype(np.int32)
    start = start.astype(np.int32)
    img = cv2.arrowedLine(img, (start[0], start[1]), (end[0], end[1]), (0, 255, 255), 8)
    return img


def render_gaussian_seq_w_mask_cam_seq_with_force(
    cam_list,
    render_params,
    gaussian_pos_list,
    gaussian_cov_list,
    update_mask,
    pts_index,
    force,
    force_steps,
):
    # Per-frame camera render with the force arrow overlaid for the first
    # `force_steps` frames.  Returns a uint8 [T, C, H, W] numpy array.
    force_in_2d_scale = 80  # unit as pixel
    ret_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    old_cov = gaussians.get_covariance().clone()
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    for i in range(len(gaussian_pos_list)):
        xyz = gaussian_pos_list[i]
        gaussians._xyz[update_mask, ...] = xyz
        if gaussian_cov_list is not None:
            cov = gaussian_cov_list[i]
            old_cov[update_mask, ...] = cov
            cov3D_precomp = old_cov
        else:
            cov3D_precomp = None
        img = render_gaussian(
            cam_list[i],
            gaussians,
            render_params.render_pipe,
            background,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        # to [H, W, 3]
        img = img.detach().contiguous().cpu().numpy().transpose(1, 2, 0)
        img = np.clip((img * 255), 0, 255).astype(np.uint8).copy()
        if i < force_steps:
            center_point = gaussians._xyz[pts_index]
            two_points = torch.stack([center_point, center_point + force], dim=0)
            arrow_2d = render_arrow_in_screen(cam_list[i], two_points)
            arrow_2d = arrow_2d.cpu().numpy()
            start, vec_2d = arrow_2d[0], arrow_2d[1] - arrow_2d[0]
            vec_2d = vec_2d / np.linalg.norm(vec_2d)
            start = start  # + np.array([540.0, 288.0])
            img = cv2.circle(
                img, (int(start[0]), int(start[1])), 40, (255, 255, 255), 8
            )
            # draw arrow in img
            end = start + vec_2d * force_in_2d_scale
            end = end.astype(np.int32)
            start = start.astype(np.int32)
            img = cv2.arrowedLine(
                img, (start[0], start[1]), (end[0], end[1]), (0, 255, 255), 8
            )
        img = img.transpose(2, 0, 1)
        ret_img_list.append(img[None, ...])
    gaussians._xyz = old_xyz  # set back
    # [T, C, H, W], in [0, 1]
    rendered_video = np.concatenate(ret_img_list, axis=0)
    return rendered_video


def render_gaussian_seq_w_mask_cam_seq_with_force_with_disp(
    cam_list,
    render_params,
    orign_points,
    top_k_index,
    disp_list,
    update_mask,
    pts_index,
    force,
    force_steps,
):
    # Displacement-driven variant of the function above (see
    # render_gaussian_seq_w_mask_with_disp for the driving scheme).
    force_in_2d_scale = 80  # unit as pixel
    ret_img_list = []
    gaussians = render_params.gaussians
    old_xyz = gaussians._xyz.clone()
    old_rotation = gaussians._rotation.clone()
    query_pts = old_xyz[update_mask, ...]
    query_rotation = old_rotation[update_mask, ...]
    background = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
    for i in range(len(disp_list)):
        disp = disp_list[i]
        new_xyz, new_rotation = interpolate_points_w_R(
            query_pts, query_rotation, orign_points, disp, top_k_index
        )
        gaussians._xyz[update_mask, ...] = new_xyz
        gaussians._rotation[update_mask, ...] = new_rotation
        cov3D_precomp = None
        img = render_gaussian(
            cam_list[i],
            gaussians,
            render_params.render_pipe,
            background,
            cov3D_precomp=cov3D_precomp,
        )["render"]
        # to [H, W, 3]
        img = img.detach().contiguous().cpu().numpy().transpose(1, 2, 0)
        img = np.clip((img * 255), 0, 255).astype(np.uint8).copy()
        if i < force_steps:
            center_point = gaussians._xyz[pts_index]
            two_points = torch.stack([center_point, center_point + force], dim=0)
            arrow_2d = render_arrow_in_screen(cam_list[i], two_points)
            arrow_2d = arrow_2d.cpu().numpy()
            start, vec_2d = arrow_2d[0], arrow_2d[1] - arrow_2d[0]
            vec_2d = vec_2d / np.linalg.norm(vec_2d)
            start = start  # + np.array([540.0, 288.0])
            img = cv2.circle(
                img, (int(start[0]), int(start[1])), 40, (255, 255, 255), 5
            )
            # draw arrow in img
            end = start + vec_2d * force_in_2d_scale
            end = end.astype(np.int32)
            start = start.astype(np.int32)
            img = cv2.arrowedLine(
                img, (start[0], start[1]), (end[0], end[1]), (255, 255, 0), 4
            )
        img = img.transpose(2, 0, 1)
        ret_img_list.append(img[None, ...])
    gaussians._xyz = old_xyz  # set back
    gaussians._rotation = old_rotation
    # [T, C, H, W], in [0, 1]
    rendered_video = np.concatenate(ret_img_list, axis=0)
    return rendered_video


def downsample_with_kmeans(points_array: np.ndarray, num_points: int):
    """
    Downsample a point cloud to `num_points` cluster centers (CPU sklearn).

    Args:
        points_array: [N, 3]
        num_points: int
    Outs:
        downsampled_points: [num_points, 3]
    """
    print(
        "=> staring downsample with kmeans from ",
        points_array.shape[0],
        " points to ",
        num_points,
        " points",
    )
    s_time = time()
    kmeans = KMeans(n_clusters=num_points, random_state=0).fit(points_array)
    cluster_centers = kmeans.cluster_centers_
    e_time = time()
    print("=> downsample with kmeans takes ", e_time - s_time, " seconds")
    return cluster_centers


@torch.no_grad()
def downsample_with_kmeans_gpu(points_array: torch.Tensor, num_points: int):
    # GPU variant of downsample_with_kmeans using the kmeans_gpu package.
    from kmeans_gpu import KMeans

    kmeans = KMeans(
        n_clusters=num_points,
        max_iter=100,
        tolerance=1e-4,
        distance="euclidean",
        sub_sampling=None,
        max_neighbors=15,
    )
    # dummy per-point features required by the kmeans_gpu API
    features = torch.ones(1, 1, points_array.shape[0], device=points_array.device)
    points_array = points_array.unsqueeze(0)
    # Forward
    print(
        "=> staring downsample with kmeans from ",
        points_array.shape[1],
        " points to ",
        num_points,
        " points",
    )
    s_time = time()
    centroids, features = kmeans(points_array, features)
    ret_points = centroids.squeeze(0)
    e_time = time()
    print("=> downsample with kmeans takes ", e_time - s_time, " seconds")
    # [np_subsample, 3]
    return ret_points


def interpolate_points(query_points, drive_displacement, top_k_index):
    """
    Translate each query point by the mean displacement of its top-k drivers.

    Args:
        query_points: [n, 3]
        drive_displacement: [m, 3]
        top_k_index: [n, top_k] < m
    """
    top_k_disp = drive_displacement[top_k_index]
    t = top_k_disp.mean(dim=1)
    ret_points = query_points + t
    return ret_points


def interpolate_points_w_R(
    query_points, query_rotation, drive_origin_pts, drive_displacement, top_k_index
):
    """
    Translate query points by the mean driver displacement and rotate their
    orientations by the rigid rotation fitted to the top-k drivers.

    Args:
        query_points: [n, 3]
        drive_origin_pts: [m, 3]
        drive_displacement: [m, 3]
        top_k_index: [n, top_k] < m

    Or directly call:
        apply_discrete_offset_filds_with_R(self, origin_points, offsets, topk=6):
            Args:
                origin_points: (N_r, 3)
                offsets: (N_r, 3) in rendering
    """
    # [n, topk, 3]
    top_k_disp = drive_displacement[top_k_index]
    source_points = drive_origin_pts[top_k_index]
    # best-fit rigid transform from source to displaced source
    R, t = get_rigid_transform(source_points, source_points + top_k_disp)
    avg_offsets = top_k_disp.mean(dim=1)
    ret_points = query_points + avg_offsets
    new_rotation = quaternion_multiply(matrix_to_quaternion(R), query_rotation)
    return ret_points, new_rotation


def create_camera_path(
    cam,
    radius: float,
    focus_pt: np.ndarray = np.array([0, 0, 0]),
    up: np.ndarray = np.array([0, 0, 1]),
    n_frames: int = 60,
    n_rots: int = 1,
    y_scale: float = 1.0,
):
    # Build a spiral camera path around `focus_pt` starting from `cam`'s pose;
    # returns a list of n_frames Camera objects.
    R, T = cam.R, cam.T
    # R, T = R.cpu().numpy(), T.cpu().numpy()
    Rt = np.zeros((4, 4))
    Rt[:3, :3] = R.transpose()
    Rt[:3, 3] = T
    Rt[3, 3] = 1.0
    C2W = np.linalg.inv(Rt)
    # flip to OpenGL camera convention for the spiral-path generator
    C2W[:3, 1:3] *= -1
    import copy
    from motionrep.utils.camera_utils import generate_spiral_path
    from motionrep.data.cameras import Camera

    lookat_pt = focus_pt
    render_poses = generate_spiral_path(
        C2W, radius, lookat_pt, up, n_frames, n_rots, y_scale
    )
    FoVy, FoVx = cam.FoVy, cam.FoVx
    height, width = cam.image_height, cam.image_width
    ret_cam_list = []
    for i in range(n_frames):
        c2w_opengl = render_poses[i]
        c2w = copy.deepcopy(c2w_opengl)
        c2w[:3, 1:3] *= -1
        # get the world-to-camera transform and set R, T
        w2c = np.linalg.inv(c2w)
        R = np.transpose(
            w2c[:3, :3]
        )  # R is stored transposed due to 'glm' in CUDA code
        T = w2c[:3, 3]
        cam = Camera(
            R=R,
            T=T,
            FoVy=FoVy,
            FoVx=FoVx,
            img_path=None,
            img_hw=(height, width),
            timestamp=None,
            data_device="cuda",
        )
        ret_cam_list.append(cam)
    return ret_cam_list


def get_camera_trajectory(cam, num_pos, camera_cfg: dict, dataset):
    # Build a list of cameras from the config: either a spiral path or an
    # interpolation between dataset frames.
    # NOTE(review): no else branch — an unknown camera_cfg["type"] raises
    # NameError at the print below; confirm callers only pass the two types.
    if camera_cfg["type"] == "spiral":
        interpolated_cameras = create_camera_path(
            cam,
            radius=camera_cfg["radius"],
            focus_pt=camera_cfg["focus_point"],
            up=camera_cfg["up"],
            n_frames=num_pos,
        )
    elif camera_cfg["type"] == "interpolation":
        if "start_frame" in camera_cfg and "end_frame" in camera_cfg:
            interpolated_cameras = dataset.interpolate_camera(
                camera_cfg["start_frame"], camera_cfg["end_frame"], num_pos
            )
        else:
            # no end frame: hold the start viewpoint for all num_pos frames
            interpolated_cameras = dataset.interpolate_camera(
                camera_cfg["start_frame"], camera_cfg["start_frame"], num_pos
            )
    print(
        "number of simulated frames: ",
        num_pos,
        "num camera viewpoints: ",
        len(interpolated_cameras),
    )
    return interpolated_cameras


================================================
FILE: projects/uncleaned_train/exp_motion/train/model_config.py
================================================
import numpy as np

# Demo/inference configuration for the "hat" scene.
dataset_dir = "../../data/physics_dreamer/hat_nerfstudio/"
result_dir = "output/hat/results_force"
exp_name = "hat"
model_list = [
    # multiview 64 364
    "../../output/inverse_sim/fast_hat_videos2_sv64-384_init1e5decay_1.0_substep_384_se3_field_lr_0.03_tv_0.0001_iters_200_sw_6_cw_1/seed0/checkpoint_model_000019",
]
focus_point_list = [
    np.array([-0.467188, 0.067178, 0.044333]),  # botton of the background
]
camera_cfg_list = [
    {
        "type": "interpolation",
        "start_frame": "frame_00001.png",
        "end_frame": "frame_00187.png",  # or 91
    },  # real captured viewpoint
    {
        "type": "interpolation",
        "start_frame": "frame_00217.png",
    },  # other selected viewpoint
    {
        "type": "interpolation",
        "start_frame": "frame_00001.png",
    },  # other selected viewpoint
    {
        "type": "interpolation",
        "start_frame": "frame_00001.png",
    },  # other selected viewpoint
    {
        "type": "interpolation",
        "start_frame": "frame_00079.png",
    },
]
simulate_cfg = {
    "substep": 384,
    "grid_size": 64,
}
# Candidate force application points (world coordinates).
points_list = [
    np.array([-0.390069, 0.139051, -0.182607]),  # bottom of the hat
    np.array([-0.404391, 0.184975, -0.001585]),  # middle of the hat
    np.array([-0.289375, 0.034581, 0.062010]),  # left of the hat
    np.array([-0.352060, 0.105737, 0.009359]),  # center of the hat
]
force_directions = [
    np.array([1.0, 0.0, 0]),
    np.array([0.0, 1.0, 0.0]),
    np.array([1.0, 0.0, 1.0]),
    np.array([1.0, 1.0, 0.0]),
    np.array([1.0, 0.0, 1.0]),
    np.array([0.0, 1.0, 1.0]),
    np.array([1.0, 1.0, 1.0]),
]
force_directions = np.array(force_directions)
# normalize each direction to a unit vector
force_directions = force_directions / np.linalg.norm(force_directions, axis=1)[:, None]


================================================
FILE: projects/uncleaned_train/exp_motion/train/train_material.py
================================================
import argparse
import os
import numpy as np
import torch
from tqdm import tqdm
from torch import Tensor
from jaxtyping import Float, Int, Shaped
from typing import List
import point_cloud_utils as pcu
from accelerate.utils import ProjectConfiguration
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from accelerate import Accelerator, DistributedDataParallelKwargs
import numpy as np
import logging
import argparse
import shutil
import wandb
import torch
import os
from motionrep.utils.config import create_config
from motionrep.utils.optimizer import get_linear_schedule_with_warmup
from time import time
from omegaconf import OmegaConf
from PIL import Image
import imageio
import numpy as np
# from motionrep.utils.torch_utils import get_sync_time from einops import rearrange, repeat from motionrep.gaussian_3d.gaussian_renderer.feat_render import render_feat_gaussian from motionrep.gaussian_3d.scene import GaussianModel from motionrep.fields.se3_field import TemporalKplanesSE3fields from motionrep.data.datasets.multiview_dataset import MultiviewImageDataset from motionrep.data.datasets.multiview_video_dataset import ( MultiviewVideoDataset, camera_dataset_collate_fn, ) from motionrep.data.datasets.multiview_dataset import ( camera_dataset_collate_fn as camera_dataset_collate_fn_img, ) from typing import NamedTuple import torch.nn.functional as F from motionrep.utils.img_utils import compute_psnr, compute_ssim from thirdparty_code.warp_mpm.mpm_data_structure import ( MPMStateStruct, MPMModelStruct, get_float_array_product, ) from thirdparty_code.warp_mpm.mpm_solver_diff import MPMWARPDiff from thirdparty_code.warp_mpm.warp_utils import from_torch_safe from thirdparty_code.warp_mpm.gaussian_sim_utils import get_volume import warp as wp import random from local_utils import ( cycle, load_motion_model, create_motion_model, create_spatial_fields, find_far_points, LinearStepAnneal, apply_grid_bc_w_freeze_pts, render_gaussian_seq_w_mask_cam_seq, downsample_with_kmeans_gpu, render_gaussian_seq_w_mask_with_disp, ) from interface import ( MPMDifferentiableSimulationWCheckpoint, MPMDifferentiableSimulationClean, ) logger = get_logger(__name__, log_level="INFO") model_dict = { # psnr: 29.9 # "videos": "../../output/inverse_sim/fast_hat_velopretraindecay_1.0_substep_96_se3_field_lr_0.001_tv_0.01_iters_300_sw_2_cw_2/seed0/checkpoint_model_000299", # psnr: 30.25 "videos": "../../output/inverse_sim/fast_hat_velopretrain_g48-192decay_1.0_substep_192_se3_field_lr_0.003_tv_0.01_iters_300_sw_2_cw_2/seed0/checkpoint_model_000199", # psnr: 30.52 "videos_2": 
"../../output/inverse_sim/fast_hat_videos2_velopretraindecay_1.0_substep_96_se3_field_lr_0.003_tv_0.01_iters_300_sw_2_cw_2/seed0/checkpoint_model_000199", } def create_dataset(args): assert args.dataset_res in ["middle", "small", "large"] if args.dataset_res == "middle": res = [320, 576] elif args.dataset_res == "small": res = [192, 320] elif args.dataset_res == "large": res = [576, 1024] else: raise NotImplementedError video_dir_name = "videos" video_dir_name = args.video_dir_name if args.test_convergence: video_dir_name = "simulated_videos" dataset = MultiviewVideoDataset( args.dataset_dir, use_white_background=False, resolution=res, scale_x_angle=1.0, video_dir_name=video_dir_name, ) test_dataset = MultiviewImageDataset( args.dataset_dir, use_white_background=False, resolution=res, # use_index=list(range(0, 30, 4)), # use_index=[0], scale_x_angle=1.0, fitler_with_renderd=False, load_imgs=False, ) print("len of test dataset", len(test_dataset)) return dataset, test_dataset class Trainer: def __init__(self, args): self.args = args self.ssim = args.ssim args.warmup_step = int(args.warmup_step * args.gradient_accumulation_steps) args.train_iters = int(args.train_iters * args.gradient_accumulation_steps) os.environ["WANDB__SERVICE_WAIT"] = "600" args.wandb_name += ( "decay_{}_substep_{}_{}_lr_{}_tv_{}_iters_{}_sw_{}_cw_{}".format( args.loss_decay, args.substep, args.model, args.lr, args.tv_loss_weight, args.train_iters, args.start_window_size, args.compute_window, ) ) logging_dir = os.path.join(args.output_dir, args.wandb_name) accelerator_project_config = ProjectConfiguration(logging_dir=logging_dir) ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) accelerator = Accelerator( gradient_accumulation_steps=1, # args.gradient_accumulation_steps, mixed_precision="no", log_with="wandb", project_config=accelerator_project_config, kwargs_handlers=[ddp_kwargs], ) self.gradient_accumulation_steps = args.gradient_accumulation_steps logging.basicConfig( 
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state, main_process_only=False) set_seed(args.seed + accelerator.process_index) print("process index", accelerator.process_index) if accelerator.is_main_process: output_path = os.path.join(logging_dir, f"seed{args.seed}") os.makedirs(output_path, exist_ok=True) self.output_path = output_path self.rand_bg = args.rand_bg # setup the dataset dataset, test_dataset = create_dataset(args) self.test_dataset = test_dataset dataset_dir = test_dataset.data_dir self.dataset = dataset gaussian_path = os.path.join(dataset_dir, "point_cloud.ply") aabb = self.setup_eval( args, gaussian_path, white_background=True, ) self.aabb = aabb self.model = create_motion_model( args, aabb=aabb, num_frames=9, ) if args.motion_model_path is not None: self.model = load_motion_model(self.model, args.motion_model_path) self.model.eval() self.num_frames = int(args.num_frames) self.window_size_schduler = LinearStepAnneal( args.train_iters, start_state=[args.start_window_size], end_state=[13], plateau_iters=-1, warmup_step=20, ) test_dataloader = torch.utils.data.DataLoader( test_dataset, batch_size=args.batch_size, shuffle=False, drop_last=True, num_workers=0, collate_fn=camera_dataset_collate_fn_img, ) # why prepare here again? test_dataloader = accelerator.prepare(test_dataloader) self.test_dataloader = cycle(test_dataloader) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, shuffle=False, drop_last=False, num_workers=0, collate_fn=camera_dataset_collate_fn, ) # why prepare here again? 
dataloader = accelerator.prepare(dataloader) self.dataloader = cycle(dataloader) self.train_iters = args.train_iters self.accelerator = accelerator # init traiable params E_nu_list = self.init_trainable_params() for p in E_nu_list: p.requires_grad = True self.E_nu_list = E_nu_list self.model = accelerator.prepare(self.model) self.setup_simulation(dataset_dir, grid_size=args.grid_size) if args.checkpoint_path == "None": args.checkpoint_path = None if args.checkpoint_path is not None: if args.video_dir_name in model_dict: args.checkpoint_path = model_dict[args.video_dir_name] self.load(args.checkpoint_path) trainable_params = list(self.sim_fields.parameters()) + self.E_nu_list optim_list = [ {"params": self.E_nu_list, "lr": args.lr * 1e-10}, { "params": self.sim_fields.parameters(), "lr": args.lr, "weight_decay": 1e-4, }, # {"params": self.velo_fields.parameters(), "lr": args.lr * 1e-3, "weight_decay": 1e-4}, ] if args.update_velo: self.freeze_velo = False velo_optim = [ { "params": self.velo_fields.parameters(), "lr": args.lr * 1e-4, "weight_decay": 1e-4, }, ] self.velo_optimizer = torch.optim.AdamW( velo_optim, lr=args.lr, weight_decay=0.0, ) self.velo_scheduler = get_linear_schedule_with_warmup( optimizer=self.velo_optimizer, num_warmup_steps=args.warmup_step, num_training_steps=args.train_iters, ) else: self.freeze_velo = True self.velo_optimizer = None else: trainable_params = list(self.sim_fields.parameters()) + self.E_nu_list optim_list = [ {"params": self.E_nu_list, "lr": args.lr * 1e-10}, { "params": self.sim_fields.parameters(), "lr": args.lr, "weight_decay": 1e-4, }, ] self.freeze_velo = False self.window_size_schduler.warmup_step = 800 velo_optim = [ { "params": self.velo_fields.parameters(), "lr": args.lr, "weight_decay": 1e-4, }, ] self.velo_optimizer = torch.optim.AdamW( velo_optim, lr=args.lr, weight_decay=0.0, ) self.velo_scheduler = get_linear_schedule_with_warmup( optimizer=self.velo_optimizer, num_warmup_steps=args.warmup_step, 
num_training_steps=args.train_iters // 3, ) self.velo_optimizer, self.velo_scheduler = accelerator.prepare( self.velo_optimizer, self.velo_scheduler ) self.optimizer = torch.optim.AdamW( optim_list, lr=args.lr, weight_decay=0.0, ) self.trainable_params = trainable_params self.scheduler = get_linear_schedule_with_warmup( optimizer=self.optimizer, num_warmup_steps=args.warmup_step, num_training_steps=args.train_iters, ) self.sim_fields, self.optimizer, self.scheduler = accelerator.prepare( self.sim_fields, self.optimizer, self.scheduler ) self.velo_fields = accelerator.prepare(self.velo_fields) # setup train info self.step = 0 self.batch_size = args.batch_size self.tv_loss_weight = args.tv_loss_weight self.log_iters = args.log_iters self.wandb_iters = args.wandb_iters self.max_grad_norm = args.max_grad_norm self.use_wandb = args.use_wandb if self.accelerator.is_main_process: if args.use_wandb: run = wandb.init( config=dict(args), dir=self.output_path, **{ "mode": "online", "entity": args.wandb_entity, "project": args.wandb_project, }, ) wandb.run.log_code(".") wandb.run.name = args.wandb_name print(f"run dir: {run.dir}") self.wandb_folder = run.dir os.makedirs(self.wandb_folder, exist_ok=True) def init_trainable_params( self, ): # init young modulus and poisson ratio young_numpy = np.exp(np.random.uniform(np.log(1e-3), np.log(1e3))).astype( np.float32 ) young_numpy = np.array([1e5]).astype(np.float32) young_modulus = torch.tensor(young_numpy, dtype=torch.float32).to( self.accelerator.device ) poisson_numpy = np.random.uniform(0.1, 0.4) poisson_ratio = torch.tensor(poisson_numpy, dtype=torch.float32).to( self.accelerator.device ) trainable_params = [young_modulus, poisson_ratio] print( "init young modulus: ", young_modulus.item(), "poisson ratio: ", poisson_ratio.item(), ) return trainable_params def setup_simulation(self, dataset_dir, grid_size=100): device = "cuda:{}".format(self.accelerator.process_index) xyzs = self.render_params.gaussians.get_xyz.detach().clone() 
sim_xyzs = xyzs[self.sim_mask_in_raw_gaussian, :] sim_cov = ( self.render_params.gaussians.get_covariance()[ self.sim_mask_in_raw_gaussian, : ] .detach() .clone() ) # scale, and shift pos_max = sim_xyzs.max() pos_min = sim_xyzs.min() scale = (pos_max - pos_min) * 1.8 shift = -pos_min + (pos_max - pos_min) * 0.25 self.scale, self.shift = scale, shift print("scale, shift", scale, shift) # filled filled_in_points_path = os.path.join(dataset_dir, "internal_filled_points.ply") if os.path.exists(filled_in_points_path): fill_xyzs = pcu.load_mesh_v(filled_in_points_path) # [n, 3] fill_xyzs = fill_xyzs[ np.random.choice( fill_xyzs.shape[0], int(fill_xyzs.shape[0] * 0.25), replace=False ) ] fill_xyzs = torch.from_numpy(fill_xyzs).float().to("cuda") self.fill_xyzs = fill_xyzs print( "loaded {} internal filled points from: ".format(fill_xyzs.shape[0]), filled_in_points_path, ) else: self.fill_xyzs = None if self.fill_xyzs is not None: render_mask_in_sim_pts = torch.cat( [ torch.ones_like(sim_xyzs[:, 0]).bool(), torch.zeros_like(fill_xyzs[:, 0]).bool(), ], dim=0, ).to(device) sim_xyzs = torch.cat([sim_xyzs, fill_xyzs], dim=0) sim_cov = torch.cat( [sim_cov, sim_cov.new_ones((fill_xyzs.shape[0], sim_cov.shape[-1]))], dim=0, ) self.render_mask = render_mask_in_sim_pts else: self.render_mask = torch.ones_like(sim_xyzs[:, 0]).bool().to(device) sim_xyzs = (sim_xyzs + shift) / scale sim_aabb = torch.stack( [torch.min(sim_xyzs, dim=0)[0], torch.max(sim_xyzs, dim=0)[0]], dim=0 ) sim_aabb = ( sim_aabb - torch.mean(sim_aabb, dim=0, keepdim=True) ) * 1.2 + torch.mean(sim_aabb, dim=0, keepdim=True) print("simulation aabb: ", sim_aabb) # point cloud resample with kmeans downsample_scale = self.args.downsample_scale num_cluster = int(sim_xyzs.shape[0] * downsample_scale) sim_xyzs = downsample_with_kmeans_gpu(sim_xyzs, num_cluster) sim_gaussian_pos = self.render_params.gaussians.get_xyz.detach().clone()[ self.sim_mask_in_raw_gaussian, : ] sim_gaussian_pos = (sim_gaussian_pos + shift) / scale 
cdist = torch.cdist(sim_gaussian_pos, sim_xyzs) * -1.0 _, top_k_index = torch.topk(cdist, self.args.top_k, dim=-1) self.top_k_index = top_k_index print("Downsampled to: ", sim_xyzs.shape[0], "by", downsample_scale) points_volume = get_volume(sim_xyzs.detach().cpu().numpy()) num_particles = sim_xyzs.shape[0] sim_aabb = torch.stack( [torch.min(sim_xyzs, dim=0)[0], torch.max(sim_xyzs, dim=0)[0]], dim=0 ) sim_aabb = ( sim_aabb - torch.mean(sim_aabb, dim=0, keepdim=True) ) * 1.2 + torch.mean(sim_aabb, dim=0, keepdim=True) print("simulation aabb: ", sim_aabb) wp.init() wp.config.mode = "debug" wp.config.verify_cuda = True mpm_state = MPMStateStruct() mpm_state.init(num_particles, device=device, requires_grad=True) self.particle_init_position = sim_xyzs.clone() mpm_state.from_torch( self.particle_init_position.clone(), torch.from_numpy(points_volume).float().to(device).clone(), sim_cov, device=device, requires_grad=True, n_grid=grid_size, grid_lim=1.0, ) mpm_model = MPMModelStruct() mpm_model.init(num_particles, device=device, requires_grad=True) mpm_model.init_other_params(n_grid=grid_size, grid_lim=1.0, device=device) material_params = { "material": "jelly", # "jelly", "metal", "sand", "foam", "snow", "plasticine", "neo-hookean" "g": [0.0, 0.0, 0.0], "density": 2000, # kg / m^3 "grid_v_damping_scale": 1.1, # 0.999, } self.v_damping = material_params["grid_v_damping_scale"] self.material_name = material_params["material"] mpm_solver = MPMWARPDiff( num_particles, n_grid=grid_size, grid_lim=1.0, device=device ) mpm_solver.set_parameters_dict(mpm_model, mpm_state, material_params) self.mpm_state, self.mpm_model, self.mpm_solver = ( mpm_state, mpm_model, mpm_solver, ) # setup boundary condition: moving_pts_path = os.path.join(dataset_dir, "moving_part_points.ply") if os.path.exists(moving_pts_path): moving_pts = pcu.load_mesh_v(moving_pts_path) moving_pts = torch.from_numpy(moving_pts).float().to(device) moving_pts = (moving_pts + shift) / scale freeze_mask = 
find_far_points( sim_xyzs, moving_pts, thres=0.5 / grid_size ).bool() freeze_pts = sim_xyzs[freeze_mask, :] grid_freeze_mask = apply_grid_bc_w_freeze_pts( grid_size, 1.0, freeze_pts, mpm_solver ) self.freeze_mask = freeze_mask # does not prefer boundary condition on particle # freeze_mask_select = setup_boundary_condition_with_points(sim_xyzs, moving_pts, # self.mpm_solver, self.mpm_state, thres=0.5 / grid_size) # self.freeze_mask = freeze_mask_select.bool() else: raise NotImplementedError num_freeze_pts = self.freeze_mask.sum() print( "num freeze pts in total", num_freeze_pts.item(), "num moving pts", num_particles - num_freeze_pts.item(), ) # init fields for simulation, e.g. density, external force, etc. # padd init density, youngs, density = ( torch.ones_like(self.particle_init_position[..., 0]) * material_params["density"] ) youngs_modulus = ( torch.ones_like(self.particle_init_position[..., 0]) * self.E_nu_list[0].detach() ) poisson_ratio = torch.ones_like(self.particle_init_position[..., 0]) * 0.3 # load stem for higher density stem_pts_path = os.path.join(dataset_dir, "stem_points.ply") if os.path.exists(stem_pts_path): stem_pts = pcu.load_mesh_v(stem_pts_path) stem_pts = torch.from_numpy(stem_pts).float().to(device) stem_pts = (stem_pts + shift) / scale no_stem_mask = find_far_points( sim_xyzs, stem_pts, thres=2.0 / grid_size ).bool() stem_mask = torch.logical_not(no_stem_mask) density[stem_mask] = 2000 print("num stem pts", stem_mask.sum().item()) self.density = density self.young_modulus = youngs_modulus self.poisson_ratio = poisson_ratio # set density, youngs, poisson mpm_state.reset_density( density.clone(), torch.ones_like(density).type(torch.int), device, update_mass=True, ) mpm_solver.set_E_nu_from_torch( mpm_model, youngs_modulus.clone(), poisson_ratio.clone(), device ) mpm_solver.prepare_mu_lam(mpm_model, mpm_state, device) self.sim_fields = create_spatial_fields(self.args, 1, sim_aabb) self.sim_fields.train() self.args.sim_res = 24 # 
self.velo_fields = create_velocity_model(self.args, sim_aabb) self.velo_fields = create_spatial_fields( self.args, 3, sim_aabb, add_entropy=False ) self.velo_fields.train() def get_simulation_input(self, device): """ Outs: All padded density: [N] young_modulus: [N] poisson_ratio: [N] velocity: [N, 3] query_mask: [N] """ density, youngs_modulus, ret_poisson, entropy = self.get_material_params(device) initial_position_time0 = self.particle_init_position.clone() query_mask = torch.logical_not(self.freeze_mask) query_pts = initial_position_time0[query_mask, :] # velocity = self.velo_fields(torch.cat([query_pts, time_array.unsqueeze(-1)], dim=-1))[..., :3] velocity = self.velo_fields(query_pts)[..., :3] # scaling velocity = velocity * 0.1 # not padded yet ret_velocity = torch.zeros_like(initial_position_time0) ret_velocity[query_mask, :] = velocity # init F, and C I_mat = torch.eye(3, dtype=torch.float32).to(device) particle_F = torch.repeat_interleave( I_mat[None, ...], initial_position_time0.shape[0], dim=0 ) particle_C = torch.zeros_like(particle_F) return ( density, youngs_modulus, ret_poisson, ret_velocity, query_mask, particle_F, particle_C, entropy, ) def get_material_params(self, device): initial_position_time0 = self.particle_init_position.detach() # query_mask = torch.logical_not(self.freeze_mask) query_mask = torch.ones_like(self.freeze_mask).bool() query_pts = initial_position_time0[query_mask, :] if self.args.entropy_cls > 0: sim_params, entropy = self.sim_fields(query_pts) else: sim_params = self.sim_fields(query_pts) entropy = torch.zeros(1).to(sim_params.device) sim_params = sim_params * 1000 # sim_params = torch.exp(self.sim_fields(query_pts)) # density = sim_params[..., 0] youngs_modulus = self.young_modulus.detach().clone() youngs_modulus[query_mask] += sim_params[..., 0] # young_modulus = torch.exp(sim_params[..., 0]) + init_young youngs_modulus = torch.clamp(youngs_modulus, 1000.0, 5e8) density = self.density.detach().clone() # 
density[self.freeze_mask] = 100000 ret_poisson = self.poisson_ratio.detach().clone() return density, youngs_modulus, ret_poisson, entropy def train_one_step(self): self.sim_fields.train() self.velo_fields.train() self.model.eval() accelerator = self.accelerator device = "cuda:{}".format(accelerator.process_index) data = next(self.dataloader) cam = data["cam"][0] gt_videos = data["video_clip"][0, 1 : self.num_frames, ...] window_size = int(self.window_size_schduler.compute_state(self.step)[0]) stop_velo_opt_thres = 15 do_velo_opt = not self.freeze_velo if not do_velo_opt: stop_velo_opt_thres = ( 0 # stop velocity optimization if we are loading from checkpoint ) self.velo_fields.eval() rendered_video_list = [] log_loss_dict = { "loss": [], "l2_loss": [], "psnr": [], "ssim": [], "entropy": [], } log_psnr_dict = {} particle_pos = self.particle_init_position.clone() # clean grid, stress, F, C and rest initial position self.mpm_state.reset_state( particle_pos.clone(), None, None, # .clone(), device=device, requires_grad=True, ) self.mpm_state.set_require_grad(True) ( density, youngs_modulus, poisson, particle_velo, query_mask, particle_F, particle_C, entropy, ) = self.get_simulation_input(device) init_velo_mean = particle_velo[query_mask, :].mean().item() init_velo_max = particle_velo[query_mask, :].max().item() if not do_velo_opt: particle_velo = particle_velo.detach() # print("does do velo opt": do_velo_opt) num_particles = particle_pos.shape[0] delta_time = 1.0 / 30 # 30 fps substep_size = delta_time / self.args.substep num_substeps = int(delta_time / substep_size) checkpoint_steps = self.args.checkpoint_steps start_time_idx = max(0, window_size - self.args.compute_window) temporal_stride = self.args.stride if temporal_stride < 0 or temporal_stride > window_size: temporal_stride = window_size for start_time_idx in range(0, window_size, temporal_stride): end_time_idx = min(start_time_idx + temporal_stride, window_size) num_step_with_grad = num_substeps * (end_time_idx 
- start_time_idx) gt_frame = gt_videos[[end_time_idx - 1]] if start_time_idx != 0: density, youngs_modulus, poisson, entropy = self.get_material_params( device ) if checkpoint_steps > 0 and checkpoint_steps < num_step_with_grad: for time_step in range(0, num_step_with_grad, checkpoint_steps): num_step = min(num_step_with_grad - time_step, checkpoint_steps) if num_step == 0: break particle_pos, particle_velo, particle_F, particle_C = ( MPMDifferentiableSimulationWCheckpoint.apply( self.mpm_solver, self.mpm_state, self.mpm_model, substep_size, num_step, particle_pos, particle_velo, particle_F, particle_C, youngs_modulus, self.E_nu_list[1], density, query_mask, device, True, 0, ) ) else: particle_pos, particle_velo, particle_F, particle_C, particle_cov = ( MPMDifferentiableSimulationClean.apply( self.mpm_solver, self.mpm_state, self.mpm_model, substep_size, num_step_with_grad, particle_pos, particle_velo, particle_F, particle_C, youngs_modulus, self.E_nu_list[1], density, query_mask, device, True, 0, ) ) # substep-3: render gaussian gaussian_pos = particle_pos * self.scale - self.shift undeformed_gaussian_pos = ( self.particle_init_position * self.scale - self.shift ) disp_offset = gaussian_pos - undeformed_gaussian_pos.detach() # gaussian_pos.requires_grad = True simulated_video = render_gaussian_seq_w_mask_with_disp( cam, self.render_params, undeformed_gaussian_pos.detach(), self.top_k_index, [disp_offset], self.sim_mask_in_raw_gaussian, ) # print("debug", simulated_video.shape, gt_frame.shape, gaussian_pos.shape, init_xyzs.shape, density.shape, query_mask.sum().item()) rendered_video_list.append(simulated_video.detach()) l2_loss = 0.5 * F.mse_loss(simulated_video, gt_frame, reduction="mean") ssim_loss = compute_ssim(simulated_video, gt_frame) loss = l2_loss * (1.0 - self.ssim) + (1.0 - ssim_loss) * self.ssim loss = loss * (self.args.loss_decay**end_time_idx) sm_velo_loss = self.velo_fields.compute_smoothess_loss() * 10.0 if not (do_velo_opt and start_time_idx == 
0): sm_velo_loss = sm_velo_loss.detach() sm_spatial_loss = self.sim_fields.compute_smoothess_loss() sm_loss = ( sm_velo_loss + sm_spatial_loss ) # typically 20 times larger than rendering loss loss = loss + sm_loss * self.tv_loss_weight loss = loss + entropy * self.args.entropy_reg loss = loss / self.args.compute_window loss.backward() # from IPython import embed; embed() # print(self.E_nu_list[1].grad) particle_pos, particle_velo, particle_F, particle_C = ( particle_pos.detach(), particle_velo.detach(), particle_F.detach(), particle_C.detach(), ) with torch.no_grad(): psnr = compute_psnr(simulated_video, gt_frame).mean() log_loss_dict["loss"].append(loss.item()) log_loss_dict["l2_loss"].append(l2_loss.item()) log_loss_dict["psnr"].append(psnr.item()) log_loss_dict["ssim"].append(ssim_loss.item()) log_loss_dict["entropy"].append(entropy.item()) print( psnr.item(), end_time_idx, youngs_modulus.max().item(), density.max().item(), ) log_psnr_dict["psnr_frame_{}".format(end_time_idx)] = psnr.item() # print(psnr.item(), end_time_idx, youngs_modulus.max().item(), density.max().item()) nu_grad_norm = self.E_nu_list[1].grad.norm(2).item() spatial_grad_norm = 0 for p in self.sim_fields.parameters(): if p.grad is not None: spatial_grad_norm += p.grad.norm(2).item() velo_grad_norm = 0 for p in self.velo_fields.parameters(): if p.grad is not None: velo_grad_norm += p.grad.norm(2).item() renderd_video = torch.cat(rendered_video_list, dim=0) renderd_video = torch.clamp(renderd_video, 0.0, 1.0) visual_video = (renderd_video.detach().cpu().numpy() * 255.0).astype(np.uint8) gt_video = (gt_videos.detach().cpu().numpy() * 255.0).astype(np.uint8) if ( self.step % self.gradient_accumulation_steps == 0 or self.step == (self.train_iters - 1) or (self.step % self.log_iters == self.log_iters - 1) ): torch.nn.utils.clip_grad_norm_( self.trainable_params, self.max_grad_norm, error_if_nonfinite=False, ) # error if nonfinite is false self.optimizer.step() self.optimizer.zero_grad() if 
do_velo_opt: assert self.velo_optimizer is not None torch.nn.utils.clip_grad_norm_( self.velo_fields.parameters(), self.max_grad_norm, error_if_nonfinite=False, ) # error if nonfinite is false self.velo_optimizer.step() self.velo_optimizer.zero_grad() self.velo_scheduler.step() with torch.no_grad(): self.E_nu_list[0].data.clamp_(1e-1, 1e8) self.E_nu_list[1].data.clamp_(1e-2, 0.449) self.scheduler.step() for k, v in log_loss_dict.items(): log_loss_dict[k] = np.mean(v) print(log_loss_dict) print( "nu: ", self.E_nu_list[1].item(), nu_grad_norm, spatial_grad_norm, velo_grad_norm, "young_mean, max:", youngs_modulus.mean().item(), youngs_modulus.max().item(), do_velo_opt, "init_velo_mean:", init_velo_mean, ) if accelerator.is_main_process and (self.step % self.wandb_iters == 0): with torch.no_grad(): wandb_dict = { "nu_grad_norm": nu_grad_norm, "spatial_grad_norm": spatial_grad_norm, "velo_grad_norm": velo_grad_norm, "nu": self.E_nu_list[1].item(), # "mean_density": density.mean().item(), "mean_E": youngs_modulus.mean().item(), "max_E": youngs_modulus.max().item(), "min_E": youngs_modulus.min().item(), "smoothness_loss": sm_loss.item(), "window_size": window_size, "max_particle_velo": particle_velo.max().item(), "init_velo_mean": init_velo_mean, "init_velo_max": init_velo_max, } wandb_dict.update(log_psnr_dict) simulated_video = self.inference(cam, substep=num_substeps) sim_video_torch = ( torch.from_numpy(simulated_video).float().to(device) / 255.0 ) gt_video_torch = torch.from_numpy(gt_video).float().to(device) / 255.0 full_psnr = compute_psnr(sim_video_torch[1:], gt_video_torch) first_psnr = full_psnr[:6].mean().item() last_psnr = full_psnr[-6:].mean().item() full_psnr = full_psnr.mean().item() wandb_dict["full_psnr"] = full_psnr wandb_dict["first_psnr"] = first_psnr wandb_dict["last_psnr"] = last_psnr wandb_dict.update(log_loss_dict) # add young render youngs_norm = youngs_modulus - youngs_modulus.min() + 1e-2 young_color = youngs_norm / torch.quantile(youngs_norm, 
0.99) young_color = torch.clamp(young_color, 0.0, 1.0) young_color[self.freeze_mask] = 0.0 queryed_young_color = young_color[self.top_k_index] # [n_raw, topk] young_color = queryed_young_color.mean(dim=-1) young_color_full = torch.ones_like( self.render_params.gaussians._xyz[:, 0] ) young_color_full[self.sim_mask_in_raw_gaussian] = young_color young_color = torch.stack( [young_color_full, young_color_full, young_color_full], dim=-1 ) young_img = render_feat_gaussian( cam, self.render_params.gaussians, self.render_params.render_pipe, self.render_params.bg_color, young_color, )["render"] young_img = ( (young_img.detach().cpu().numpy() * 255.0) .astype(np.uint8) .transpose(1, 2, 0) ) wandb_dict["young_img"] = wandb.Image(young_img) if self.step % int(10 * self.wandb_iters) == 0: wandb_dict["rendered_video"] = wandb.Video( visual_video, fps=visual_video.shape[0] ) wandb_dict["gt_video"] = wandb.Video( gt_video, fps=gt_video.shape[0], ) wandb_dict["inference_video"] = wandb.Video( simulated_video, fps=simulated_video.shape[0], ) simulated_video = self.inference( cam, velo_scaling=5.0, num_sec=3, substep=num_substeps ) wandb_dict["inference_video_v5_t3"] = wandb.Video( simulated_video, fps=30, ) if self.use_wandb: wandb.log(wandb_dict, step=self.step) self.accelerator.wait_for_everyone() def train(self): # might remove tqdm when multiple node for index in tqdm(range(self.step, self.train_iters), desc="Training progress"): self.train_one_step() if self.step % self.log_iters == self.log_iters - 1: if self.accelerator.is_main_process: self.save() # self.test() # self.accelerator.wait_for_everyone() self.step += 1 if self.accelerator.is_main_process: self.save() @torch.no_grad() def inference( self, cam, velo_scaling=1.0, num_sec=1, nu=None, young_scaling=1.0, substep=64, youngs_modulus=None, ): self.sim_fields.eval() self.velo_fields.eval() device = "cuda:{}".format(self.accelerator.process_index) ( density, youngs_modulus_, poisson, init_velocity, query_mask, particle_F, 
particle_C, entropy, ) = self.get_simulation_input(device) poisson = self.E_nu_list[1].detach().clone() # override poisson if youngs_modulus is None: youngs_modulus = youngs_modulus_ * young_scaling init_xyzs = self.particle_init_position.clone() init_velocity[query_mask, :] = init_velocity[query_mask, :] * velo_scaling num_particles = init_xyzs.shape[0] # delta_time = 1.0 / (self.num_frames - 1) delta_time = 1.0 / 30 # 30 fps substep_size = delta_time / substep num_substeps = int(delta_time / substep_size) # reset state self.mpm_state.reset_density( density.clone(), query_mask, device, update_mass=True ) self.mpm_solver.set_E_nu_from_torch( self.mpm_model, youngs_modulus.clone(), poisson.clone(), device ) self.mpm_solver.prepare_mu_lam(self.mpm_model, self.mpm_state, device) self.mpm_state.continue_from_torch( init_xyzs, init_velocity, particle_F, particle_C, device=device, requires_grad=False, ) pos_list = [self.particle_init_position.clone() * self.scale - self.shift] prev_state = self.mpm_state for i in tqdm(range((self.num_frames - 1) * num_sec)): # for substep in range(num_substeps): # self.mpm_solver.p2g2p(self.mpm_model, self.mpm_state, substep, substep_size, device="cuda:0") # pos = wp.to_torch(self.mpm_state.particle_x).clone() for substep_local in range(num_substeps): next_state = prev_state.partial_clone(requires_grad=False) self.mpm_solver.p2g2p_differentiable( self.mpm_model, prev_state, next_state, substep_size, device=device ) prev_state = next_state pos = wp.to_torch(next_state.particle_x).clone() pos = (pos * self.scale) - self.shift pos_list.append(pos) init_pos = pos_list[0].clone() pos_diff_list = [_ - init_pos for _ in pos_list] video_array = render_gaussian_seq_w_mask_with_disp( cam, self.render_params, init_pos, self.top_k_index, pos_diff_list, self.sim_mask_in_raw_gaussian, ) video_numpy = video_array.detach().cpu().numpy() * 255 video_numpy = np.clip(video_numpy, 0, 255).astype(np.uint8) return video_numpy def save( self, ): # training 
        # (training) states: serialize the two field networks into
        # output_path/checkpoint_model_{step:06d}/{velo_fields,sim_fields}.pt
        output_path = os.path.join(
            self.output_path, f"checkpoint_model_{self.step:06d}"
        )
        os.makedirs(output_path, exist_ok=True)

        name_list = [
            "velo_fields",
            "sim_fields",
        ]
        for i, model in enumerate(
            [
                self.accelerator.unwrap_model(self.velo_fields, keep_fp32_wrapper=True),
                self.accelerator.unwrap_model(self.sim_fields, keep_fp32_wrapper=True),
            ]
        ):
            model_name = name_list[i]
            model_path = os.path.join(output_path, model_name + ".pt")
            torch.save(model.state_dict(), model_path)

    def load(self, checkpoint_dir):
        """Load field weights written by `save`; skips sim_fields unless args.load_sim."""
        name_list = [
            "velo_fields",
            "sim_fields",
        ]
        for i, model in enumerate([self.velo_fields, self.sim_fields]):
            model_name = name_list[i]
            if model_name == "sim_fields" and (not self.args.load_sim):
                continue
            model_path = os.path.join(checkpoint_dir, model_name + ".pt")
            print("=> loading: ", model_path)
            model.load_state_dict(torch.load(model_path))

    def setup_eval(self, args, gaussian_path, white_background=True):
        """Load the static Gaussian scene and set up render params.

        Also computes which gaussians participate in simulation (via the
        optional clean_object_points.ply next to the gaussian file).

        Returns:
            aabb: [2, 3] tensor, scene bounding box scaled by 1.1 around its center.
        """
        # setup gaussians
        class RenderPipe(NamedTuple):
            convert_SHs_python = False
            compute_cov3D_python = False
            debug = False

        class RenderParams(NamedTuple):
            render_pipe: RenderPipe
            bg_color: bool
            gaussians: GaussianModel
            camera_list: list

        gaussians = GaussianModel(3)
        camera_list = self.dataset.test_camera_list
        gaussians.load_ply(gaussian_path)
        gaussians.detach_grad()
        print(
            "load gaussians from: {}".format(gaussian_path),
            "... num gaussians: ",
            gaussians._xyz.shape[0],
        )

        bg_color = [1, 1, 1] if white_background else [0, 0, 0]
        background = torch.tensor(bg_color, dtype=torch.float32, device="cuda")

        render_pipe = RenderPipe()
        render_params = RenderParams(
            render_pipe=render_pipe,
            bg_color=background,
            gaussians=gaussians,
            camera_list=camera_list,
        )
        self.render_params = render_params

        # get_gaussian scene box
        scaler = 1.1
        points = gaussians._xyz
        min_xyz = torch.min(points, dim=0)[0]
        max_xyz = torch.max(points, dim=0)[0]
        center = (min_xyz + max_xyz) / 2
        scaled_min_xyz = (min_xyz - center) * scaler + center
        scaled_max_xyz = (max_xyz - center) * scaler + center
        aabb = torch.stack([scaled_min_xyz, scaled_max_xyz], dim=0)

        # add filled in points
        gaussian_dir = os.path.dirname(gaussian_path)
        clean_points_path = os.path.join(gaussian_dir, "clean_object_points.ply")
        if os.path.exists(clean_points_path):
            clean_xyzs = pcu.load_mesh_v(clean_points_path)
            clean_xyzs = torch.from_numpy(clean_xyzs).float().to("cuda")
            self.clean_xyzs = clean_xyzs
            print(
                "loaded {} clean points from: ".format(clean_xyzs.shape[0]),
                clean_points_path,
            )
            # we can use tight threshold here
            not_sim_maks = find_far_points(
                gaussians._xyz, clean_xyzs, thres=0.01
            ).bool()
            sim_mask_in_raw_gaussian = torch.logical_not(not_sim_maks)  # [N]
            self.sim_mask_in_raw_gaussian = sim_mask_in_raw_gaussian
        else:
            # no clean point cloud: every gaussian is simulated
            self.clean_xyzs = None
            self.sim_mask_in_raw_gaussian = torch.ones_like(gaussians._xyz[:, 0]).bool()
        return aabb

    def demo(
        self,
        velo_scaling=5.0,
        num_sec=8.0,
        eval_ys=1.0,
        static_camera=False,
        save_name="demo_3sec",
    ):
        """Run a long rollout for demo rendering and save it as a gif.

        Particle positions can be cached as <save_name>_pos.npy, although see
        the note below — the cache is currently always discarded.
        """
        result_dir = "output/alocasia/results"
        pos_path = os.path.join(result_dir, save_name + "_pos.npy")
        if os.path.exists(pos_path):
            pos_array = np.load(pos_path)
        else:
            pos_array = None
        # NOTE(review): this unconditionally discards the cached positions
        # loaded just above, forcing a re-simulation every run — confirm
        # whether this debug override should be removed.
        pos_array = None

        accelerator = self.accelerator
        data = next(self.dataloader)
        cam = data["cam"][0]
        # advance the test loader a few batches to pick a later camera
        for i in range(10):
            next_data = next(self.test_dataloader)
            next_cam = next_data["cam"][0]

        substep = self.args.substep  # 1e-4
        youngs_modulus = None
        self.sim_fields.eval()
        self.velo_fields.eval()
        device = "cuda:{}".format(self.accelerator.process_index)
        (
            density,
            youngs_modulus_,
            poisson,
            init_velocity,
            query_mask,
            particle_F,
            particle_C,
            entropy,
        ) = self.get_simulation_input(device)
        poisson = self.E_nu_list[1].detach().clone()  # override poisson

        # eval_ys < 10 => use the learned Young's modulus field;
        # otherwise evaluate with a constant modulus of eval_ys everywhere
        if eval_ys < 10:
            youngs_modulus = youngs_modulus_
        else:
            youngs_modulus = torch.ones_like(youngs_modulus_) * eval_ys

        # from IPython import embed; embed()
        if pos_array is None:
            # no cached rollout: simulate from scratch
            init_xyzs = self.particle_init_position.clone()
            init_velocity[query_mask, :] = init_velocity[query_mask, :] * velo_scaling
            num_particles = init_xyzs.shape[0]

            # delta_time = 1.0 / (self.num_frames - 1)
            delta_time = 1.0 / 30  # 30 fps
            substep_size = delta_time / substep
            num_substeps = int(delta_time / substep_size)

            # reset state
            self.mpm_state.reset_density(
                density.clone(), query_mask, device, update_mass=True
            )
            self.mpm_solver.set_E_nu_from_torch(
                self.mpm_model, youngs_modulus.clone(), poisson.clone(), device
            )
            self.mpm_solver.prepare_mu_lam(self.mpm_model, self.mpm_state, device)
            self.mpm_state.continue_from_torch(
                init_xyzs,
                init_velocity,
                particle_F,
                particle_C,
                device=device,
                requires_grad=False,
            )

            # world-space positions (sim space scaled/shifted back)
            pos_list = [self.particle_init_position.clone() * self.scale - self.shift]
            prev_state = self.mpm_state
            for i in tqdm(range(int((self.num_frames - 1) * num_sec))):
                # for substep in range(num_substeps):
                #     self.mpm_solver.p2g2p(self.mpm_model, self.mpm_state, substep, substep_size, device="cuda:0")
                # pos = wp.to_torch(self.mpm_state.particle_x).clone()
                for substep_local in range(num_substeps):
                    next_state = prev_state.partial_clone(requires_grad=False)
                    self.mpm_solver.p2g2p_differentiable(
                        self.mpm_model,
                        prev_state,
                        next_state,
                        substep_size,
                        device=device,
                    )
                    prev_state = next_state
                pos = wp.to_torch(next_state.particle_x).clone()
                pos = (pos * self.scale) - self.shift
                pos_list.append(pos)
            # cache the rollout for later runs
            numpy_pos = torch.stack(pos_list, dim=0).detach().cpu().numpy()
            np.save(pos_path, numpy_pos)
        else:
            # reuse the cached rollout
            pos_list = []
            for i in range(pos_array.shape[0]):
                pos = pos_array[i, ...]
                pos_list.append(torch.from_numpy(pos).to(device))

        init_pos = pos_list[0].clone()
        pos_diff_list = [_ - init_pos for _ in pos_list]
        video_array = render_gaussian_seq_w_mask_with_disp(
            cam,
            self.render_params,
            init_pos,
            self.top_k_index,
            pos_diff_list,
            self.sim_mask_in_raw_gaussian,
        )

        # float [0,1] CHW frames -> uint8 THWC for imageio
        video_numpy = video_array.detach().cpu().numpy() * 255
        video_numpy = np.clip(video_numpy, 0, 255).astype(np.uint8)
        video_numpy = np.transpose(video_numpy, [0, 2, 3, 1])

        from motionrep.utils.io_utils import save_video_imageio, save_gif_imageio

        save_path = os.path.join(
            save_name
            + "_jelly_video_substep_{}_grid_{}_evalys_{}".format(
                substep, self.args.grid_size, eval_ys
            )
            + ".gif"
        )
        print("save video to ", save_path)
        save_gif_imageio(save_path, video_numpy, fps=30)


def parse_args():
    """Build the training/eval CLI and merge it with the YAML config file."""
    parser = argparse.ArgumentParser()

    parser.add_argument("--config", type=str, default="config.yml")

    # dataset params
    parser.add_argument(
        "--dataset_dir",
        type=str,
        default="../../data/physics_dreamer/hat_nerfstudio/",
    )
    parser.add_argument("--video_dir_name", type=str, default="videos")
    parser.add_argument(
        "--dataset_res",
        type=str,
        default="large",  # ["middle", "small", "large"]
    )
    parser.add_argument(
        "--motion_model_path",
        type=str,
        default=None,  # not used
        help="path to load the pretrained motion model from",
    )

    # model params
    parser.add_argument("--model", type=str, default="se3_field")
    parser.add_argument("--feat_dim", type=int, default=64)
    parser.add_argument("--num_decoder_layers", type=int, default=3)
    parser.add_argument("--decoder_hidden_size", type=int, default=64)
    parser.add_argument("--spatial_res", type=int, default=32)
    parser.add_argument("--zero_init", type=bool, default=True)
    parser.add_argument("--entropy_cls", type=int, default=-1)
    parser.add_argument("--entropy_reg", type=float, default=1e-2)

    # NOTE(review): type=str for a frame count looks unintended — confirm
    # whether this should be type=int.
    parser.add_argument("--num_frames", type=str, default=14)
    parser.add_argument("--grid_size", type=int, default=64)
    parser.add_argument("--sim_res", type=int,
                        default=8)
    parser.add_argument("--sim_output_dim", type=int, default=1)
    parser.add_argument("--substep", type=int, default=768)
    parser.add_argument("--loss_decay", type=float, default=1.0)
    parser.add_argument("--start_window_size", type=int, default=6)
    parser.add_argument("--compute_window", type=int, default=1)
    parser.add_argument("--grad_window", type=int, default=14)
    # -1 means no gradient checkpointing
    parser.add_argument("--checkpoint_steps", type=int, default=-1)
    parser.add_argument("--stride", type=int, default=1)
    parser.add_argument("--downsample_scale", type=float, default=0.04)
    parser.add_argument("--top_k", type=int, default=8)

    # loss parameters
    parser.add_argument("--tv_loss_weight", type=float, default=1e-4)
    parser.add_argument("--ssim", type=float, default=0.9)

    # Logging and checkpointing
    parser.add_argument("--output_dir", type=str, default="../../output/inverse_sim")
    parser.add_argument("--log_iters", type=int, default=10)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        # psnr 29.0
        default="../../output/inverse_sim/fast_alocasia_velopretrain_cleandecay_1.0_substep_96_se3_field_lr_0.01_tv_0.01_iters_300_sw_2_cw_2/seed0/checkpoint_model_000299",
        help="path to load velocity pretrain checkpoint from",
    )

    # training parameters
    parser.add_argument("--train_iters", type=int, default=200)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--max_grad_norm", type=float, default=1.0)
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
    )

    # wandb parameters
    parser.add_argument("--use_wandb", action="store_true", default=False)
    parser.add_argument("--wandb_entity", type=str, default="mit-cv")
    parser.add_argument("--wandb_project", type=str, default="inverse_sim")
    parser.add_argument("--wandb_iters", type=int, default=10)
    parser.add_argument("--wandb_name", type=str, required=True)
    parser.add_argument("--run_eval", action="store_true", default=False)
    parser.add_argument("--load_sim", action="store_true", default=False)
    parser.add_argument("--test_convergence", action="store_true", default=False)
    parser.add_argument("--update_velo", action="store_true", default=False)
    parser.add_argument("--eval_iters", type=int, default=8)
    parser.add_argument("--eval_ys", type=float, default=1e6)
    parser.add_argument("--demo_name", type=str, default="demo_3sec_sv_gres48_lr1e-2")
    parser.add_argument("--velo_scaling", type=float, default=5.0)

    # distributed training args
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )

    args, extra_args = parser.parse_known_args()
    cfg = create_config(args.config, args, extra_args)

    # environment variable takes precedence over the CLI flag
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank
    print(args.local_rank, "local rank")
    return cfg


if __name__ == "__main__":
    args = parse_args()
    # torch.backends.cuda.matmul.allow_tf32 = True
    trainer = Trainer(args)

    if args.run_eval:
        trainer.demo(
            velo_scaling=args.velo_scaling,
            eval_ys=args.eval_ys,
            save_name=args.demo_name,
        )
    else:
        # trainer.debug()
        trainer.train()


================================================
FILE: projects/uncleaned_train/motionrep/datatools/_convert_fbx_to_mesh.py
================================================
import bpy
import numpy as np
import sys
import point_cloud_utils as pcu
import os


def convert(fbx_path):
    """Import an FBX animation and extract per-frame vertex trajectories.

    Returns:
        (faces_array, vertex_array, vertex_traj_list): triangle faces [nt, 3],
        first-frame vertices [nv, 3], and per-frame positions [nf, nv, 3].
    """
    bpy.ops.wm.read_factory_settings(use_empty=True)
    # 1. Import the FBX file
    bpy.ops.import_scene.fbx(filepath=fbx_path)
    print("loaded fbx from: ", fbx_path)

    # Assuming the FBX file has one main mesh object, get it
    # NOTE(review): index 1 assumes a specific import order (e.g. armature at
    # 0, mesh at 1) — confirm for other FBX files.
    mesh_obj = bpy.context.selected_objects[1]

    # 2.
    # 2. Duplicate the mesh for the first frame
    bpy.context.view_layer.objects.active = mesh_obj
    mesh_obj.select_set(True)
    bpy.ops.object.duplicate()
    static_mesh = bpy.context.object

    # Apply the first frame's pose to the static mesh
    bpy.context.scene.frame_set(1)
    bpy.ops.object.modifier_apply({"object": static_mesh}, modifier="Armature")

    # 3. Calculate and store vertex offsets for each subsequent frame
    vertex_traj_list = []
    num_frames = bpy.context.scene.frame_end
    for frame in range(1, num_frames + 1):
        bpy.context.scene.frame_set(frame)
        bpy.context.view_layer.update()

        # Update the mesh to the current frame's pose
        # mesh_obj.data.update()
        all_pts_3d = []
        for v1, v2 in zip(static_mesh.data.vertices, mesh_obj.data.vertices):
            pts_3d = v2.co
            all_pts_3d.append(pts_3d)
        vertex_traj_list.append(np.array(all_pts_3d))

    vertex_traj_list = np.stack(vertex_traj_list, axis=0)
    # Now, frame_offsets contains the vertex offsets for each frame

    vertex_array = vertex_traj_list[0]  # first frame

    # get face indx (triangulate first so all faces are triangles)
    bpy.context.view_layer.objects.active = static_mesh
    bpy.ops.object.mode_set(mode="EDIT")
    bpy.ops.mesh.select_all(action="SELECT")
    bpy.ops.mesh.quads_convert_to_tris(quad_method="BEAUTY", ngon_method="BEAUTY")
    bpy.ops.object.mode_set(mode="OBJECT")
    faces_list = [list(face.vertices) for face in static_mesh.data.polygons]
    faces_array = np.array(faces_list, dtype=np.int32)

    vertices = np.array([v.co for v in static_mesh.data.vertices])
    print("vertices shape: ", vertices.shape)

    print(
        "num_frames: ",
        num_frames,
        "offsets shape",
        vertex_traj_list.shape,
        "num_faces",
        faces_array.shape,
        "max offset",
        np.max(vertex_traj_list - vertex_array[np.newaxis, :, :]),
        np.min(vertex_traj_list - vertex_array[np.newaxis, :, :]),
    )

    mean = np.mean(vertices, axis=0)
    max_range = np.max(np.max(vertices, axis=0) - np.min(vertices, axis=0))
    print("max_range: ", max_range, "mean: ", mean)

    # normalize
    # vertex_array = (vertex_array - mean[np.newaxis, :]) / max_range
    # vertex_traj_list = (vertex_traj_list - mean[np.newaxis, np.newaxis, :]) / max_range

    return faces_array, vertex_array, vertex_traj_list


def convert2(fbx_path):
    """Variant of `convert` that handles multiple mesh objects.

    Applies all modifiers to a per-frame duplicate of each mesh and
    concatenates the vertex positions of all meshes per frame.

    Returns:
        (faces_array, vertex_array, vertex_traj_list) as in `convert`.
    """
    bpy.ops.import_scene.fbx(filepath=fbx_path)

    # Assuming the imported object is the active object
    obj = bpy.context.active_object

    for obj in bpy.context.selected_objects:
        print("obj: ", obj.name, obj.type)
    mesh_objects = [obj for obj in bpy.context.selected_objects if obj.type == "MESH"]

    # Ensure it's in object mode
    bpy.ops.object.mode_set(mode="OBJECT")

    # Get the total number of frames in the scene
    start_frame = bpy.context.scene.frame_start
    end_frame = bpy.context.scene.frame_end

    # Create a dictionary to store vertex positions for each frame
    vertex_data_list = []

    # Get the dependency graph
    depsgraph = bpy.context.evaluated_depsgraph_get()

    # Iterate over each frame
    for frame in range(start_frame, end_frame + 1):
        bpy.context.scene.frame_set(frame)

        # Update the scene to reflect changes
        bpy.context.view_layer.update()

        ret_list = []
        for mesh_obj in mesh_objects:
            # deformed_mesh = mesh_obj.to_mesh()
            # Extract vertex positions for the current frame
            # vertex_positions = [vertex.co for vertex in deformed_mesh.vertices]
            # vertex_positions = [vertex.co.copy() for vertex in deformed_mesh.vertices]

            # duplicate so modifier application does not destroy the original
            duplicated_obj = mesh_obj.copy()
            duplicated_obj.data = mesh_obj.data.copy()
            bpy.context.collection.objects.link(duplicated_obj)

            # Make the duplicated object the active object
            bpy.context.view_layer.objects.active = duplicated_obj
            duplicated_obj.select_set(True)

            print("duplicated_obj.modifiers", duplicated_obj.modifiers)
            for mod in duplicated_obj.modifiers:
                bpy.ops.object.modifier_apply(
                    {"object": duplicated_obj}, modifier=mod.name
                )

            # if "Armature" in duplicated_obj.modifiers:
            #     bpy.ops.object.modifier_apply(
            #         {"object": duplicated_obj}, modifier="Armature"
            #     )

            # Extract vertex positions from the duplicated object
            vertex_positions = [
                vertex.co.copy() for vertex in duplicated_obj.data.vertices
            ]
            ret_list += vertex_positions

        # Convert to numpy array and store in the dictionary
        vertex_data_list.append(np.array(ret_list))

    vertex_traj_list = np.stack(vertex_data_list, axis=0)

    print(
        "offsets shape",
        vertex_traj_list.shape,
        "max offset",
        np.max(vertex_traj_list - vertex_traj_list[0:1, :, :]),
        np.min(vertex_traj_list - vertex_traj_list[0:1, :, :]),
    )

    # bpy.ops.object.mode_set(mode="EDIT")
    # bpy.ops.mesh.select_all(action="SELECT")
    # bpy.ops.mesh.quads_convert_to_tris(quad_method="BEAUTY", ngon_method="BEAUTY")
    # bpy.ops.object.mode_set(mode="OBJECT")

    if bpy.context.active_object.type == "MESH":
        obj = bpy.context.active_object

        # Set the mode to 'EDIT'
        bpy.ops.object.mode_set(mode="EDIT")

        # Ensure the mesh is the active object and is in edit mode
        if bpy.context.mode == "EDIT_MESH" and bpy.context.object == obj:
            bpy.ops.mesh.select_all(action="SELECT")
            bpy.ops.mesh.quads_convert_to_tris(
                quad_method="BEAUTY", ngon_method="BEAUTY"
            )
            bpy.ops.object.mode_set(mode="OBJECT")
        else:
            print("Failed to set the correct context.")
    else:
        print("Active object is not a mesh.")

    # NOTE(review): this collects faces from `obj` (the active object) once
    # per mesh in mesh_objects, not from each mesh_obj — confirm intended.
    faces_list = []
    for mesh_obj in mesh_objects:
        _fl = [list(face.vertices) for face in obj.data.polygons]
        faces_list += _fl
    faces_array = np.array(faces_list, dtype=np.int32)

    vertex_array = vertex_traj_list[0]  # first frame
    print("face shape", faces_array.shape)
    return faces_array, vertex_array, vertex_traj_list


def main():
    # Blender passes script arguments after a standalone "--"
    argv = sys.argv
    argv = argv[argv.index("--") + 1 :]  # get all args after "--"
    print(argv)

    fbx_path = argv[0]  # input mesh path
    output_dir = argv[1]  # output dir
    # num_frames = int(argv[2])
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    faces_array, vertex_array, vertex_traj_array = convert2(fbx_path)

    save_mesh_path = os.path.join(output_dir, "mesh0.obj")
    pcu.save_mesh_vf(save_mesh_path, vertex_array, faces_array)

    save_traj_path = os.path.join(output_dir, "traj.npy")
    np.save(save_traj_path, vertex_traj_array)


if __name__ == "__main__":
    main()


================================================ FILE:
projects/uncleaned_train/motionrep/datatools/blender_deforming_things4d.py ================================================ import sys import numpy import os import PIL import mathutils def anime_read(filename): """ filename: path of .anime file return: nf: number of frames in the animation nv: number of vertices in the mesh (mesh topology fixed through frames) nt: number of triangle face in the mesh vert_data: vertice data of the 1st frame (3D positions in x-y-z-order) face_data: riangle face data of the 1st frame offset_data: 3D offset data from the 2nd to the last frame """ f = open(filename, "rb") nf = np.fromfile(f, dtype=np.int32, count=1)[0] nv = np.fromfile(f, dtype=np.int32, count=1)[0] nt = np.fromfile(f, dtype=np.int32, count=1)[0] vert_data = np.fromfile(f, dtype=np.float32, count=nv * 3) face_data = np.fromfile(f, dtype=np.int32, count=nt * 3) offset_data = np.fromfile(f, dtype=np.float32, count=-1) """check data consistency""" if len(offset_data) != (nf - 1) * nv * 3: raise ("data inconsistent error!", filename) vert_data = vert_data.reshape((-1, 3)) face_data = face_data.reshape((-1, 3)) offset_data = offset_data.reshape((nf - 1, nv, 3)) return nf, nv, nt, vert_data, face_data, offset_data def argv = sys.argv argv = argv[argv.index("--") + 1 :] # get all args after "--" print(argv) # ['arg1', 'arg2', 'arg3'] # for package install # import site # import pip # pip.main(["install", "Pillow", "--target", site.USER_SITE]) ================================================ FILE: projects/uncleaned_train/motionrep/datatools/blender_install_packages.py ================================================ import site import pip # pip.main(["install", "point-cloud-utils", "--target", site.USER_SITE]) ================================================ FILE: projects/uncleaned_train/motionrep/datatools/blender_render_imgs.py ================================================ import bpy import os import numpy as np import math import sys import struct import collections 
from mathutils import Matrix, Quaternion
from scipy.spatial.transform import Rotation


def focal2fov(focal, pixels):
    """Convert a focal length in pixels to a field of view in radians."""
    return 2 * math.atan(pixels / (2 * focal))


def create_camera(location, rotation):
    """Create a new camera object at `location` with euler `rotation`."""
    # Create a new camera
    bpy.ops.object.camera_add(location=location, rotation=rotation)
    return bpy.context.active_object


def set_camera_look_at(camera, target_point):
    """Orient `camera` so its -Z axis points at `target_point`.

    Returns the track quaternion used for the orientation.
    """
    # Compute the direction vector from the camera to the target point
    direction = target_point - camera.location

    # Compute the rotation matrix to align the camera's -Z axis to this direction
    rot_quat = direction.to_track_quat("-Z", "Y")
    camera.rotation_euler = rot_quat.to_euler()
    return rot_quat


def setup_alpha_mask(obj_name, pass_index=1):
    """Wire a compositor graph that writes `obj_name`'s object-index mask into alpha."""
    # Set the object's pass index
    obj = bpy.data.objects[obj_name]
    obj.pass_index = pass_index

    # Enable the Object Index pass for the active render layer
    bpy.context.view_layer.use_pass_object_index = True

    # Enable 'Use Nodes':
    bpy.context.scene.use_nodes = True
    tree = bpy.context.scene.node_tree

    # Clear default nodes
    for node in tree.nodes:
        tree.nodes.remove(node)

    # Add Render Layers node
    render_layers = tree.nodes.new("CompositorNodeRLayers")

    # Add Composite node (output)
    composite = tree.nodes.new("CompositorNodeComposite")

    # Add ID Mask node
    id_mask = tree.nodes.new("CompositorNodeIDMask")
    id_mask.index = pass_index

    # Add Set Alpha node
    set_alpha = tree.nodes.new("CompositorNodeSetAlpha")

    # Connect nodes
    tree.links.new(render_layers.outputs["Image"], set_alpha.inputs["Image"])
    tree.links.new(render_layers.outputs["IndexOB"], id_mask.inputs[0])
    tree.links.new(id_mask.outputs[0], set_alpha.inputs["Alpha"])
    tree.links.new(set_alpha.outputs["Image"], composite.inputs["Image"])


def render_scene(camera, output_path):
    """Render the current scene from `camera` to `output_path` (RGBA, object-masked)."""
    bpy.context.scene.render.film_transparent = True
    setup_alpha_mask("MyMeshObject", 1)

    # Set the active camera
    bpy.context.scene.render.image_settings.color_mode = "RGBA"
    bpy.context.scene.camera = camera

    # Set the output path for the render
    bpy.context.scene.render.filepath = output_path

    # Render the scene
    bpy.ops.render.render(write_still=True)


def setup_light():
    """Add two directional (SUN) lights so the mesh is lit from opposite sides."""
    # Add first directional light (Sun lamp)
    light_data_1 = bpy.data.lights.new(name="Directional_Light_1", type="SUN")
    light_data_1.energy = 3  # Adjust energy as needed
    light_1 = bpy.data.objects.new(name="Directional_Light_1", object_data=light_data_1)
    bpy.context.collection.objects.link(light_1)
    light_1.location = (10, 10, 10)  # Adjust location as needed
    light_1.rotation_euler = (
        np.radians(45),
        np.radians(0),
        np.radians(45),
    )  # Adjust rotation for direction

    # Add second directional light (Sun lamp)
    light_data_2 = bpy.data.lights.new(name="Directional_Light_2", type="SUN")
    light_data_2.energy = 5  # Adjust energy as needed
    light_2 = bpy.data.objects.new(name="Directional_Light_2", object_data=light_data_2)
    bpy.context.collection.objects.link(light_2)
    light_2.location = (10, -10, 10)  # Adjust location as needed
    light_2.rotation_euler = (
        np.radians(45),
        np.radians(180),
        np.radians(45),
    )  # Adjust rotation for direction


def create_mesh_from_data(vertices, faces):
    """Build a Blender mesh named "MyMeshObject" from numpy vertices/faces.

    Clears existing meshes, creates the object, UV-unwraps it and attaches a
    normal-colored material. Returns None.
    """
    # Clear existing mesh objects in the scene
    bpy.ops.object.select_all(action="DESELECT")
    bpy.ops.object.select_by_type(type="MESH")
    bpy.ops.object.delete()

    vertices_list = vertices.tolist()
    faces_list = faces.tolist()

    # Create a new mesh
    mesh_name = "MyMesh"
    mesh = bpy.data.meshes.new(name=mesh_name)
    obj = bpy.data.objects.new("MyMeshObject", mesh)

    # Link it to the scene
    bpy.context.collection.objects.link(obj)
    bpy.context.view_layer.objects.active = obj
    obj.select_set(True)

    # Load the mesh data
    mesh.from_pydata(vertices_list, [], faces_list)
    mesh.update()

    # mesh_data = bpy.data.meshes.new(mesh_name)
    # mesh_data.from_pydata(vertices_list, [], faces_list)
    # mesh_data.update()
    # the_mesh = bpy.data.objects.new(mesh_name, mesh_data)
    # the_mesh.data.vertex_colors.new()  # init color
    # bpy.context.collection.objects.link(the_mesh)

    # UV unwrap the mesh
bpy.ops.object.select_all(action="DESELECT") obj.select_set(True) bpy.context.view_layer.objects.active = obj bpy.ops.object.mode_set(mode="EDIT") bpy.ops.mesh.select_all(action="SELECT") bpy.ops.uv.smart_project() bpy.ops.object.mode_set(mode="OBJECT") # Texture the mesh based on its normals mat = bpy.data.materials.new(name="NormalMaterial") mat.use_nodes = True bsdf = mat.node_tree.nodes["Principled BSDF"] normal_node = mat.node_tree.nodes.new(type="ShaderNodeNormal") geometry = mat.node_tree.nodes.new(type="ShaderNodeNewGeometry") # mat.node_tree.links.new(geometry.outputs["Normal"], normal_node.inputs["Normal"]) # mat.node_tree.links.new(normal_node.outputs["Dot"], bsdf.inputs["Base Color"]) mat.node_tree.links.new(geometry.outputs["Normal"], bsdf.inputs["Base Color"]) obj.data.materials.append(mat) return None CameraModel = collections.namedtuple( "CameraModel", ["model_id", "model_name", "num_params"] ) Camera = collections.namedtuple("Camera", ["id", "model", "width", "height", "params"]) BaseImage = collections.namedtuple( "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"] ) Point3D = collections.namedtuple( "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"] ) CAMERA_MODELS = { CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3), CameraModel(model_id=1, model_name="PINHOLE", num_params=4), CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4), CameraModel(model_id=3, model_name="RADIAL", num_params=5), CameraModel(model_id=4, model_name="OPENCV", num_params=8), CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8), CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12), CameraModel(model_id=7, model_name="FOV", num_params=5), CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4), CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5), CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12), } CAMERA_MODEL_IDS = dict( 
[(camera_model.model_id, camera_model) for camera_model in CAMERA_MODELS] ) CAMERA_MODEL_NAMES = dict( [(camera_model.model_name, camera_model) for camera_model in CAMERA_MODELS] ) def write_next_bytes(fid, data, format_char_sequence, endian_character="<"): """pack and write to a binary file. :param fid: :param data: data to send, if multiple elements are sent at the same time, they should be encapsuled either in a list or a tuple :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}. should be the same length as the data list or tuple :param endian_character: Any of {@, =, <, >, !} """ if isinstance(data, (list, tuple)): bytes = struct.pack(endian_character + format_char_sequence, *data) else: bytes = struct.pack(endian_character + format_char_sequence, data) fid.write(bytes) def write_cameras_binary(cameras, path_to_model_file): """ see: src/colmap/scene/reconstruction.cc void Reconstruction::WriteCamerasBinary(const std::string& path) void Reconstruction::ReadCamerasBinary(const std::string& path) """ with open(path_to_model_file, "wb") as fid: write_next_bytes(fid, len(cameras), "Q") for _, cam in cameras.items(): model_id = CAMERA_MODEL_NAMES[cam.model].model_id camera_properties = [cam.id, model_id, cam.width, cam.height] write_next_bytes(fid, camera_properties, "iiQQ") for p in cam.params: write_next_bytes(fid, float(p), "d") return cameras def write_images_binary(images, path_to_model_file): """ see: src/colmap/scene/reconstruction.cc void Reconstruction::ReadImagesBinary(const std::string& path) void Reconstruction::WriteImagesBinary(const std::string& path) """ with open(path_to_model_file, "wb") as fid: write_next_bytes(fid, len(images), "Q") for _, img in images.items(): write_next_bytes(fid, img.id, "i") write_next_bytes(fid, img.qvec.tolist(), "dddd") write_next_bytes(fid, img.tvec.tolist(), "ddd") write_next_bytes(fid, img.camera_id, "i") for char in img.name: write_next_bytes(fid, char.encode("utf-8"), "c") write_next_bytes(fid, 
b"\x00", "c") write_next_bytes(fid, len(img.point3D_ids), "Q") for xy, p3d_id in zip(img.xys, img.point3D_ids): write_next_bytes(fid, [*xy, p3d_id], "ddq") def write_points3D_binary(points3D, path_to_model_file): """ see: src/colmap/scene/reconstruction.cc void Reconstruction::ReadPoints3DBinary(const std::string& path) void Reconstruction::WritePoints3DBinary(const std::string& path) """ with open(path_to_model_file, "wb") as fid: write_next_bytes(fid, len(points3D), "Q") for _, pt in points3D.items(): write_next_bytes(fid, pt.id, "Q") write_next_bytes(fid, pt.xyz.tolist(), "ddd") write_next_bytes(fid, pt.rgb.tolist(), "BBB") write_next_bytes(fid, pt.error, "d") track_length = pt.image_ids.shape[0] write_next_bytes(fid, track_length, "Q") for image_id, point2D_id in zip(pt.image_ids, pt.point2D_idxs): write_next_bytes(fid, [image_id, point2D_id], "ii") def get_colmap_camera(camera_obj, render_resolution): """ Extract the intrinsic matrix from a Blender camera. Args: - camera_obj: The Blender camera object. - render_resolution: Tuple of (width, height) indicating the render resolution. 
    Returns:
    - colmap_camera: dict of ["id", "model", "width", "height", "params"]
    """
    # Get the camera data
    cam = camera_obj.data

    # Ensure it's a perspective camera
    if cam.type != "PERSP":
        raise ValueError("Only 'PERSP' camera type is supported.")

    # Image resolution
    width, height = render_resolution

    # Sensor width and height in millimeters
    sensor_width_mm = cam.sensor_width
    sensor_height_mm = cam.sensor_height

    # Calculate the focal length in pixels
    fx = (cam.lens / sensor_width_mm) * width
    fy = (cam.lens / sensor_height_mm) * height

    # Principal point, usually at the center of the image
    cx = width / 2.0
    cy = height / 2.0

    _cam_dict = {
        "id": 0,
        "model": "PINHOLE",  # PINHOLE
        "width": width,
        "height": height,
        "params": [fx, fy, cx, cy],
    }
    colmap_cameras = {0: Camera(**_cam_dict)}
    print("focal", fx, fy, cx, cy)
    return colmap_cameras


def main():
    """Render a mesh from many viewpoints and emit a COLMAP-style dataset."""
    import point_cloud_utils as pcu

    # Blender passes script arguments after a standalone "--"
    argv = sys.argv
    argv = argv[argv.index("--") + 1 :]  # get all args after "--"
    print(argv)

    inp_mesh_path = argv[0]  # input mesh path
    output_dir = argv[1]  # output dir
    # num_frames = int(argv[2])
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    img_output_dir = os.path.join(output_dir, "images")
    if not os.path.exists(img_output_dir):
        os.makedirs(img_output_dir)

    vertices, faces = pcu.load_mesh_vf(inp_mesh_path)

    # normalize: center the mesh and scale by its largest extent
    verices_center = np.mean(vertices, axis=0)
    max_range = np.max(np.max(vertices, axis=0) - np.min(vertices, axis=0))
    print(
        max_range.shape, max_range, verices_center.shape, verices_center, vertices.shape
    )
    vertices = (vertices - verices_center[np.newaxis, :]) / max_range

    # Create the 3D mesh in Blender from your data.
    obj = create_mesh_from_data(vertices, faces)
    object_center = bpy.context.scene.objects["MyMeshObject"].location

    # Number of viewpoints
    num_views = 180  # 180
    radius = 6  # Distance of the camera from the object center

    setup_light()

    # Set up rendering parameters
    bpy.context.scene.render.image_settings.file_format = "PNG"
    bpy.context.scene.render.resolution_x = 1080
    bpy.context.scene.render.resolution_y = 720

    camera = create_camera((1, 1, 1), (0, 0, 0))
    colmap_camera_dict = get_colmap_camera(
        camera,
        (bpy.context.scene.render.resolution_x, bpy.context.scene.render.resolution_y),
    )

    transform_dict = {
        "frames": [],
        "camera_angle_x": focal2fov(
            colmap_camera_dict[0].params[0], colmap_camera_dict[0].width
        ),
    }

    # sweep cameras on rings at several elevations around the object
    img_indx = 0
    num_elevations = 6
    colmap_images_dict = {}
    for j in range(num_elevations):
        num_imgs = num_views // num_elevations
        for i in range(num_imgs):
            angle = 2 * math.pi * i / num_imgs
            x = object_center.x + radius * math.cos(angle)
            y = object_center.y + radius * math.sin(angle)
            z = (
                object_center.z + (j - num_elevations / 3.0) * 4.0 / num_elevations
            )  # Adjust this if you want the camera to be above or below the object's center

            camera = create_camera((x, y, z), (0, 0, 0))
            rot_quant = set_camera_look_at(camera, object_center)
            tvec = np.array([x, y, z])

            bpy.context.view_layer.update()

            # plan-1
            # w2c = np.array(camera.matrix_world.inverted())
            # w2c[1:3, :] *= -1.0
            # rotation_matrix = w2c[:3, :3]
            # tvec = w2c[:3, 3]
            # plan-1 end

            # plan-2
            camera_to_world_matrix = camera.matrix_world  # [4, 4]
            camera_to_world_matrix = np.array(camera_to_world_matrix).copy()
            # change from OpenGL/Blender camera axes (Y up, Z back) to COLMAP (Y down, Z forward)
            camera_to_world_matrix[:3, 1:3] *= -1.0
            w2c = np.linalg.inv(camera_to_world_matrix)
            rotation_matrix = w2c[:3, :3]
            tvec = w2c[:3, 3]

            # c2w rotation
            # rotation_matrix = rot_quant.to_matrix()  # .to_4x4()
            # # w2c rotation
            # rotation_matrix = np.array(rotation_matrix)
            # # change from OpenGL/Blender camera axes (Y up, Z back) to COLMAP (Y down, Z forward)
            # rotation_matrix[:3, 1:3] *= -1.0
            # rotation_matrix = rotation_matrix.transpose()
            # tvec = (rotation_matrix @ tvec[:, np.newaxis]).squeeze(axis=-1) * -1.0

            rot_quant = Rotation.from_matrix(rotation_matrix).as_quat()
            # print("r shape", rotation_matrix.shape, tvec.shape)
            # NOTE(review): scipy returns quaternions in (x, y, z, w) order
            # while COLMAP stores (w, x, y, z) — confirm the consumer expects
            # this ordering.

            img_dict = {
                "id": img_indx,
                "qvec": rot_quant,
                "tvec": tvec,
                "camera_id": 0,
                "name": f"img_{img_indx}.png",
                "xys": [[k, k] for k in range(i, i + 10)],  # placeholder
                "point3D_ids": list(range(i, i + 10)),  # placeholder
            }
            colmap_images_dict[img_indx] = BaseImage(**img_dict)

            # also prepare transforms.json
            fname = f"images/img_{img_indx}"
            cam2world = np.array(camera.matrix_world)
            transform_dict["frames"].append(
                {"file_path": fname, "transform_matrix": cam2world.tolist()}
            )

            # render_scene(camera, os.path.join(img_output_dir, f"img_{img_indx}.png"))
            img_indx += 1

    # sample 3D points
    num_3d_points = 10000
    # sample 3D points
    sampled_points = vertices[np.random.choice(vertices.shape[0], num_3d_points), :]
    print(
        "samping {} points out of {} vertices".format(num_3d_points, vertices.shape[0])
    )
    # save into ply
    pcu.save_mesh_v(
        os.path.join(output_dir, "sampled_point_cloud.ply"),
        sampled_points,
    )

    # format into colmap points format
    colmap_points_dict = {}
    for i in range(num_3d_points):
        pnt_dict = {
            "id": i,
            "xyz": sampled_points[i, :],
            "rgb": np.array([100, 100, 100]),  # place holder , need integers
            "error": 0.0,  # place holder
            "image_ids": np.array(list(range(i, i + 10))),  # placeholder
            "point2D_idxs": np.array(list(range(i, i + 10))),  # placeholder
        }
        colmap_points_dict[i] = Point3D(**pnt_dict)

    trans_fpath = os.path.join(output_dir, "transforms_train.json")
    import json

    with open(trans_fpath, "w") as f:
        json.dump(transform_dict, f)

    output_dir = os.path.join(output_dir, "sparse/0")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Write to binary
    write_cameras_binary(colmap_camera_dict, os.path.join(output_dir, "cameras.bin"))
    write_images_binary(colmap_images_dict,
def anime_read(filename):
    """Parse a DeformingThings4D ``.anime`` binary file.

    Layout: three int32 headers (nf, nv, nt), then nv*3 float32 vertices,
    nt*3 int32 faces, and (nf-1)*nv*3 float32 per-frame offsets.

    Args:
        filename: path of the .anime file.

    Returns:
        nf: number of frames in the animation
        nv: number of vertices in the mesh (mesh topology fixed through frames)
        nt: number of triangle faces in the mesh
        vert_data: vertex data of the 1st frame (3D positions in x-y-z order), [nv, 3]
        face_data: triangle face data of the 1st frame, [nt, 3], dtype=int32
        offset_data: 3D offset data from the 2nd to the last frame, [nf-1, nv, 3]

    Raises:
        ValueError: if the offset payload does not match the header counts.
    """
    # Context manager guarantees the handle is closed even if parsing fails.
    with open(filename, "rb") as f:
        nf = np.fromfile(f, dtype=np.int32, count=1)[0]
        nv = np.fromfile(f, dtype=np.int32, count=1)[0]
        nt = np.fromfile(f, dtype=np.int32, count=1)[0]
        vert_data = np.fromfile(f, dtype=np.float32, count=nv * 3)
        face_data = np.fromfile(f, dtype=np.int32, count=nt * 3)
        offset_data = np.fromfile(f, dtype=np.float32, count=-1)
    # Check data consistency. The original `raise (msg, filename)` raised a
    # bare tuple, which is itself a TypeError in Python 3 -- raise a real
    # exception type instead.
    if len(offset_data) != (nf - 1) * nv * 3:
        raise ValueError("data inconsistent error! {}".format(filename))
    vert_data = vert_data.reshape((-1, 3))
    face_data = face_data.reshape((-1, 3))
    offset_data = offset_data.reshape((nf - 1, nv, 3))
    return nf, nv, nt, vert_data, face_data, offset_data
""" # doing fft on trajectory_array # [nf, nv, 3] trajectory_fft = np.fft.fft(trajectory_array, axis=0) # only keep topk_freq # [topk_freq, nv, 3] trajectory_fft = trajectory_fft[:topk_freq, :, :] trajectory_fft[topk_freq:-topk_freq, :, :] = 0 # doing ifft on trajectory_fft # [nf, nv, 3] trajectory_array = np.fft.ifft(trajectory_fft, axis=0).real def main(): import argparse import point_cloud_utils as pcu parser = argparse.ArgumentParser(description="None description") parser.add_argument("--input", type=str, help="input path") parser.add_argument("--output_dir", type=str, help="output path") parser.add_argument( "--skip", type=int, default=-1, help="skipping between frame saving. -1 indicates only save first frame", ) args = parser.parse_args() if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) inp_ani_path = args.input nf, nv, nt, vert_data, face_data, offset_data = anime_read(inp_ani_path) # face_data: offset_data [nf, nv, 3] # normalize verices_center = np.mean(vert_data, axis=0) max_range = np.max(np.max(vert_data, axis=0) - np.min(vert_data, axis=0)) print( max_range.shape, max_range, verices_center.shape, verices_center, vert_data.shape, ) vert_data = (vert_data - verices_center[np.newaxis, :]) / max_range offset_data = offset_data / max_range # save trajectory as numpy array # [nf, nv, 3] trajectory_array = offset_data + vert_data[None, :, :] trajectory_array = np.concatenate([vert_data[None, :, :], trajectory_array], axis=0) out_traj_path = os.path.join(args.output_dir, "trajectory.npy") print("trajectory array of shape [nf, nv, 3]. key: data", trajectory_array.shape) save_dict = { "help": "trajectory array of shape [nf, nv, 3]. 
key: data", "data": trajectory_array, } # np.savez(out_traj_path, save_dict) # np.save(out_traj_path, trajectory_array) if args.skip == -1: # save mesh as .obj out_obj_path = os.path.join(args.output_dir, "mesh0.obj") pcu.save_mesh_vf(out_obj_path, vert_data, face_data) return for i in range(nf): if i % args.skip != 0: continue out_obj_path = os.path.join(args.output_dir, "mesh{}.obj".format(i)) pcu.save_mesh_vf(out_obj_path, trajectory_array[i], face_data) if __name__ == "__main__": main() ================================================ FILE: projects/uncleaned_train/motionrep/datatools/dragon_animation.py ================================================ import bpy # Clear existing data bpy.ops.wm.read_factory_settings(use_empty=True) # 1. Import the FBX file fbx_path = "/local/cg/rundi/data/New-FBX-BVH_Z-OO/Truebone_Z-OO/Dragon/Wyvern-Fly.fbx" # fbx_path = "../../../data/motion_dataset/pirate-flag-animated/source/pirate_flag.fbx" bpy.ops.import_scene.fbx(filepath=fbx_path) # 2. Set up the camera and lighting (assuming they aren't in the FBX) # Add a camera bpy.ops.object.camera_add(location=(0, -14, 7)) camera = bpy.context.object camera.rotation_euler = (1.5708, 0, 0) # Point the camera towards the origin # Ensure the camera is in the scene and set it as the active camera if "Camera" in bpy.data.objects: bpy.context.scene.camera = camera else: print("Camera not added!") # Add a light bpy.ops.object.light_add(type="SUN", location=(15, -15, 15)) # 3. Set up the render settings bpy.context.scene.render.engine = "CYCLES" # or 'EEVEE' bpy.context.scene.render.image_settings.file_format = "FFMPEG" bpy.context.scene.render.ffmpeg.format = "MPEG4" bpy.context.scene.render.ffmpeg.codec = "H264" bpy.context.scene.render.ffmpeg.constant_rate_factor = "MEDIUM" bpy.context.scene.render.filepath = ".data/dragon/dragon.mp4" bpy.context.scene.frame_start = 1 bpy.context.scene.frame_end = 250 # Adjust this based on your needs # 4. 
def convert_obj_to_traj(meshes_dir):
    """Load every .obj in meshes_dir (sorted) into one [nframes, nv, 3] array."""
    import glob
    import numpy as np
    import point_cloud_utils as pcu

    mesh_paths = sorted(glob.glob(os.path.join(meshes_dir, "*.obj")))
    print("total of {} meshes: ".format(len(mesh_paths)), mesh_paths[:5], "....")

    # Permutation that swaps the Y and Z axes of every vertex.
    R_mat = np.array(
        [[1.0, 0, 0], [0, 0, 1.0], [0, 1.0, 0]],
    )

    per_frame_verts = []
    for mesh_path in mesh_paths:
        print(mesh_path)
        verts, faces = pcu.load_mesh_vf(mesh_path)
        # R @ v for each vertex, written as one batched product; R_mat holds
        # only zeros and ones, so the result is bit-identical to a per-vertex
        # matmul.
        per_frame_verts.append((R_mat @ verts.T).T)

    traj = np.array(per_frame_verts)
    print("final traj shape", traj.shape)

    # Persist both next to the meshes and one directory up.
    for target_dir in [meshes_dir, os.path.join(meshes_dir, "../")]:
        np.save(os.path.join(target_dir, "traj.npy"), traj)
def subdivde_mesh(mesh_directory, output_directory):
    """Subdivide every .obj in mesh_directory once and export to output_directory."""
    # Make sure the destination exists before exporting anything.
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Collect the wavefront files to process (same extension filter as before).
    obj_files = [f for f in os.listdir(mesh_directory) if f.endswith(".obj")]

    for obj_file in obj_files:
        full_path = os.path.join(mesh_directory, obj_file)

        # Drop whatever mesh was left over from the previous iteration.
        bpy.ops.object.select_all(action="DESELECT")
        bpy.ops.object.select_by_type(type="MESH")
        bpy.ops.object.delete()

        # Bring in the mesh and select everything that was imported.
        bpy.ops.import_scene.obj(filepath=full_path)
        bpy.ops.object.select_all(action="SELECT")

        # One level of subdivision, applied destructively to each mesh object.
        for scene_obj in bpy.context.selected_objects:
            if scene_obj.type != "MESH":
                continue
            print("apply subdivide to: ", scene_obj.name)
            modifier = scene_obj.modifiers.new(name="Subdivision", type="SUBSURF")
            modifier.levels = 1
            modifier.render_levels = 1
            bpy.context.view_layer.objects.active = scene_obj
            bpy.ops.object.modifier_apply(modifier=modifier.name)

        # Write the densified mesh under its original filename.
        bpy.ops.export_scene.obj(
            filepath=os.path.join(output_directory, obj_file), use_selection=True
        )

    print("Processing complete!")
def focal2fov(focal, pixels):
    """Convert a focal length (in pixels) into the full field-of-view angle (radians)."""
    half_fov = math.atan(pixels / (2 * focal))
    return 2 * half_fov
def setup_alpha_mask(obj_name, pass_index=1):
    """Route the object-index pass of `obj_name` into the render's alpha channel.

    Args:
        obj_name: name of the object in bpy.data.objects to mask on.
        pass_index: object-index value used by the compositor ID mask node.
    """
    # Tag the target object so the ID Mask node can isolate it.
    target = bpy.data.objects[obj_name]
    target.pass_index = pass_index

    # Enable the Object Index pass for the active render layer.
    bpy.context.view_layer.use_pass_object_index = True

    # Rebuild the compositor node graph from scratch.
    bpy.context.scene.use_nodes = True
    tree = bpy.context.scene.node_tree
    # Fix: snapshot the collection before removing. Removing from the live
    # collection while iterating it can silently skip nodes.
    for node in list(tree.nodes):
        tree.nodes.remove(node)

    # Add Render Layers (input), Composite (output), ID Mask and Set Alpha nodes.
    render_layers = tree.nodes.new("CompositorNodeRLayers")
    composite = tree.nodes.new("CompositorNodeComposite")
    id_mask = tree.nodes.new("CompositorNodeIDMask")
    id_mask.index = pass_index
    set_alpha = tree.nodes.new("CompositorNodeSetAlpha")

    # image -> set-alpha -> composite, with the ID mask driving the alpha input.
    tree.links.new(render_layers.outputs["Image"], set_alpha.inputs["Image"])
    tree.links.new(render_layers.outputs["IndexOB"], id_mask.inputs[0])
    tree.links.new(id_mask.outputs[0], set_alpha.inputs["Alpha"])
    tree.links.new(set_alpha.outputs["Image"], composite.inputs["Image"])
np.radians(0), np.radians(45), ) # Adjust rotation for direction # Add second directional light (Sun lamp) light_data_2 = bpy.data.lights.new(name="Directional_Light_2", type="SUN") light_data_2.energy = 5 # Adjust energy as needed light_2 = bpy.data.objects.new(name="Directional_Light_2", object_data=light_data_2) bpy.context.collection.objects.link(light_2) light_2.location = (10, -10, 10) # Adjust location as needed light_2.rotation_euler = ( np.radians(45), np.radians(180), np.radians(45), ) # Adjust rotation for direction def create_mesh_from_data(vertices, faces): # Clear existing mesh objects in the scene bpy.ops.object.select_all(action="DESELECT") bpy.ops.object.select_by_type(type="MESH") bpy.ops.object.delete() vertices_list = vertices.tolist() faces_list = faces.tolist() # Create a new mesh mesh_name = "MyMesh" mesh = bpy.data.meshes.new(name=mesh_name) obj = bpy.data.objects.new("MyMeshObject", mesh) # Link it to the scene bpy.context.collection.objects.link(obj) bpy.context.view_layer.objects.active = obj obj.select_set(True) # Load the mesh data mesh.from_pydata(vertices_list, [], faces_list) mesh.update() # mesh_data = bpy.data.meshes.new(mesh_name) # mesh_data.from_pydata(vertices_list, [], faces_list) # mesh_data.update() # the_mesh = bpy.data.objects.new(mesh_name, mesh_data) # the_mesh.data.vertex_colors.new() # init color # bpy.context.collection.objects.link(the_mesh) # UV unwrap the mesh bpy.ops.object.select_all(action="DESELECT") obj.select_set(True) bpy.context.view_layer.objects.active = obj bpy.ops.object.mode_set(mode="EDIT") bpy.ops.mesh.select_all(action="SELECT") bpy.ops.uv.smart_project() bpy.ops.object.mode_set(mode="OBJECT") # Texture the mesh based on its normals mat = bpy.data.materials.new(name="NormalMaterial") mat.use_nodes = True bsdf = mat.node_tree.nodes["Principled BSDF"] normal_node = mat.node_tree.nodes.new(type="ShaderNodeNormal") geometry = mat.node_tree.nodes.new(type="ShaderNodeNewGeometry") # 
def write_next_bytes(fid, data, format_char_sequence, endian_character="<"):
    """Pack values and write them to a binary file.

    :param fid: open binary file handle.
    :param data: data to send; if multiple elements are sent at the same
        time, they should be encapsulated in a list or a tuple.
    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
        Should be the same length as the data list or tuple.
    :param endian_character: Any of {@, =, <, >, !}.
    """
    # Renamed the local from `bytes` to `packed` so it no longer shadows
    # the builtin `bytes` type.
    if isinstance(data, (list, tuple)):
        packed = struct.pack(endian_character + format_char_sequence, *data)
    else:
        packed = struct.pack(endian_character + format_char_sequence, data)
    fid.write(packed)
def get_colmap_camera(camera_obj, render_resolution):
    """
    Extract the intrinsic matrix from a Blender camera.

    Args:
    - camera_obj: The Blender camera object.
    - render_resolution: Tuple of (width, height) indicating the render resolution.

    Returns:
    - colmap_camera: dict of ["id", "model", "width", "height", "params"]
    """
    cam = camera_obj.data
    # The pinhole intrinsics below only make sense for a perspective camera.
    if cam.type != "PERSP":
        raise ValueError("Only 'PERSP' camera type is supported.")

    width, height = render_resolution

    # Focal length in pixels: (focal mm / sensor size mm) scaled by resolution.
    fx = (cam.lens / cam.sensor_width) * width
    fy = (cam.lens / cam.sensor_height) * height

    # Principal point assumed at the image center.
    cx = width / 2.0
    cy = height / 2.0

    colmap_cameras = {
        0: Camera(
            id=0,
            model="PINHOLE",
            width=width,
            height=height,
            params=[fx, fy, cx, cy],
        )
    }
    print("focal", fx, fy, cx, cy)
    return colmap_cameras
max_range = np.max(np.max(vertices, axis=0) - np.min(vertices, axis=0)) print( max_range.shape, max_range, verices_center.shape, verices_center, vertices.shape ) vertices, faces = pcu.load_mesh_vf(inp_mesh_path) mesh_name = os.path.basename(inp_mesh_path).split(".")[0] vertices = (vertices - verices_center[np.newaxis, :]) / max_range # Create the 3D mesh in Blender from your data. obj = create_mesh_from_data(vertices, faces) object_center = bpy.context.scene.objects["MyMeshObject"].location # Number of viewpoints num_views = 180 # 180 radius = 6 # Distance of the camera from the object center setup_light() # Set up rendering parameters bpy.context.scene.render.image_settings.file_format = "PNG" bpy.context.scene.render.resolution_x = 1080 bpy.context.scene.render.resolution_y = 720 camera = create_camera((1, 1, 1), (0, 0, 0)) colmap_camera_dict = get_colmap_camera( camera, (bpy.context.scene.render.resolution_x, bpy.context.scene.render.resolution_y), ) transform_dict = { "frames": [], "camera_angle_x": focal2fov( colmap_camera_dict[0].params[0], colmap_camera_dict[0].width ), } img_indx = 0 num_elevations = 6 colmap_images_dict = {} for j in range(num_elevations): num_imgs = num_views // num_elevations for i in range(num_imgs): angle = 2 * math.pi * i / num_imgs x = object_center.x + radius * math.cos(angle) y = object_center.y + radius * math.sin(angle) z = ( object_center.z + (j - num_elevations / 3.0) * 4.0 / num_elevations ) # Adjust this if you want the camera to be above or below the object's center camera = create_camera((x, y, z), (0, 0, 0)) rot_quant = set_camera_look_at(camera, object_center) tvec = np.array([x, y, z]) bpy.context.view_layer.update() # plan-1 # w2c = np.array(camera.matrix_world.inverted()) # w2c[1:3, :] *= -1.0 # rotation_matrix = w2c[:3, :3] # tvec = w2c[:3, 3] # plan-1 end # plan-2 camera_to_world_matrix = camera.matrix_world # [4, 4] camera_to_world_matrix = np.array(camera_to_world_matrix).copy() # change from OpenGL/Blender camera 
axes (Y up, Z back) to COLMAP (Y down, Z forward) camera_to_world_matrix[:3, 1:3] *= -1.0 w2c = np.linalg.inv(camera_to_world_matrix) rotation_matrix = w2c[:3, :3] tvec = w2c[:3, 3] # c2w rotation # rotation_matrix = rot_quant.to_matrix() # .to_4x4() # # w2c rotation # rotation_matrix = np.array(rotation_matrix) # # change from OpenGL/Blender camera axes (Y up, Z back) to COLMAP (Y down, Z forward) # rotation_matrix[:3, 1:3] *= -1.0 # rotation_matrix = rotation_matrix.transpose() # tvec = (rotation_matrix @ tvec[:, np.newaxis]).squeeze(axis=-1) * -1.0 rot_quant = Rotation.from_matrix(rotation_matrix).as_quat() # print("r shape", rotation_matrix.shape, tvec.shape) img_dict = { "id": img_indx, "qvec": rot_quant, "tvec": tvec, "camera_id": 0, "name": f"img_{img_indx}.png", "xys": [[k, k] for k in range(i, i + 10)], # placeholder "point3D_ids": list(range(i, i + 10)), # placeholder } colmap_images_dict[img_indx] = BaseImage(**img_dict) # also prepare transforms.json fname = f"images/img_{img_indx}" cam2world = np.array(camera.matrix_world) transform_dict["frames"].append( {"file_path": fname, "transform_matrix": cam2world.tolist()} ) render_scene(camera, os.path.join(img_output_dir, f"img_{mesh_name}.png")) img_indx += 1 return if __name__ == "__main__": main() ================================================ FILE: projects/uncleaned_train/motionrep/datatools/render_fbx_first_frame.py ================================================ import bpy import os import numpy as np import math import sys import struct import collections from mathutils import Matrix, Quaternion, Vector from scipy.spatial.transform import Rotation def focal2fov(focal, pixels): return 2 * math.atan(pixels / (2 * focal)) def create_camera(location, rotation): # Create a new camera bpy.ops.object.camera_add(location=location, rotation=rotation) return bpy.context.active_object def set_camera_look_at(camera, target_point): # Compute the direction vector from the camera to the target point direction = 
def render_scene(camera, output_path, mask_name="U3DMesh"):
    """Render the scene through `camera` into `output_path` with an alpha mask."""
    scene = bpy.context.scene
    # Transparent film plus the object-index mask keeps the background clear.
    scene.render.film_transparent = True
    setup_alpha_mask(mask_name, 1)

    # RGBA output so the mask survives into the saved image.
    scene.render.image_settings.color_mode = "RGBA"

    # Render through the requested camera to the requested path.
    scene.camera = camera
    scene.render.filepath = output_path
    bpy.ops.render.render(write_still=True)
def setup_light():
    """Add three directional sun lamps around the scene.

    Bug fix: the third lamp object was created with ``object_data=light_data_2``,
    so its own ``light_data_3`` settings were silently unused; it now uses
    ``light_data_3``.
    """
    # First directional light (Sun lamp), front-right.
    light_data_1 = bpy.data.lights.new(name="Directional_Light_1", type="SUN")
    light_data_1.energy = 3  # Adjust energy as needed
    light_1 = bpy.data.objects.new(name="Directional_Light_1", object_data=light_data_1)
    bpy.context.collection.objects.link(light_1)
    light_1.location = (20, 20, 20)  # Adjust location as needed
    light_1.rotation_euler = (
        np.radians(45),
        np.radians(0),
        np.radians(45),
    )  # Adjust rotation for direction

    # Second directional light (Sun lamp), front-left, brighter.
    light_data_2 = bpy.data.lights.new(name="Directional_Light_2", type="SUN")
    light_data_2.energy = 5  # Adjust energy as needed
    light_2 = bpy.data.objects.new(name="Directional_Light_2", object_data=light_data_2)
    bpy.context.collection.objects.link(light_2)
    light_2.location = (20, -20, 20)  # Adjust location as needed
    light_2.rotation_euler = (
        np.radians(45),
        np.radians(180),
        np.radians(45),
    )  # Adjust rotation for direction

    # Third directional light (Sun lamp), behind the object.
    light_data_3 = bpy.data.lights.new(name="Directional_Light_3", type="SUN")
    light_data_3.energy = 3  # Adjust energy as needed
    # Fixed: was object_data=light_data_2 (copy-paste error).
    light_3 = bpy.data.objects.new(name="Directional_Light_3", object_data=light_data_3)
    bpy.context.collection.objects.link(light_3)
    light_3.location = (-20, 20, 20)  # Adjust location as needed
    light_3.rotation_euler = (
        np.radians(-135),
        np.radians(0),
        np.radians(45),
    )  # Adjust rotation for direction
bpy.ops.object.select_all(action="DESELECT") bpy.ops.object.select_by_type(type="MESH") bpy.ops.object.delete() bpy.ops.import_scene.fbx(filepath=fbx_path, use_image_search=True) # Assuming the imported object is the active object obj = bpy.context.active_object for obj in bpy.context.selected_objects: print("obj: ", obj.name, obj.type) mesh_objects = [obj for obj in bpy.context.selected_objects if obj.type == "MESH"] return None Camera = collections.namedtuple("Camera", ["id", "model", "width", "height", "params"]) def get_colmap_camera(camera_obj, render_resolution): """ Extract the intrinsic matrix from a Blender camera. Args: - camera_obj: The Blender camera object. - render_resolution: Tuple of (width, height) indicating the render resolution. Returns: - colmap_camera: dict of ["id", "model", "width", "height", "params"] """ # Get the camera data cam = camera_obj.data # Ensure it's a perspective camera if cam.type != "PERSP": raise ValueError("Only 'PERSP' camera type is supported.") # Image resolution width, height = render_resolution # Sensor width and height in millimeters sensor_width_mm = cam.sensor_width sensor_height_mm = cam.sensor_height # Calculate the focal length in pixels fx = (cam.lens / sensor_width_mm) * width fy = (cam.lens / sensor_height_mm) * height # Principal point, usually at the center of the image cx = width / 2.0 cy = height / 2.0 _cam_dict = { "id": 0, "model": "PINHOLE", # PINHOLE "width": width, "height": height, "params": [fx, fy, cx, cy], } colmap_cameras = {0: Camera(**_cam_dict)} print("focal", fx, fy, cx, cy) return colmap_cameras def get_textures( texture_dir="/local/cg/rundi/data/motion_dataset/pirate-flag-animated/source/textures", ): # Ensure the "flag" object is selected # bpy.context.view_layer.objects.active = bpy.data.objects["flag"] obj = bpy.data.objects["flag"] # Create a new material or get the existing one if not obj.data.materials: mat = bpy.data.materials.new(name="FBX_Material") obj.data.materials.append(mat) 
else: mat = obj.data.materials[0] # Use nodes for the material mat.use_nodes = True nodes = mat.node_tree.nodes # Clear default nodes for node in nodes: nodes.remove(node) # Add a Principled BSDF shader and connect it to the Material Output shader = nodes.new(type="ShaderNodeBsdfPrincipled") shader.location = (0, 0) output = nodes.new(type="ShaderNodeOutputMaterial") output.location = (400, 0) mat.node_tree.links.new(shader.outputs["BSDF"], output.inputs["Surface"]) # Load textures and create the corresponding nodes textures = { "Base Color": os.path.join(texture_dir, "pirate_flag_albedo.jpg"), "Metallic": os.path.join(texture_dir, "pirate_flag_metallic.jpg"), "Normal": os.path.join(texture_dir, "pirate_flag_normal.png"), "Roughness": os.path.join(texture_dir, "pirate_flag_roughness.jpg"), } # ... [rest of the script] ao_texture = nodes.new(type="ShaderNodeTexImage") ao_texture.location = (-400, -200) ao_texture.image = bpy.data.images.load( filepath=os.path.join(texture_dir, "pirate_flag_AO.jpg") ) # Adjust filepath if needed mix_rgb = nodes.new(type="ShaderNodeMixRGB") mix_rgb.location = (-200, 0) mix_rgb.blend_type = "MULTIPLY" mix_rgb.inputs[ 0 ].default_value = 1.0 # Factor to 1 to fully use the multiply operation mat.node_tree.links.new(ao_texture.outputs["Color"], mix_rgb.inputs[2]) for i, (input_name, filename) in enumerate(textures.items()): tex_image = nodes.new(type="ShaderNodeTexImage") tex_image.location = (-400, i * 200) tex_image.image = bpy.data.images.load( filepath=filename ) # Adjust filepath if needed if input_name == "Base Color": mat.node_tree.links.new(tex_image.outputs["Color"], mix_rgb.inputs[1]) mat.node_tree.links.new(mix_rgb.outputs["Color"], shader.inputs[input_name]) elif input_name == "Normal": normal_map_node = nodes.new(type="ShaderNodeNormalMap") normal_map_node.location = (-200, i * 200) mat.node_tree.links.new( tex_image.outputs["Color"], normal_map_node.inputs["Color"] ) mat.node_tree.links.new( 
normal_map_node.outputs["Normal"], shader.inputs["Normal"] ) else: mat.node_tree.links.new( tex_image.outputs["Color"], shader.inputs[input_name] ) def main(): import point_cloud_utils as pcu argv = sys.argv argv = argv[argv.index("--") + 1 :] # get all args after "--" print(argv) inp_fpx_path = argv[0] # input mesh path output_dir = argv[1] # output dir # num_frames = int(argv[2]) if not os.path.exists(output_dir): os.makedirs(output_dir) img_output_dir = os.path.join(output_dir, "images") if not os.path.exists(img_output_dir): os.makedirs(img_output_dir) # Create the 3D mesh in Blender from your data. no normalize obj = create_mesh_from_fpx(inp_fpx_path) my_mesh_name = "U3DMesh" # "flag" # "U3DMesh" # "flag" # "U3DMesh" for dragon # get_textures() # normalize_mesh(bpy.context.scene.objects[my_mesh_name]) object_center = bpy.context.scene.objects[my_mesh_name].location + Vector( (0, 0, 2) ) # (0, 0, 2) for dragon print("look at object center: ", object_center) # Number of viewpoints num_views = 270 # 180 # 240 for dragon radius = 20 # 16 for dragon setup_light() # Set up rendering parameters bpy.context.scene.render.image_settings.file_format = "PNG" bpy.context.scene.render.resolution_x = 1080 bpy.context.scene.render.resolution_y = 720 camera = create_camera((1, 1, 1), (0, 0, 0)) colmap_camera_dict = get_colmap_camera( camera, (bpy.context.scene.render.resolution_x, bpy.context.scene.render.resolution_y), ) transform_dict = { "frames": [], "camera_angle_x": focal2fov( colmap_camera_dict[0].params[0], colmap_camera_dict[0].width ), } img_indx = 0 num_elevations = 8 for j in range(num_elevations): num_imgs = num_views // num_elevations for i in range(num_imgs): angle = 2 * math.pi * i / num_imgs x = object_center.x + radius * math.cos(angle) y = object_center.y + radius * math.sin(angle) z = ( object_center.z + (j - num_elevations / 2.0) * (radius * 2) / num_elevations ) # Adjust this if you want the camera to be above or below the object's center camera = 
create_camera((x, y, z), (0, 0, 0)) rot_quant = set_camera_look_at(camera, object_center) tvec = np.array([x, y, z]) bpy.context.view_layer.update() # also prepare transforms.json fname = f"images/img_{img_indx}" cam2world = np.array(camera.matrix_world) transform_dict["frames"].append( {"file_path": fname, "transform_matrix": cam2world.tolist()} ) render_scene( camera, os.path.join(img_output_dir, f"img_{img_indx}.png"), my_mesh_name, ) img_indx += 1 trans_fpath = os.path.join(output_dir, "transforms_train.json") import json with open(trans_fpath, "w") as f: json.dump(transform_dict, f) transform_dict["frames"] = transform_dict["frames"][::10] trans_fpath = os.path.join(output_dir, "transforms_test.json") with open(trans_fpath, "w") as f: json.dump(transform_dict, f) def find_material(): import point_cloud_utils as pcu argv = sys.argv argv = argv[argv.index("--") + 1 :] # get all args after "--" print(argv) inp_fpx_path = argv[0] # input mesh path output_dir = argv[1] # output dir bpy.ops.import_scene.fbx(filepath=inp_fpx_path) print("inspecting materials") for material in bpy.data.materials: if material.use_nodes: for node in material.node_tree.nodes: if node.type == "TEX_IMAGE": print( f"Material: {material.name}, Image: {node.image.name}, Path: {node.image.filepath}" ) if __name__ == "__main__": main() # find_material() ================================================ FILE: projects/uncleaned_train/motionrep/datatools/render_obj.py ================================================ import bpy import os import numpy as np import math import sys import struct import collections from mathutils import Matrix, Quaternion, Vector from scipy.spatial.transform import Rotation def focal2fov(focal, pixels): return 2 * math.atan(pixels / (2 * focal)) def create_camera(location, rotation): # Create a new camera bpy.ops.object.camera_add(location=location, rotation=rotation) return bpy.context.active_object def set_camera_look_at(camera, target_point): # Compute the direction 
vector from the camera to the target point direction = target_point - camera.location # Compute the rotation matrix to align the camera's -Z axis to this direction rot_quat = direction.to_track_quat("-Z", "Y") camera.rotation_euler = rot_quat.to_euler() return rot_quat def setup_alpha_mask(obj_name, pass_index=1): # Set the object's pass index obj = bpy.data.objects[obj_name] obj.pass_index = pass_index # Enable the Object Index pass for the active render layer bpy.context.view_layer.use_pass_object_index = True # Enable 'Use Nodes': bpy.context.scene.use_nodes = True tree = bpy.context.scene.node_tree # Clear default nodes for node in tree.nodes: tree.nodes.remove(node) # Add Render Layers node render_layers = tree.nodes.new("CompositorNodeRLayers") # Add Composite node (output) composite = tree.nodes.new("CompositorNodeComposite") # Add ID Mask node id_mask = tree.nodes.new("CompositorNodeIDMask") id_mask.index = pass_index # Add Set Alpha node set_alpha = tree.nodes.new("CompositorNodeSetAlpha") # Connect nodes tree.links.new(render_layers.outputs["Image"], set_alpha.inputs["Image"]) tree.links.new(render_layers.outputs["IndexOB"], id_mask.inputs[0]) tree.links.new(id_mask.outputs[0], set_alpha.inputs["Alpha"]) tree.links.new(set_alpha.outputs["Image"], composite.inputs["Image"]) def render_scene(camera, output_path, mask_name="U3DMesh"): bpy.context.scene.render.film_transparent = True setup_alpha_mask(mask_name, 1) # Set the active camera bpy.context.scene.render.image_settings.color_mode = "RGBA" bpy.context.scene.camera = camera # Set the output path for the render bpy.context.scene.render.filepath = output_path # Render the scene bpy.ops.render.render(write_still=True) def setup_light(): # Add first directional light (Sun lamp) light_data_1 = bpy.data.lights.new(name="Directional_Light_1", type="SUN") light_data_1.energy = 3 # Adjust energy as needed light_1 = bpy.data.objects.new(name="Directional_Light_1", object_data=light_data_1) 
bpy.context.collection.objects.link(light_1) light_1.location = (20, 20, 20) # Adjust location as needed light_1.rotation_euler = ( np.radians(45), np.radians(0), np.radians(45), ) # Adjust rotation for direction # Add second directional light (Sun lamp) light_data_2 = bpy.data.lights.new(name="Directional_Light_2", type="SUN") light_data_2.energy = 5 # Adjust energy as needed light_2 = bpy.data.objects.new(name="Directional_Light_2", object_data=light_data_2) bpy.context.collection.objects.link(light_2) light_2.location = (20, -20, 20) # Adjust location as needed light_2.rotation_euler = ( np.radians(45), np.radians(180), np.radians(45), ) # Adjust rotation for direction # Add second directional light (Sun lamp) light_data_3 = bpy.data.lights.new(name="Directional_Light_3", type="SUN") light_data_3.energy = 3 # Adjust energy as needed light_3 = bpy.data.objects.new(name="Directional_Light_3", object_data=light_data_2) bpy.context.collection.objects.link(light_3) light_3.location = (-20, 20, 20) # Adjust location as needed light_3.rotation_euler = ( np.radians(-135), np.radians(0), np.radians(45), ) # Adjust rotation for direction def create_mesh_from_obj(obj_file_path): # Clear existing mesh objects in the scene bpy.ops.object.select_all(action="DESELECT") bpy.ops.object.select_by_type(type="MESH") bpy.ops.object.delete() bpy.ops.import_scene.obj(filepath=obj_file_path) # Assuming the imported object is the active object obj = bpy.context.active_object num_obj = 0 for obj in bpy.context.selected_objects: print("obj mesh name: ", obj.name, obj.type) num_obj += 1 if num_obj > 1: raise ValueError("More than one object in the scene.") mesh_objects = [obj for obj in bpy.context.selected_objects if obj.type == "MESH"] return obj.name, mesh_objects def get_focal_length(camera_obj, render_resolution): """ Extract the intrinsic matrix from a Blender camera. Args: - camera_obj: The Blender camera object. 
- render_resolution: Tuple of (width, height) indicating the render resolution. Returns: - colmap_camera: dict of ["id", "model", "width", "height", "params"] """ # Get the camera data cam = camera_obj.data # Ensure it's a perspective camera if cam.type != "PERSP": raise ValueError("Only 'PERSP' camera type is supported.") # Image resolution width, height = render_resolution # Sensor width and height in millimeters sensor_width_mm = cam.sensor_width sensor_height_mm = cam.sensor_height # Calculate the focal length in pixels fx = (cam.lens / sensor_width_mm) * width fy = (cam.lens / sensor_height_mm) * height return fx, fy def normalize_mesh(transform_meta_path, mesh_objects): import json if os.path.exists(transform_meta_path): with open(transform_meta_path, "r") as f: meta_dict = json.load(f) # obj = bpy.context.active_object for obj in mesh_objects: # Ensure the object is in object mode # bpy.ops.object.mode_set(mode="OBJECT") scale_ = 1.0 / meta_dict["scale"] center = Vector(meta_dict["center"]) # Apply the scale print("old scale: ", obj.scale) # obj.location -= center obj.scale *= scale_ def apply_rotation(mesh_objects): for obj in mesh_objects: R_np = [[1.0, 0, 0], [0, 0, 1.0], [0, 1.0, 0]] R_blender = Matrix(R_np).transposed() # Convert the rotation matrix to a quaternion quaternion = R_blender.to_quaternion() # Set the active object's rotation to this quaternion print("rotation", quaternion, obj.rotation_quaternion) obj.rotation_quaternion = obj.rotation_quaternion @ quaternion def main(): argv = sys.argv argv = argv[argv.index("--") + 1 :] # get all args after "--" print(argv) inp_fpx_path = argv[0] # input mesh path output_dir = argv[1] # output dir num_views = int(argv[2]) radius = 5 if not os.path.exists(output_dir): os.makedirs(output_dir) img_output_dir = os.path.join(output_dir, "images") if not os.path.exists(img_output_dir): os.makedirs(img_output_dir) transform_meta_path = os.path.join(os.path.dirname(inp_fpx_path), "meta.json") # Create the 3D mesh 
in Blender from your data. no normalize my_mesh_name, mesh_objects = create_mesh_from_obj(inp_fpx_path) normalize_mesh(transform_meta_path, mesh_objects) # apply_rotation(mesh_objects) object_center = Vector((0.0, 0.0, 1.0)) print("look at object center: ", object_center) setup_light() # Set up rendering parameters bpy.context.scene.render.image_settings.file_format = "PNG" # bpy.context.scene.render.resolution_x = 1080 # bpy.context.scene.render.resolution_y = 720 bpy.context.scene.render.resolution_x = 720 bpy.context.scene.render.resolution_y = 480 camera = create_camera((1, 1, 1), (0, 0, 0)) fx, fy = get_focal_length( camera, (bpy.context.scene.render.resolution_x, bpy.context.scene.render.resolution_y), ) transform_dict = { "frames": [], "camera_angle_x": focal2fov(fx, bpy.context.scene.render.resolution_x), } img_indx = 0 num_elevations = 6 # 9 for init gaussians for j in range(num_elevations): num_imgs = num_views // num_elevations for i in range(num_imgs): angle = 2 * math.pi * i / num_imgs x = object_center.x + radius * math.cos(angle) y = object_center.y + radius * math.sin(angle) z = ( object_center.z + (j - num_elevations / 2.0) * radius / num_elevations * 1.5 ) # Adjust this if you want the camera to be above or below the object's center camera = create_camera((x, y, z), (0, 0, 0)) rot_quant = set_camera_look_at(camera, object_center) bpy.context.view_layer.update() # also prepare transforms.json fname = f"images/img_{img_indx}" cam2world = np.array(camera.matrix_world) transform_dict["frames"].append( {"file_path": fname, "transform_matrix": cam2world.tolist()} ) render_scene( camera, os.path.join(img_output_dir, f"img_{img_indx}.png"), my_mesh_name, ) img_indx += 1 trans_fpath = os.path.join(output_dir, "transforms_train.json") import json with open(trans_fpath, "w") as f: json.dump(transform_dict, f) transform_dict["frames"] = transform_dict["frames"][::4] trans_fpath = os.path.join(output_dir, "transforms_test.json") with open(trans_fpath, "w") as 
f: json.dump(transform_dict, f) if __name__ == "__main__": main() # find_material() ================================================ FILE: projects/uncleaned_train/motionrep/datatools/render_obj_external_texture.py ================================================ import bpy import os import numpy as np import math import sys import struct import collections from mathutils import Matrix, Quaternion, Vector from scipy.spatial.transform import Rotation def focal2fov(focal, pixels): return 2 * math.atan(pixels / (2 * focal)) def create_camera(location, rotation): # Create a new camera bpy.ops.object.camera_add(location=location, rotation=rotation) return bpy.context.active_object def set_camera_look_at(camera, target_point): # Compute the direction vector from the camera to the target point direction = target_point - camera.location # Compute the rotation matrix to align the camera's -Z axis to this direction rot_quat = direction.to_track_quat("-Z", "Y") camera.rotation_euler = rot_quat.to_euler() return rot_quat def setup_alpha_mask(obj_name, pass_index=1): # Set the object's pass index obj = bpy.data.objects[obj_name] obj.pass_index = pass_index # Enable the Object Index pass for the active render layer bpy.context.view_layer.use_pass_object_index = True # Enable 'Use Nodes': bpy.context.scene.use_nodes = True tree = bpy.context.scene.node_tree # Clear default nodes for node in tree.nodes: tree.nodes.remove(node) # Add Render Layers node render_layers = tree.nodes.new("CompositorNodeRLayers") # Add Composite node (output) composite = tree.nodes.new("CompositorNodeComposite") # Add ID Mask node id_mask = tree.nodes.new("CompositorNodeIDMask") id_mask.index = pass_index # Add Set Alpha node set_alpha = tree.nodes.new("CompositorNodeSetAlpha") # Connect nodes tree.links.new(render_layers.outputs["Image"], set_alpha.inputs["Image"]) tree.links.new(render_layers.outputs["IndexOB"], id_mask.inputs[0]) tree.links.new(id_mask.outputs[0], set_alpha.inputs["Alpha"]) 
tree.links.new(set_alpha.outputs["Image"], composite.inputs["Image"]) def render_scene(camera, output_path, mask_name="U3DMesh"): bpy.context.scene.render.film_transparent = True setup_alpha_mask(mask_name, 1) # Set the active camera bpy.context.scene.render.image_settings.color_mode = "RGBA" bpy.context.scene.camera = camera # Set the output path for the render bpy.context.scene.render.filepath = output_path # Render the scene bpy.ops.render.render(write_still=True) def setup_light(): # Add first directional light (Sun lamp) light_data_1 = bpy.data.lights.new(name="Directional_Light_1", type="SUN") light_data_1.energy = 3 # Adjust energy as needed light_1 = bpy.data.objects.new(name="Directional_Light_1", object_data=light_data_1) bpy.context.collection.objects.link(light_1) light_1.location = (20, 20, 20) # Adjust location as needed light_1.rotation_euler = ( np.radians(45), np.radians(0), np.radians(45), ) # Adjust rotation for direction # Add second directional light (Sun lamp) light_data_2 = bpy.data.lights.new(name="Directional_Light_2", type="SUN") light_data_2.energy = 5 # Adjust energy as needed light_2 = bpy.data.objects.new(name="Directional_Light_2", object_data=light_data_2) bpy.context.collection.objects.link(light_2) light_2.location = (20, -20, 20) # Adjust location as needed light_2.rotation_euler = ( np.radians(45), np.radians(180), np.radians(45), ) # Adjust rotation for direction # Add second directional light (Sun lamp) light_data_3 = bpy.data.lights.new(name="Directional_Light_3", type="SUN") light_data_3.energy = 3 # Adjust energy as needed light_3 = bpy.data.objects.new(name="Directional_Light_3", object_data=light_data_2) bpy.context.collection.objects.link(light_3) light_3.location = (-20, 20, 20) # Adjust location as needed light_3.rotation_euler = ( np.radians(-135), np.radians(0), np.radians(45), ) # Adjust rotation for direction def create_mesh_from_obj(obj_file_path): # Clear existing mesh objects in the scene 
bpy.ops.object.select_all(action="DESELECT") bpy.ops.object.select_by_type(type="MESH") bpy.ops.object.delete() bpy.ops.import_scene.obj(filepath=obj_file_path) # Assuming the imported object is the active object obj = bpy.context.active_object num_obj = 0 for obj in bpy.context.selected_objects: print("obj mesh name: ", obj.name, obj.type) num_obj += 1 if num_obj > 2: raise ValueError("More than one object in the scene.") mesh_objects = [obj for obj in bpy.context.selected_objects if obj.type == "MESH"] return obj.name, mesh_objects def get_focal_length(camera_obj, render_resolution): """ Extract the intrinsic matrix from a Blender camera. Args: - camera_obj: The Blender camera object. - render_resolution: Tuple of (width, height) indicating the render resolution. Returns: - colmap_camera: dict of ["id", "model", "width", "height", "params"] """ # Get the camera data cam = camera_obj.data # Ensure it's a perspective camera if cam.type != "PERSP": raise ValueError("Only 'PERSP' camera type is supported.") # Image resolution width, height = render_resolution # Sensor width and height in millimeters sensor_width_mm = cam.sensor_width sensor_height_mm = cam.sensor_height # Calculate the focal length in pixels fx = (cam.lens / sensor_width_mm) * width fy = (cam.lens / sensor_height_mm) * height return fx, fy def normalize_mesh(transform_meta_path, mesh_objects): import json if os.path.exists(transform_meta_path): with open(transform_meta_path, "r") as f: meta_dict = json.load(f) # obj = bpy.context.active_object for obj in mesh_objects: # Ensure the object is in object mode # bpy.ops.object.mode_set(mode="OBJECT") scale_ = 1.0 / meta_dict["scale"] center = Vector(meta_dict["center"]) # Apply the scale print("old scale: ", obj.scale) # obj.location -= center obj.scale *= scale_ def apply_rotation(mesh_objects): for obj in mesh_objects: R_np = [[1.0, 0, 0], [0, 0, 1.0], [0, 1.0, 0]] R_blender = Matrix(R_np).transposed() # Convert the rotation matrix to a quaternion 
quaternion = R_blender.to_quaternion() # Set the active object's rotation to this quaternion print("rotation", quaternion, obj.rotation_quaternion) obj.rotation_quaternion = obj.rotation_quaternion @ quaternion def get_textures( texture_dir="/local/cg/rundi/data/motion_dataset/pirate-flag-animated/source/textures", ): # Ensure the "flag" object is selected # bpy.context.view_layer.objects.active = bpy.data.objects["flag"] obj = bpy.data.objects["flag.001_Plane.001"] # Create a new material or get the existing one if not obj.data.materials: mat = bpy.data.materials.new(name="FBX_Material") obj.data.materials.append(mat) else: mat = obj.data.materials[0] # Use nodes for the material mat.use_nodes = True nodes = mat.node_tree.nodes # Clear default nodes for node in nodes: nodes.remove(node) # Add a Principled BSDF shader and connect it to the Material Output shader = nodes.new(type="ShaderNodeBsdfPrincipled") shader.location = (0, 0) output = nodes.new(type="ShaderNodeOutputMaterial") output.location = (400, 0) mat.node_tree.links.new(shader.outputs["BSDF"], output.inputs["Surface"]) # Load textures and create the corresponding nodes textures = { "Base Color": os.path.join(texture_dir, "pirate_flag_albedo.jpg"), "Metallic": os.path.join(texture_dir, "pirate_flag_metallic.jpg"), "Normal": os.path.join(texture_dir, "pirate_flag_normal.png"), "Roughness": os.path.join(texture_dir, "pirate_flag_roughness.jpg"), } # ... 
[rest of the script] ao_texture = nodes.new(type="ShaderNodeTexImage") ao_texture.location = (-400, -200) ao_texture.image = bpy.data.images.load( filepath=os.path.join(texture_dir, "pirate_flag_AO.jpg") ) # Adjust filepath if needed mix_rgb = nodes.new(type="ShaderNodeMixRGB") mix_rgb.location = (-200, 0) mix_rgb.blend_type = "MULTIPLY" mix_rgb.inputs[ 0 ].default_value = 1.0 # Factor to 1 to fully use the multiply operation mat.node_tree.links.new(ao_texture.outputs["Color"], mix_rgb.inputs[2]) for i, (input_name, filename) in enumerate(textures.items()): tex_image = nodes.new(type="ShaderNodeTexImage") tex_image.location = (-400, i * 200) tex_image.image = bpy.data.images.load( filepath=filename ) # Adjust filepath if needed if input_name == "Base Color": mat.node_tree.links.new(tex_image.outputs["Color"], mix_rgb.inputs[1]) mat.node_tree.links.new(mix_rgb.outputs["Color"], shader.inputs[input_name]) elif input_name == "Normal": normal_map_node = nodes.new(type="ShaderNodeNormalMap") normal_map_node.location = (-200, i * 200) mat.node_tree.links.new( tex_image.outputs["Color"], normal_map_node.inputs["Color"] ) mat.node_tree.links.new( normal_map_node.outputs["Normal"], shader.inputs["Normal"] ) else: mat.node_tree.links.new( tex_image.outputs["Color"], shader.inputs[input_name] ) def main(): argv = sys.argv argv = argv[argv.index("--") + 1 :] # get all args after "--" print(argv) inp_fpx_path = argv[0] # input mesh path output_dir = argv[1] # output dir num_views = int(argv[2]) radius = 3 if not os.path.exists(output_dir): os.makedirs(output_dir) img_output_dir = os.path.join(output_dir, "images") if not os.path.exists(img_output_dir): os.makedirs(img_output_dir) transform_meta_path = os.path.join(os.path.dirname(inp_fpx_path), "meta.json") # Create the 3D mesh in Blender from your data. 
no normalize my_mesh_name, mesh_objects = create_mesh_from_obj(inp_fpx_path) normalize_mesh(transform_meta_path, mesh_objects) # apply_rotation(mesh_objects) get_textures() object_center = Vector((0.0, 0.0, 0.5)) print("look at object center: ", object_center) setup_light() # Set up rendering parameters bpy.context.scene.render.image_settings.file_format = "PNG" bpy.context.scene.render.resolution_x = 1080 bpy.context.scene.render.resolution_y = 720 # bpy.context.scene.render.resolution_x = 720 # bpy.context.scene.render.resolution_y = 480 camera = create_camera((1, 1, 1), (0, 0, 0)) fx, fy = get_focal_length( camera, (bpy.context.scene.render.resolution_x, bpy.context.scene.render.resolution_y), ) transform_dict = { "frames": [], "camera_angle_x": focal2fov(fx, bpy.context.scene.render.resolution_x), } img_indx = 0 num_elevations = 6 for j in range(num_elevations): num_imgs = num_views // num_elevations for i in range(num_imgs): angle = 2 * math.pi * i / num_imgs + math.pi / 6.0 x = object_center.x + radius * math.cos(angle) y = object_center.y + radius * math.sin(angle) z = ( object_center.z + (j - num_elevations / 2.0) * radius / num_elevations ) # Adjust this if you want the camera to be above or below the object's center camera = create_camera((x, y, z), (0, 0, 0)) rot_quant = set_camera_look_at(camera, object_center) bpy.context.view_layer.update() # also prepare transforms.json fname = f"images/img_{img_indx}" cam2world = np.array(camera.matrix_world) transform_dict["frames"].append( {"file_path": fname, "transform_matrix": cam2world.tolist()} ) render_scene( camera, os.path.join(img_output_dir, f"img_{img_indx}.png"), my_mesh_name, ) img_indx += 1 trans_fpath = os.path.join(output_dir, "transforms_train.json") import json with open(trans_fpath, "w") as f: json.dump(transform_dict, f) transform_dict["frames"] = transform_dict["frames"][::4] trans_fpath = os.path.join(output_dir, "transforms_test.json") with open(trans_fpath, "w") as f: 
json.dump(transform_dict, f) if __name__ == "__main__": main() # find_material() ================================================ FILE: projects/uncleaned_train/motionrep/datatools/test_colmap_camera.py ================================================ import numpy as np import os import sys import argparse import collections import struct from typing import NamedTuple import math import cv2 CameraModel = collections.namedtuple( "CameraModel", ["model_id", "model_name", "num_params"] ) Camera = collections.namedtuple("Camera", ["id", "model", "width", "height", "params"]) BaseImage = collections.namedtuple( "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"] ) Point3D = collections.namedtuple( "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"] ) CAMERA_MODELS = { CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3), CameraModel(model_id=1, model_name="PINHOLE", num_params=4), CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4), CameraModel(model_id=3, model_name="RADIAL", num_params=5), CameraModel(model_id=4, model_name="OPENCV", num_params=8), CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8), CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12), CameraModel(model_id=7, model_name="FOV", num_params=5), CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4), CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5), CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12), } CAMERA_MODEL_IDS = dict( [(camera_model.model_id, camera_model) for camera_model in CAMERA_MODELS] ) CAMERA_MODEL_NAMES = dict( [(camera_model.model_name, camera_model) for camera_model in CAMERA_MODELS] ) def qvec2rotmat(qvec): return np.array( [ [ 1 - 2 * qvec[2] ** 2 - 2 * qvec[3] ** 2, 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3], 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2], ], [ 2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3], 1 - 2 * qvec[1] ** 2 
- 2 * qvec[3] ** 2, 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1], ], [ 2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2], 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1], 1 - 2 * qvec[1] ** 2 - 2 * qvec[2] ** 2, ], ] ) def rotmat2qvec(R): Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat K = ( np.array( [ [Rxx - Ryy - Rzz, 0, 0, 0], [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0], [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0], [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz], ] ) / 3.0 ) eigvals, eigvecs = np.linalg.eigh(K) qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)] if qvec[0] < 0: qvec *= -1 return qvec def fov2focal(fov, pixels): return pixels / (2 * math.tan(fov / 2)) class Image(BaseImage): def qvec2rotmat(self): return qvec2rotmat(self.qvec) def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"): """Read and unpack the next bytes from a binary file. :param fid: :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc. :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}. :param endian_character: Any of {@, =, <, >, !} :return: Tuple of read and unpacked values. 
    """
    data = fid.read(num_bytes)
    return struct.unpack(endian_character + format_char_sequence, data)


def read_points3D_text(path):
    """Parse a COLMAP ``points3D.txt`` sparse-model file.

    see: src/base/reconstruction.cc
        void Reconstruction::ReadPoints3DText(const std::string& path)
        void Reconstruction::WritePoints3DText(const std::string& path)

    Returns (xyzs [N,3], rgbs [N,3], errors [N,1]) as numpy arrays.
    """
    xyzs = None
    rgbs = None
    errors = None
    # First pass: count the non-comment lines so the arrays can be
    # pre-allocated to the exact size.
    num_points = 0
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                num_points += 1

    xyzs = np.empty((num_points, 3))
    rgbs = np.empty((num_points, 3))
    errors = np.empty((num_points, 1))
    count = 0
    # Second pass: parse each point record.
    # Text layout per line: POINT3D_ID X Y Z R G B ERROR TRACK[...]
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                elems = line.split()
                xyz = np.array(tuple(map(float, elems[1:4])))
                rgb = np.array(tuple(map(int, elems[4:7])))
                error = np.array(float(elems[7]))
                xyzs[count] = xyz
                rgbs[count] = rgb
                errors[count] = error
                count += 1

    return xyzs, rgbs, errors


def read_points3D_binary(path_to_model_file):
    """Parse a COLMAP ``points3D.bin`` sparse-model file.

    see: src/base/reconstruction.cc
        void Reconstruction::ReadPoints3DBinary(const std::string& path)
        void Reconstruction::WritePoints3DBinary(const std::string& path)

    Returns (xyzs [N,3], rgbs [N,3], errors [N,1]) as numpy arrays.
    """
    with open(path_to_model_file, "rb") as fid:
        num_points = read_next_bytes(fid, 8, "Q")[0]

        xyzs = np.empty((num_points, 3))
        rgbs = np.empty((num_points, 3))
        errors = np.empty((num_points, 1))

        for p_id in range(num_points):
            # Fixed 43-byte record: uint64 id, 3x double xyz, 3x uint8 rgb,
            # double reprojection error.
            binary_point_line_properties = read_next_bytes(
                fid, num_bytes=43, format_char_sequence="QdddBBBd"
            )
            xyz = np.array(binary_point_line_properties[1:4])
            rgb = np.array(binary_point_line_properties[4:7])
            error = np.array(binary_point_line_properties[7])
            # Variable-length track: read (and discard) the observations so the
            # stream stays aligned for the next point.
            track_length = read_next_bytes(fid, num_bytes=8, format_char_sequence="Q")[
                0
            ]
            track_elems = read_next_bytes(
                fid,
                num_bytes=8 * track_length,
                format_char_sequence="ii" * track_length,
            )
            xyzs[p_id] = xyz
            rgbs[p_id] = rgb
            errors[p_id] = error
    return xyzs, rgbs, errors


def read_intrinsics_text(path):
    """Parse a COLMAP ``cameras.txt`` file into a dict of Camera records.

    Taken from
    https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py
    """
    cameras = {}
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                elems = line.split()
                camera_id = int(elems[0])
                model = elems[1]
                # Downstream FOV math assumes a PINHOLE model, so bail early.
                assert (
                    model == "PINHOLE"
                ), "While the loader support other types, the rest of the code assumes PINHOLE"
                width = int(elems[2])
                height = int(elems[3])
                params = np.array(tuple(map(float, elems[4:])))
                cameras[camera_id] = Camera(
                    id=camera_id, model=model, width=width, height=height, params=params
                )
    return cameras


def read_extrinsics_binary(path_to_model_file):
    """Parse a COLMAP ``images.bin`` file into a dict of Image records.

    see: src/base/reconstruction.cc
        void Reconstruction::ReadImagesBinary(const std::string& path)
        void Reconstruction::WriteImagesBinary(const std::string& path)
    """
    images = {}
    with open(path_to_model_file, "rb") as fid:
        num_reg_images = read_next_bytes(fid, 8, "Q")[0]
        for _ in range(num_reg_images):
            # int32 image_id, 4x double quaternion, 3x double translation,
            # int32 camera_id.
            binary_image_properties = read_next_bytes(
                fid, num_bytes=64, format_char_sequence="idddddddi"
            )
            image_id = binary_image_properties[0]
            qvec = np.array(binary_image_properties[1:5])
            tvec = np.array(binary_image_properties[5:8])
            camera_id = binary_image_properties[8]
            # Image name is a NUL-terminated byte string.
            image_name = ""
            current_char = read_next_bytes(fid, 1, "c")[0]
            while current_char != b"\x00":  # look for the ASCII 0 entry
                image_name += current_char.decode("utf-8")
                current_char = read_next_bytes(fid, 1, "c")[0]
            num_points2D = read_next_bytes(fid, num_bytes=8, format_char_sequence="Q")[
                0
            ]
            # Each 2D observation: double x, double y, int64 point3D id.
            x_y_id_s = read_next_bytes(
                fid,
                num_bytes=24 * num_points2D,
                format_char_sequence="ddq" * num_points2D,
            )
            xys = np.column_stack(
                [tuple(map(float, x_y_id_s[0::3])), tuple(map(float, x_y_id_s[1::3]))]
            )
            point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3])))
            images[image_id] = Image(
                id=image_id,
                qvec=qvec,
                tvec=tvec,
                camera_id=camera_id,
                name=image_name,
                xys=xys,
                point3D_ids=point3D_ids,
            )
    return images


def read_intrinsics_binary(path_to_model_file):
    """Parse a COLMAP ``cameras.bin`` file into a dict of Camera records.

    see: src/base/reconstruction.cc
        void Reconstruction::WriteCamerasBinary(const std::string& path)
        void Reconstruction::ReadCamerasBinary(const std::string& path)
    """
    cameras = {}
    with open(path_to_model_file, "rb") as fid:
        num_cameras = read_next_bytes(fid, 8, "Q")[0]
        for _ in range(num_cameras):
            camera_properties = read_next_bytes(
                fid, num_bytes=24, format_char_sequence="iiQQ"
            )
            camera_id = camera_properties[0]
            model_id = camera_properties[1]
            model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name
            width = camera_properties[2]
            height = camera_properties[3]
            # Parameter count depends on the camera model.
            num_params = CAMERA_MODEL_IDS[model_id].num_params
            params = read_next_bytes(
                fid, num_bytes=8 * num_params, format_char_sequence="d" * num_params
            )
            cameras[camera_id] = Camera(
                id=camera_id,
                model=model_name,
                width=width,
                height=height,
                params=np.array(params),
            )
        assert len(cameras) == num_cameras
    return cameras


def read_extrinsics_text(path):
    """Parse a COLMAP ``images.txt`` file into a dict of Image records.

    Taken from
    https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py
    """
    images = {}
    with open(path, "r") as fid:
        while True:
            line = fid.readline()
            if not line:
                break
            line = line.strip()
            if len(line) > 0 and line[0] != "#":
                elems = line.split()
                image_id = int(elems[0])
                qvec = np.array(tuple(map(float, elems[1:5])))
                tvec = np.array(tuple(map(float, elems[5:8])))
                camera_id = int(elems[8])
                image_name = elems[9]
                # The following line holds the 2D observations:
                # x, y, point3D_id triples.
                elems = fid.readline().split()
                xys = np.column_stack(
                    [tuple(map(float, elems[0::3])), tuple(map(float, elems[1::3]))]
                )
                point3D_ids = np.array(tuple(map(int, elems[2::3])))
                images[image_id] = Image(
                    id=image_id,
                    qvec=qvec,
                    tvec=tvec,
                    camera_id=camera_id,
                    name=image_name,
                    xys=xys,
                    point3D_ids=point3D_ids,
                )
    return images


def read_colmap_bin_array(path):
    """
    Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_dense.py

    :param path: path to the colmap binary file.
    :return: nd array with the floating point values in the value
    """
    with open(path, "rb") as fid:
        # Header is "width&height&channels&" in ASCII; parse it once, then
        # rewind and skip past the third '&' byte by byte.
        width, height, channels = np.genfromtxt(
            fid, delimiter="&", max_rows=1, usecols=(0, 1, 2), dtype=int
        )
        fid.seek(0)
        num_delimiter = 0
        byte = fid.read(1)
        while True:
            if byte == b"&":
                num_delimiter += 1
                if num_delimiter >= 3:
                    break
            byte = fid.read(1)
        array = np.fromfile(fid, np.float32)
    # Data is stored column-major (Fortran order); transpose to (H, W, C).
    array = array.reshape((width, height, channels), order="F")
    return np.transpose(array, (1, 0, 2)).squeeze()


class CameraInfo(NamedTuple):
    # Lightweight camera record assembled from COLMAP intrinsics/extrinsics.
    uid: int
    R: np.ndarray  # 3x3 rotation (transposed COLMAP qvec rotation)
    T: np.ndarray  # 3-vector translation
    FovY: np.ndarray
    FovX: np.ndarray
    image: np.ndarray  # unused here; always None in this script
    image_path: str
    image_name: str
    width: int
    height: int


def focal2fov(focal, pixels):
    """Convert a focal length in pixels to a full field-of-view angle (radians)."""
    return 2 * math.atan(pixels / (2 * focal))


def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder):
    """Join COLMAP extrinsics and intrinsics into a list of CameraInfo.

    Images themselves are NOT loaded (``image=None``); only paths are recorded.
    """
    cam_infos = []
    for idx, key in enumerate(cam_extrinsics):
        sys.stdout.write("\r")
        # the exact output you're looking for:
        sys.stdout.write("Reading camera {}/{}".format(idx + 1, len(cam_extrinsics)))
        sys.stdout.flush()

        extr = cam_extrinsics[key]
        intr = cam_intrinsics[extr.camera_id]
        height = intr.height
        width = intr.width

        uid = intr.id
        # COLMAP stores world-to-camera rotation; transpose gives camera-to-world.
        R = np.transpose(qvec2rotmat(extr.qvec))
        T = np.array(extr.tvec)

        if intr.model == "SIMPLE_PINHOLE":
            # Single shared focal length for both axes.
            focal_length_x = intr.params[0]
            FovY = focal2fov(focal_length_x, height)
            FovX = focal2fov(focal_length_x, width)
        elif intr.model == "PINHOLE":
            focal_length_x = intr.params[0]
            focal_length_y = intr.params[1]
            FovY = focal2fov(focal_length_y, height)
            FovX = focal2fov(focal_length_x, width)
        else:
            assert (
                False
            ), "Colmap camera model not handled: only undistorted datasets (PINHOLE or SIMPLE_PINHOLE cameras) supported!"

        image_path = os.path.join(images_folder, os.path.basename(extr.name))
        image_name = os.path.basename(image_path).split(".")[0]
        # image = Image.open(image_path)
        cam_info = CameraInfo(
            uid=uid,
            R=R,
            T=T,
            FovY=FovY,
            FovX=FovX,
            image=None,
            image_path=image_path,
            image_name=image_name,
            width=width,
            height=height,
        )
        cam_infos.append(cam_info)
    sys.stdout.write("\n")
    return cam_infos


def read_camera_points(dir_path):
    """Load cameras and the sparse point cloud from ``dir_path/sparse/0``.

    Returns (cam_infos_unsorted, xyz [N,3], rgb [N,3]).
    """
    cameras_extrinsic_file = os.path.join(dir_path, "sparse/0", "images.bin")
    cameras_intrinsic_file = os.path.join(dir_path, "sparse/0", "cameras.bin")
    bin_path = os.path.join(dir_path, "sparse/0", "points3D.bin")
    cam_extrinsics = read_extrinsics_binary(cameras_extrinsic_file)
    cam_intrinsics = read_intrinsics_binary(cameras_intrinsic_file)
    reading_dir = "images"
    cam_infos_unsorted = readColmapCameras(
        cam_extrinsics=cam_extrinsics,
        cam_intrinsics=cam_intrinsics,
        images_folder=os.path.join(dir_path, reading_dir),
    )
    xyz, rgb, _ = read_points3D_binary(bin_path)
    return cam_infos_unsorted, xyz, rgb


def extract_projection_matrix(cam_info):
    """
    Build the 3x4 projection matrix P = K [R | T] for a camera.

    Args:
        cam_info: CameraInfo
    Returns:
        P: [3, 4]
    """
    # change intrinsic to projection matrix
    fovx, fovy = cam_info.FovX, cam_info.FovY
    # Undo the transpose applied in readColmapCameras to recover the
    # world-to-camera rotation.
    R, T = np.transpose(cam_info.R), cam_info.T
    # R = np.transpose(R)
    fx, fy = fov2focal(fovx, cam_info.width), fov2focal(fovy, cam_info.height)
    # Principal point assumed at the image center — TODO confirm against data.
    K = np.array([[fx, 0, cam_info.width / 2], [0, fy, cam_info.height / 2], [0, 0, 1]])
    # K[:, 1:3] *= -1.0
    P = K @ np.hstack((R, T.reshape(3, 1)))
    # P[1:3, :] *= -1.0
    return P


def main():
    # Debug tool: reproject the COLMAP sparse point cloud into the first few
    # cameras and dump visualization images.
    parser = argparse.ArgumentParser(description="None description")
    parser.add_argument("--input", type=str, help="input dir")
    parser.add_argument("--output", type=str, help="output dir")
    args = parser.parse_args()
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    cam_infos_unsorted, xyz, rgb = read_camera_points(args.input)
    # xyz[:, 1:] *= -1.0
    # xyz *= -1.0
    print(xyz.shape)
    # sort cam_infos_unsorted by uid
    # cam_infos = sorted(cam_infos_unsorted, key=lambda x: x.uid)
    cam_infos = cam_infos_unsorted
    # NOTE(review): assumes at least 10 cameras exist — IndexError otherwise.
    for i in range(10):
        cam_info = cam_infos[i]
        print("name", cam_info.image_name)
        projeciton_matrix = extract_projection_matrix(cam_info)
        img = np.zeros((cam_info.height, cam_info.width, 3), dtype=np.uint8)
        # Homogeneous projection of every 3D point: [3,4] @ [N,4,1] -> [N,3,1].
        points2d = np.matmul(
            projeciton_matrix[np.newaxis, :, :],
            np.hstack((xyz, np.ones((xyz.shape[0], 1))))[:, :, np.newaxis],
        )
        # Perspective divide, then round to integer pixel coordinates.
        points2d = points2d[:, :2] / points2d[:, 2:]
        points2d = np.round(points2d).astype(np.int32).squeeze(axis=-1)
        # filter out points that are out of image
        valid_mask = (
            (points2d[:, 0] >= 0)
            & (points2d[:, 0] < cam_info.width)
            & (points2d[:, 1] >= 0)
            & (points2d[:, 1] < cam_info.height)
        )
        points2d = points2d[valid_mask]
        valid_rgb = rgb[valid_mask]
        # img[points2d[:, 1], points2d[:, 0]] = valid_rgb
        # draw circles
        print("num valid points: ", points2d.shape[0])
        for j in range(points2d.shape[0]):
            cv2.circle(
                img,
                (points2d[j, 0], points2d[j, 1]),
                3,
                tuple(valid_rgb[j].astype(np.int32).tolist()),
                -1,
            )
        out_img_path = os.path.join(args.output, f"img{i}.png")
        cv2.imwrite(out_img_path, img)


if __name__ == "__main__":
    main()


================================================
FILE: projects/uncleaned_train/motionrep/datatools/transform_obj_for_blender.py
================================================
import point_cloud_utils as pcu
import argparse
import os
import json
import numpy as np


def transform_vertex(vertex: np.ndarray, transform_dict):
    """Center and uniformly scale vertices into roughly [-1, 1].

    Args:
        vertex: shape [n, 3]
        transform_dict: optional dict with precomputed "center" and "scale";
            when None, both are derived from the vertices themselves.

    Returns:
        (new_vertex, center, scale)
    """
    if transform_dict is not None:
        center = np.array(transform_dict["center"])
        scale = transform_dict["scale"]
    else:
        center = np.mean(vertex, axis=0)
        # Max absolute extent after centering — normalizes the longest axis to 1.
        scale = np.max(np.abs(vertex - center))
    new_vertex = (vertex - center) / scale
    return new_vertex, center, scale


def colmap_to_blender_transform(vertex: np.ndarray):
    """Swap the Y and Z axes to go from the COLMAP frame to Blender's frame."""
    R_mat = np.array(
        [[1.0, 0, 0], [0, 0, 1.0], [0, 1.0, 0]],
    )
    vertex = R_mat[np.newaxis, :, :] @ vertex[:, :, np.newaxis]
    return vertex.squeeze(axis=-1)


def copy_mtl_file(obj_path, transformed_obj_path):
    """Copy the .mtl next to the source .obj to sit beside the transformed .obj."""
    mtl_path = obj_path.replace(".obj", ".mtl")
    dummy_mtl_path = transformed_obj_path + ".mtl"
    if os.path.exists(dummy_mtl_path):
        os.remove(dummy_mtl_path)
    if os.path.exists(mtl_path):
        os.system("cp {} {}".format(mtl_path, dummy_mtl_path))


def main():
    # Normalize an .obj (center + scale) and rotate into Blender's axes,
    # writing the result to a sibling "transformed_<name>" directory.
    parser = argparse.ArgumentParser()
    parser.add_argument("--obj_path", type=str, required=True)
    parser.add_argument("--save_transform", action="store_true", default=False)
    args = parser.parse_args()

    dir_name = os.path.dirname(args.obj_path)
    _name = os.path.basename(dir_name)
    dir_name_father = os.path.dirname(dir_name)
    transformed_dir = os.path.join(dir_name_father, "transformed_{}".format(_name))
    if not os.path.exists(transformed_dir):
        os.makedirs(transformed_dir)
    transformed_obj_path = os.path.join(
        transformed_dir, os.path.basename(args.obj_path)
    )
    if os.path.exists(transformed_obj_path):
        print("Transformed object already exists.")
        # return

    # Reuse a previously saved center/scale when meta.json is present so all
    # meshes in a sequence share one normalization.
    meta_path = os.path.join(dir_name, "meta.json")
    if os.path.exists(meta_path):
        with open(meta_path, "r") as f:
            meta_dict = json.load(f)
    else:
        print("transforming without meta.json")
        meta_dict = None

    mesh = pcu.load_triangle_mesh(args.obj_path)
    vertex = mesh.v
    vertex, center, scale = transform_vertex(vertex, meta_dict)
    vertex = colmap_to_blender_transform(vertex)

    # NOTE(review): reads mesh.v but writes mesh.vertex_data.positions —
    # presumably aliases in the pcu API; verify against point_cloud_utils docs.
    mesh.vertex_data.positions = vertex
    mesh.save(transformed_obj_path)

    copy_mtl_file(args.obj_path, transformed_obj_path)

    if args.save_transform:
        transform_dict = {"center": center.tolist(), "scale": scale}
        with open(os.path.join(dir_name, "meta.json"), "w") as f:
            json.dump(transform_dict, f)
        print("Saved transform dict to {}".format(os.path.join(dir_name, "meta.json")))


if __name__ == "__main__":
    main()


================================================
FILE: projects/uncleaned_train/motionrep/diffusion/builder.py
================================================
from . \
import gaussian_diffusion as gd from .respace import SpacedDiffusion, space_timesteps def create_gaussian_diffusion( *, steps=1000, learn_sigma=False, sigma_small=False, noise_schedule="linear", use_kl=False, predict_xstart=False, rescale_timesteps=False, rescale_learned_sigmas=False, timestep_respacing="", ): betas = gd.get_named_beta_schedule(noise_schedule, steps) if use_kl: loss_type = gd.LossType.RESCALED_KL elif rescale_learned_sigmas: loss_type = gd.LossType.RESCALED_MSE else: loss_type = gd.LossType.MSE if not timestep_respacing: timestep_respacing = [steps] return SpacedDiffusion( use_timesteps=space_timesteps(steps, timestep_respacing), betas=betas, model_mean_type=( gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X ), model_var_type=( ( gd.ModelVarType.FIXED_LARGE # used this. What is the difference? if not sigma_small else gd.ModelVarType.FIXED_SMALL ) if not learn_sigma else gd.ModelVarType.LEARNED_RANGE ), loss_type=loss_type, rescale_timesteps=rescale_timesteps, ) ================================================ FILE: projects/uncleaned_train/motionrep/diffusion/discretizer.py ================================================ import torch from sgm.modules.diffusionmodules.discretizer import Discretization class EDMResShiftedDiscretization(Discretization): def __init__( self, sigma_min=0.002, sigma_max=80.0, rho=7.0, scale_shift=1.0 ): self.sigma_min = sigma_min self.sigma_max = sigma_max self.rho = rho self.scale_shift = scale_shift def get_sigmas(self, n, device="cpu"): ramp = torch.linspace(0, 1, n, device=device) min_inv_rho = self.sigma_min ** (1 / self.rho) max_inv_rho = self.sigma_max ** (1 / self.rho) sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** self.rho sigmas = sigmas * self.scale_shift return sigmas ================================================ FILE: projects/uncleaned_train/motionrep/diffusion/draft.py ================================================ import numpy as np def latent_sds(input_x, 
schduler, unet, t_range=[0.02, 0.98]): # t_range_annel: [0.02, 0.98] => [0.50, 0.98] # input_x: # [T, 4, H, W] sigma = schduler.sample_sigma(t_range) # scalar noise = randn_like(input_x) noised_latent = input_x + sigma * noise c, uc = None # x0 prediction. denoised_latent_c, denoised_latent_uc = unet(noised_latent, c, uc) w = [1.0, 2.0, 3.0] denoised_latent = denoised_latent_uc + w * (denoised_latent_c - denoised_latent_uc) sds_grad = (input_x - denoised_latent) / sigma loss_sds = MSE(input_x - (input_x - sds_grad).detach()) ================================================ FILE: projects/uncleaned_train/motionrep/diffusion/gaussian_diffusion.py ================================================ """ This code started out as a PyTorch port of Ho et al's diffusion models: https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py Docstrings have been added, as well as DDIM sampling and a new collection of beta schedules. """ import enum import math import numpy as np import torch as th from .losses import normal_kl, discretized_gaussian_log_likelihood # from utils.triplane_util import decompose_featmaps def mean_flat(tensor): """ Take the mean over all non-batch dimensions. """ return tensor.mean(dim=list(range(1, len(tensor.shape)))) def get_named_beta_schedule(schedule_name, num_diffusion_timesteps): """ Get a pre-defined beta schedule for the given name. The beta schedule library consists of beta schedules which remain similar in the limit of num_diffusion_timesteps. Beta schedules may be added, but should not be removed or changed once they are committed to maintain backwards compatibility. """ if schedule_name == "linear": # Linear schedule from Ho et al, extended to work for any number of # diffusion steps. 
        # Scale the endpoints so any step count matches the original
        # 1000-step schedule's limits.
        scale = 1000 / num_diffusion_timesteps
        beta_start = scale * 0.0001
        beta_end = scale * 0.02
        return np.linspace(
            beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
        )
    elif schedule_name == "cosine":
        return betas_for_alpha_bar(
            num_diffusion_timesteps,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )
    else:
        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")


def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Create a beta schedule that discretizes the given alpha_t_bar function,
    which defines the cumulative product of (1-beta) over time from t = [0,1].

    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
                      produces the cumulative product of (1-beta) up to that
                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    """
    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        # beta_t = 1 - alpha_bar(t+1)/alpha_bar(t), capped at max_beta.
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return np.array(betas)


class ModelMeanType(enum.Enum):
    """
    Which type of output the model predicts.
    """

    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
    START_X = enum.auto()  # the model predicts x_0
    EPSILON = enum.auto()  # the model predicts epsilon


class ModelVarType(enum.Enum):
    """
    What is used as the model's output variance.

    The LEARNED_RANGE option has been added to allow the model to predict
    values between FIXED_SMALL and FIXED_LARGE, making its job easier.
    """

    LEARNED = enum.auto()
    FIXED_SMALL = enum.auto()
    FIXED_LARGE = enum.auto()
    LEARNED_RANGE = enum.auto()


class LossType(enum.Enum):
    MSE = enum.auto()  # use raw MSE loss (and KL when learning variances)
    RESCALED_MSE = (
        enum.auto()
    )  # use raw MSE loss (with RESCALED_KL when learning variances)
    KL = enum.auto()  # use the variational lower-bound
    RESCALED_KL = enum.auto()  # like KL, but rescale to estimate the full VLB

    def is_vb(self):
        # Both KL variants are variational-bound losses.
        return self == LossType.KL or self == LossType.RESCALED_KL


class GaussianDiffusion:
    """
    Utilities for training and sampling diffusion models.

    Ported directly from here, and then adapted over time to further experimentation.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42

    :param betas: a 1-D numpy array of betas for each diffusion timestep,
                  starting at T and going to 1.
    :param model_mean_type: a ModelMeanType determining what the model outputs.
    :param model_var_type: a ModelVarType determining how variance is output.
    :param loss_type: a LossType determining the loss function to use.
    :param rescale_timesteps: if True, pass floating point timesteps into the
                              model so that they are always scaled like in the
                              original paper (0 to 1000).
    """

    def __init__(
        self,
        *,
        betas,
        model_mean_type,
        model_var_type,
        loss_type,
        rescale_timesteps=False,
    ):
        self.model_mean_type = model_mean_type
        self.model_var_type = model_var_type
        self.loss_type = loss_type
        self.rescale_timesteps = rescale_timesteps

        # Use float64 for accuracy.
        betas = np.array(betas, dtype=np.float64)
        self.betas = betas
        assert len(betas.shape) == 1, "betas must be 1-D"
        assert (betas > 0).all() and (betas <= 1).all()

        self.num_timesteps = int(betas.shape[0])

        alphas = 1.0 - betas
        self.alphas_cumprod = np.cumprod(alphas, axis=0)
        # Shifted cumulative products: prev is padded with 1.0 at t=0,
        # next is padded with 0.0 at t=T.
        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        self.posterior_variance = (
            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        # log calculation clipped because the posterior variance is 0 at the
        # beginning of the diffusion chain.
        self.posterior_log_variance_clipped = np.log(
            np.append(self.posterior_variance[1], self.posterior_variance[1:])
        )
        self.posterior_mean_coef1 = (
            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        self.posterior_mean_coef2 = (
            (1.0 - self.alphas_cumprod_prev)
            * np.sqrt(alphas)
            / (1.0 - self.alphas_cumprod)
        )

    def q_mean_variance(self, x_start, t):
        """
        Get the distribution q(x_t | x_0).

        :param x_start: the [N x C x ...] tensor of noiseless inputs.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
        """
        mean = (
            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
        )
        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
        log_variance = _extract_into_tensor(
            self.log_one_minus_alphas_cumprod, t, x_start.shape
        )
        return mean, variance, log_variance

    def q_sample(self, x_start, t, noise=None):
        """
        Diffuse the data for a given number of diffusion steps.

        In other words, sample from q(x_t | x_0).

        :param x_start: the initial data batch.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :param noise: if specified, the split-out normal noise.
        :return: A noisy version of x_start.
        """
        if noise is None:
            noise = th.randn_like(x_start)
        assert noise.shape == x_start.shape
        # x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise
        return (
            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
            * noise
        )

    def q_posterior_mean_variance(self, x_start, x_t, t):
        """
        Compute the mean and variance of the diffusion posterior:

            q(x_{t-1} | x_t, x_0)
        """
        assert x_start.shape == x_t.shape
        posterior_mean = (
            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
        posterior_log_variance_clipped = _extract_into_tensor(
            self.posterior_log_variance_clipped, t, x_t.shape
        )
        assert (
            posterior_mean.shape[0]
            == posterior_variance.shape[0]
            == posterior_log_variance_clipped.shape[0]
            == x_start.shape[0]
        )
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def p_mean_variance(
        self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None
    ):
        """
        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
        the initial x, x_0.

        :param model: the model, which takes a signal and a batch of timesteps
                      as input.
        :param x: the [N x C x ...] tensor at time t.
        :param t: a 1-D Tensor of timesteps.
        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample. Applies before
            clip_denoised.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict with the following keys:
                 - 'mean': the model mean output.
                 - 'variance': the model variance output.
                 - 'log_variance': the log of 'variance'.
                 - 'pred_xstart': the prediction for x_0.
        """
        if model_kwargs is None:
            model_kwargs = {}

        B, C = x.shape[:2]
        assert t.shape == (B,)
        model_output = model(x, self._scale_timesteps(t), **model_kwargs)

        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
            # Model emits 2*C channels: mean prediction + variance values.
            assert model_output.shape == (B, C * 2, *x.shape[2:])
            model_output, model_var_values = th.split(model_output, C, dim=1)
            if self.model_var_type == ModelVarType.LEARNED:
                model_log_variance = model_var_values
                model_variance = th.exp(model_log_variance)
            else:
                min_log = _extract_into_tensor(
                    self.posterior_log_variance_clipped, t, x.shape
                )
                max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
                # The model_var_values is [-1, 1] for [min_var, max_var].
                frac = (model_var_values + 1) / 2
                model_log_variance = frac * max_log + (1 - frac) * min_log
                model_variance = th.exp(model_log_variance)
        else:
            model_variance, model_log_variance = {
                # for fixedlarge, we set the initial (log-)variance like so
                # to get a better decoder log likelihood.
                ModelVarType.FIXED_LARGE: (
                    np.append(self.posterior_variance[1], self.betas[1:]),
                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
                ),
                ModelVarType.FIXED_SMALL: (
                    self.posterior_variance,
                    self.posterior_log_variance_clipped,
                ),
            }[self.model_var_type]
            model_variance = _extract_into_tensor(model_variance, t, x.shape)
            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)

        def process_xstart(x):
            # Optional user hook, then optional clamping of the x_0 estimate.
            if denoised_fn is not None:
                x = denoised_fn(x)
            if clip_denoised:
                return x.clamp(-1, 1)
            return x

        if self.model_mean_type == ModelMeanType.PREVIOUS_X:
            pred_xstart = process_xstart(
                self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)
            )
            model_mean = model_output
        elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]:
            if self.model_mean_type == ModelMeanType.START_X:
                pred_xstart = process_xstart(model_output)
            else:
                pred_xstart = process_xstart(
                    self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
                )
            model_mean, _, _ = self.q_posterior_mean_variance(
                x_start=pred_xstart, x_t=x, t=t
            )
        else:
            raise NotImplementedError(self.model_mean_type)

        assert (
            model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
        )
        return {
            "mean": model_mean,
            "variance": model_variance,
            "log_variance": model_log_variance,
            "pred_xstart": pred_xstart,
        }

    def _predict_xstart_from_eps(self, x_t, t, eps):
        # x_0 = sqrt(1/alpha_bar_t) * x_t - sqrt(1/alpha_bar_t - 1) * eps
        assert x_t.shape == eps.shape
        return (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
            - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
            * eps
        )

    def _predict_xstart_from_xprev(self, x_t, t, xprev):
        assert x_t.shape == xprev.shape
        return (  # (xprev - coef2*x_t) / coef1
            _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev
            - _extract_into_tensor(
                self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape
            )
            * x_t
        )

    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
        # Inverse of _predict_xstart_from_eps: recover eps from (x_t, x_0).
        return (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
            -
            pred_xstart
        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)

    def _scale_timesteps(self, t):
        # Optionally rescale integer timesteps to the original 0..1000 range.
        if self.rescale_timesteps:
            return t.float() * (1000.0 / self.num_timesteps)
        return t

    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute the mean for the previous step, given a function cond_fn that
        computes the gradient of a conditional log probability with respect to
        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
        condition on y.

        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
        """
        gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs)
        new_mean = (
            p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
        )
        return new_mean

    def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute what the p_mean_variance output would have been, should the
        model's score function be conditioned by cond_fn.

        See condition_mean() for details on cond_fn.

        Unlike condition_mean(), this instead uses the conditioning strategy
        from Song et al (2020).
        """
        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)

        # Shift eps by the (scaled) conditioning gradient, then re-derive
        # the x_0 prediction and posterior mean from the new eps.
        eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
        eps = eps - (1 - alpha_bar).sqrt() * cond_fn(
            x, self._scale_timesteps(t), **model_kwargs
        )

        out = p_mean_var.copy()
        out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
        out["mean"], _, _ = self.q_posterior_mean_variance(
            x_start=out["pred_xstart"], x_t=x, t=t
        )
        return out

    def p_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
    ):
        """
        Sample x_{t-1} from the model at the given timestep.

        :param model: the model to sample from.
        :param x: the current tensor at x_{t-1}.
        :param t: the value of t, starting at 0 for the first diffusion step.
        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict containing the following keys:
                 - 'sample': a random sample from the model.
                 - 'pred_xstart': a prediction of x_0.
        """
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        noise = th.randn_like(x)
        nonzero_mask = (
            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
        )  # no noise when t == 0
        if cond_fn is not None:
            out["mean"] = self.condition_mean(
                cond_fn, out, x, t, model_kwargs=model_kwargs
            )
        sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
        return {"sample": sample, "pred_xstart": out["pred_xstart"]}

    def p_sample_loop(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
    ):
        """
        Generate samples from the model.

        :param model: the model module.
        :param shape: the shape of the samples, (N, C, H, W).
        :param noise: if specified, the noise from the encoder to sample.
                      Should be of the same shape as `shape`.
        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param device: if specified, the device to create the samples on.
                       If not specified, use a model parameter's device.
        :param progress: if True, show a tqdm progress bar.
        :return: a non-differentiable batch of samples.
        """
        # Drive the progressive generator to completion; only the last
        # (t == 0) sample is returned.
        final = None
        for sample in self.p_sample_loop_progressive(
            model,
            shape,
            noise=noise,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            cond_fn=cond_fn,
            model_kwargs=model_kwargs,
            device=device,
            progress=progress,
        ):
            final = sample
        return final["sample"]

    def p_sample_loop_progressive(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
    ):
        """
        Generate samples from the model and yield intermediate samples from
        each timestep of diffusion.

        Arguments are the same as p_sample_loop().
        Returns a generator over dicts, where each dict is the return value of
        p_sample().
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))
        if noise is not None:
            img = noise
        else:
            img = th.randn(*shape, device=device)
        # Iterate timesteps from T-1 down to 0.
        indices = list(range(self.num_timesteps))[::-1]

        if progress:
            # Lazy import so that we don't depend on tqdm.
            from tqdm.auto import tqdm

            indices = tqdm(indices)

        for i in indices:
            t = th.tensor([i] * shape[0], device=device)
            with th.no_grad():
                out = self.p_sample(
                    model,
                    img,
                    t,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                )
                yield out
                img = out["sample"]

    def ddim_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        eta=0.0,
        y0=None,
        mask=None,
        is_mask_t0=False,
    ):
        """
        Sample x_{t-1} from the model using DDIM.

        Same usage as p_sample().
""" out = self.p_mean_variance( model, x, t, clip_denoised=clip_denoised, denoised_fn=denoised_fn, model_kwargs=model_kwargs, ) if cond_fn is not None: out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs) # masked generation if y0 is not None and mask is not None: assert y0.shape == x.shape assert mask.shape == x.shape if is_mask_t0: out["pred_xstart"] = mask * y0 + (1 - mask) * out["pred_xstart"] else: nonzero_mask = ( (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) ) # no noise when t == 0 out["pred_xstart"] = ( mask * y0 + (1 - mask) * out["pred_xstart"] ) * nonzero_mask + out["pred_xstart"] * (1 - nonzero_mask) # Usually our model outputs epsilon, but we re-derive it # in case we used x_start or x_prev prediction. eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) sigma = ( eta * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * th.sqrt(1 - alpha_bar / alpha_bar_prev) ) # Equation 12. noise = th.randn_like(x) mean_pred = ( out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev - sigma**2) * eps ) nonzero_mask = ( (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) ) # no noise when t == 0 sample = mean_pred + nonzero_mask * sigma * noise return {"sample": sample, "pred_xstart": out["pred_xstart"]} def ddim_reverse_sample( self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None, eta=0.0, ): """ Sample x_{t+1} from the model using DDIM reverse ODE. """ assert eta == 0.0, "Reverse ODE only for deterministic path" out = self.p_mean_variance( model, x, t, clip_denoised=clip_denoised, denoised_fn=denoised_fn, model_kwargs=model_kwargs, ) # Usually our model outputs epsilon, but we re-derive it # in case we used x_start or x_prev prediction. 
        # Usually our model outputs epsilon directly, but it may also be
        # parameterized to predict x_start or x_prev, so epsilon is re-derived
        # here from pred_xstart by inverting the forward-process equation.
        eps = (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
            - out["pred_xstart"]
        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
        alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)

        # Equation 12. reversed
        mean_pred = (
            out["pred_xstart"] * th.sqrt(alpha_bar_next)
            + th.sqrt(1 - alpha_bar_next) * eps
        )

        return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}

    def ddim_sample_loop(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        eta=0.0,
        y0=None,
        mask=None,
        is_mask_t0=False,
    ):
        """
        Generate samples from the model using DDIM.

        Same usage as p_sample_loop().  ``y0``/``mask``/``is_mask_t0`` enable
        masked (inpainting-style) generation inside ddim_sample; ``eta=0``
        gives the deterministic DDIM sampler.
        """
        # Run the full progressive sampler and keep only the final sample.
        final = None
        for sample in self.ddim_sample_loop_progressive(
            model,
            shape,
            noise=noise,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            cond_fn=cond_fn,
            model_kwargs=model_kwargs,
            device=device,
            progress=progress,
            eta=eta,
            y0=y0,
            mask=mask,
            is_mask_t0=is_mask_t0,
        ):
            final = sample
        return final["sample"]

    def ddim_sample_loop_progressive(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        eta=0.0,
        y0=None,
        mask=None,
        is_mask_t0=False,
    ):
        """
        Use DDIM to sample from the model and yield intermediate samples from
        each timestep of DDIM.

        Same usage as p_sample_loop_progressive().
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))
        # Start from caller-provided noise if given, else pure Gaussian noise.
        if noise is not None:
            img = noise
        else:
            img = th.randn(*shape, device=device)
        # Walk timesteps from T-1 down to 0.
        indices = list(range(self.num_timesteps))[::-1]

        if progress:
            # Lazy import so that we don't depend on tqdm.
            from tqdm.auto import tqdm

            indices = tqdm(indices)

        for i in indices:
            t = th.tensor([i] * shape[0], device=device)
            with th.no_grad():
                out = self.ddim_sample(
                    model,
                    img,
                    t,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                    eta=eta,
                    y0=y0,
                    mask=mask,
                    is_mask_t0=is_mask_t0,
                )
                yield out
                img = out["sample"]

    def _vb_terms_bpd(
        self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
    ):
        """
        Get a term for the variational lower-bound.

        The resulting units are bits (rather than nats, as one might expect).
        This allows for comparison to other papers.

        :return: a dict with the following keys:
                 - 'output': a shape [N] tensor of NLLs or KLs.
                 - 'pred_xstart': the x_0 predictions.
        """
        true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
            x_start=x_start, x_t=x_t, t=t
        )
        out = self.p_mean_variance(
            model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
        )
        # KL between the true posterior q(x_{t-1}|x_t, x_0) and the model's
        # p(x_{t-1}|x_t), converted from nats to bits via /log(2).
        kl = normal_kl(
            true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
        )
        kl = mean_flat(kl) / np.log(2.0)

        # Discretized decoder NLL, only meaningful at t == 0.
        decoder_nll = -discretized_gaussian_log_likelihood(
            x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
        )
        assert decoder_nll.shape == x_start.shape
        decoder_nll = mean_flat(decoder_nll) / np.log(2.0)

        # At the first timestep return the decoder NLL,
        # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
        output = th.where((t == 0), decoder_nll, kl)
        return {"output": output, "pred_xstart": out["pred_xstart"]}

    def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
        """
        Compute training losses for a single timestep.

        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param t: a batch of timestep indices.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param noise: if specified, the specific Gaussian noise to try to remove.
        :return: a dict with the key "loss" containing a tensor of shape [N].
                 Some mean or variance settings may also have other keys.
        """
        if model_kwargs is None:
            model_kwargs = {}
        if noise is None:
            noise = th.randn_like(x_start)
        x_t = self.q_sample(x_start, t, noise=noise)  # sample

        terms = {}

        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
            raise NotImplementedError
            # NOTE(review): everything below this raise in the KL branch is
            # unreachable — the KL loss path is intentionally disabled here.
            terms["loss"] = self._vb_terms_bpd(
                model=model,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
                model_kwargs=model_kwargs,
            )["output"]
            if self.loss_type == LossType.RESCALED_KL:
                terms["loss"] *= self.num_timesteps
        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
            model_output = model(x_t, self._scale_timesteps(t), **model_kwargs)

            if self.model_var_type in [
                ModelVarType.LEARNED,
                ModelVarType.LEARNED_RANGE,
            ]:
                B, C = x_t.shape[:2]
                assert model_output.shape == (
                    B,
                    C * 2,
                    *x_t.shape[2:],
                )  # why the output channel is doubled? mean and var?
                model_output, model_var_values = th.split(model_output, C, dim=1)
                # Learn the variance using the variational bound, but don't let
                # it affect our mean prediction.
                frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
                terms["vb"] = self._vb_terms_bpd(
                    model=lambda *args, r=frozen_out: r,
                    x_start=x_start,
                    x_t=x_t,
                    t=t,
                    clip_denoised=False,
                )["output"]
                if self.loss_type == LossType.RESCALED_MSE:
                    # Divide by 1000 for equivalence with initial implementation.
                    # Without a factor of 1/1000, the VB term hurts the MSE term.
                    terms["vb"] *= self.num_timesteps / 1000.0

            # Regression target depends on the model's mean parameterization.
            target = {
                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
                    x_start=x_start, x_t=x_t, t=t
                )[0],
                ModelMeanType.START_X: x_start,
                ModelMeanType.EPSILON: noise,
            }[self.model_mean_type]
            assert model_output.shape == target.shape == x_start.shape

            # Split the last dimension into three equal chunks and report a
            # separate MSE per chunk.  Assumes the last dim is divisible by 3;
            # presumably these are x/y/z trajectory components — TODO confirm.
            target_tx, target_ty, target_tz = th.split(
                target, target.shape[-1] // 3, dim=-1
            )
            output_tx, output_ty, output_tz = th.split(
                model_output, model_output.shape[-1] // 3, dim=-1
            )
            terms["mse_tx"] = mean_flat((target_tx - output_tx) ** 2)
            terms["mse_ty"] = mean_flat((target_ty - output_ty) ** 2)
            terms["mse_tz"] = mean_flat((target_tz - output_tz) ** 2)

            # terms["mse"] = mean_flat((target - model_output) ** 2)
            if "vb" in terms:
                terms["loss"] = (
                    terms["mse_tx"] + terms["mse_ty"] + terms["mse_tz"] + terms["vb"]
                )
                # terms["loss"] = terms["mse"] + terms["vb"]
            else:
                terms["loss"] = terms["mse_tx"] + terms["mse_ty"] + terms["mse_tz"]
                # terms["loss"] = terms["mse"]
        else:
            raise NotImplementedError(self.loss_type)

        return terms

    def _prior_bpd(self, x_start):
        """
        Get the prior KL term for the variational lower-bound, measured in
        bits-per-dim.

        This term can't be optimized, as it only depends on the encoder.

        :param x_start: the [N x C x ...] tensor of inputs.
        :return: a batch of [N] KL values (in bits), one per batch element.
        """
        batch_size = x_start.shape[0]
        # KL between q(x_T | x_0) and the standard-normal prior.
        t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
        kl_prior = normal_kl(
            mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
        )
        return mean_flat(kl_prior) / np.log(2.0)

    def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
        """
        Compute the entire variational lower-bound, measured in bits-per-dim,
        as well as other related quantities.

        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param clip_denoised: if True, clip denoised samples.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.

        :return: a dict containing the following keys:
                 - total_bpd: the total variational lower-bound, per batch element.
                 - prior_bpd: the prior term in the lower-bound.
                 - vb: an [N x T] tensor of terms in the lower-bound.
                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.
        """
        device = x_start.device
        batch_size = x_start.shape[0]

        vb = []
        xstart_mse = []
        mse = []
        # Evaluate every timestep from T-1 down to 0 (one model call each).
        for t in list(range(self.num_timesteps))[::-1]:
            t_batch = th.tensor([t] * batch_size, device=device)
            noise = th.randn_like(x_start)
            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
            # Calculate VLB term at the current timestep
            with th.no_grad():
                out = self._vb_terms_bpd(
                    model,
                    x_start=x_start,
                    x_t=x_t,
                    t=t_batch,
                    clip_denoised=clip_denoised,
                    model_kwargs=model_kwargs,
                )
            vb.append(out["output"])
            xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
            eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
            mse.append(mean_flat((eps - noise) ** 2))

        vb = th.stack(vb, dim=1)
        xstart_mse = th.stack(xstart_mse, dim=1)
        mse = th.stack(mse, dim=1)

        prior_bpd = self._prior_bpd(x_start)
        total_bpd = vb.sum(dim=1) + prior_bpd
        return {
            "total_bpd": total_bpd,
            "prior_bpd": prior_bpd,
            "vb": vb,
            "xstart_mse": xstart_mse,
            "mse": mse,
        }


def _extract_into_tensor(arr, timesteps, broadcast_shape):
    """
    Extract values from a 1-D numpy array for a batch of indices.

    :param arr: the 1-D numpy array.
    :param timesteps: a tensor of indices into the array to extract.
    :param broadcast_shape: a larger shape of K dimensions with the batch
                            dimension equal to the length of timesteps.
    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
""" res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() while len(res.shape) < len(broadcast_shape): res = res[..., None] return res.expand(broadcast_shape) ================================================ FILE: projects/uncleaned_train/motionrep/diffusion/losses.py ================================================ """ Helpers for various likelihood-based losses. These are ported from the original Ho et al. diffusion models codebase: https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py """ import numpy as np import torch as th def normal_kl(mean1, logvar1, mean2, logvar2): """ Compute the KL divergence between two gaussians. Shapes are automatically broadcasted, so batches can be compared to scalars, among other use cases. """ tensor = None for obj in (mean1, logvar1, mean2, logvar2): if isinstance(obj, th.Tensor): tensor = obj break assert tensor is not None, "at least one argument must be a Tensor" # Force variances to be Tensors. Broadcasting helps convert scalars to # Tensors, but it does not work for th.exp(). logvar1, logvar2 = [ x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) for x in (logvar1, logvar2) ] return 0.5 * ( -1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * th.exp(-logvar2) ) def approx_standard_normal_cdf(x): """ A fast approximation of the cumulative distribution function of the standard normal. """ return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) def discretized_gaussian_log_likelihood(x, *, means, log_scales): """ Compute the log-likelihood of a Gaussian distribution discretizing to a given image. :param x: the target images. It is assumed that this was uint8 values, rescaled to the range [-1, 1]. :param means: the Gaussian mean Tensor. :param log_scales: the Gaussian log stddev Tensor. :return: a tensor like x of log probabilities (in nats). 
""" assert x.shape == means.shape == log_scales.shape centered_x = x - means inv_stdv = th.exp(-log_scales) plus_in = inv_stdv * (centered_x + 1.0 / 255.0) cdf_plus = approx_standard_normal_cdf(plus_in) min_in = inv_stdv * (centered_x - 1.0 / 255.0) cdf_min = approx_standard_normal_cdf(min_in) log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) cdf_delta = cdf_plus - cdf_min log_probs = th.where( x < -0.999, log_cdf_plus, th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), ) assert log_probs.shape == x.shape return log_probs ================================================ FILE: projects/uncleaned_train/motionrep/diffusion/resample.py ================================================ """ Code borrowed from https://github.com/Sin3DM/Sin3DM/blob/9c3ac12a655157469c71632346ebf569354ae7f6/src/diffusion/resample.py """ from abc import ABC, abstractmethod import numpy as np import torch as th import torch.distributed as dist def create_named_schedule_sampler(name, diffusion): """ Create a ScheduleSampler from a library of pre-defined samplers. :param name: the name of the sampler. :param diffusion: the diffusion object to sample for. """ if name == "uniform": return UniformSampler(diffusion) elif name == "loss-second-moment": return LossSecondMomentResampler(diffusion) else: raise NotImplementedError(f"unknown schedule sampler: {name}") class ScheduleSampler(ABC): """ A distribution over timesteps in the diffusion process, intended to reduce variance of the objective. By default, samplers perform unbiased importance sampling, in which the objective's mean is unchanged. However, subclasses may override sample() to change how the resampled terms are reweighted, allowing for actual changes in the objective. """ @abstractmethod def weights(self): """ Get a numpy array of weights, one per diffusion step. The weights needn't be normalized, but must be positive. 
""" def sample(self, batch_size, device): """ Importance-sample timesteps for a batch. :param batch_size: the number of timesteps. :param device: the torch device to save to. :return: a tuple (timesteps, weights): - timesteps: a tensor of timestep indices. - weights: a tensor of weights to scale the resulting losses. """ w = self.weights() p = w / np.sum(w) indices_np = np.random.choice(len(p), size=(batch_size,), p=p) indices = th.from_numpy(indices_np).long().to(device) weights_np = 1 / (len(p) * p[indices_np]) weights = th.from_numpy(weights_np).float().to(device) return indices, weights class UniformSampler(ScheduleSampler): def __init__(self, diffusion): self.diffusion = diffusion self._weights = np.ones([diffusion.num_timesteps]) def weights(self): return self._weights class LossAwareSampler(ScheduleSampler): def update_with_local_losses(self, local_ts, local_losses): """ Update the reweighting using losses from a model. Call this method from each rank with a batch of timesteps and the corresponding losses for each of those timesteps. This method will perform synchronization to make sure all of the ranks maintain the exact same reweighting. :param local_ts: an integer Tensor of timesteps. :param local_losses: a 1D Tensor of losses. """ batch_sizes = [ th.tensor([0], dtype=th.int32, device=local_ts.device) for _ in range(dist.get_world_size()) ] dist.all_gather( batch_sizes, th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device), ) # Pad all_gather batches to be the maximum batch size. 
batch_sizes = [x.item() for x in batch_sizes] max_bs = max(batch_sizes) timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes] loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes] dist.all_gather(timestep_batches, local_ts) dist.all_gather(loss_batches, local_losses) timesteps = [ x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs] ] losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]] self.update_with_all_losses(timesteps, losses) @abstractmethod def update_with_all_losses(self, ts, losses): """ Update the reweighting using losses from a model. Sub-classes should override this method to update the reweighting using losses from the model. This method directly updates the reweighting without synchronizing between workers. It is called by update_with_local_losses from all ranks with identical arguments. Thus, it should have deterministic behavior to maintain state across workers. :param ts: a list of int timesteps. :param losses: a list of float losses, one per timestep. """ class LossSecondMomentResampler(LossAwareSampler): def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001): self.diffusion = diffusion self.history_per_term = history_per_term self.uniform_prob = uniform_prob self._loss_history = np.zeros( [diffusion.num_timesteps, history_per_term], dtype=np.float64 ) self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int) def weights(self): if not self._warmed_up(): return np.ones([self.diffusion.num_timesteps], dtype=np.float64) weights = np.sqrt(np.mean(self._loss_history**2, axis=-1)) weights /= np.sum(weights) weights *= 1 - self.uniform_prob weights += self.uniform_prob / len(weights) return weights def update_with_all_losses(self, ts, losses): for t, loss in zip(ts, losses): if self._loss_counts[t] == self.history_per_term: # Shift out the oldest loss term. 
self._loss_history[t, :-1] = self._loss_history[t, 1:] self._loss_history[t, -1] = loss else: self._loss_history[t, self._loss_counts[t]] = loss self._loss_counts[t] += 1 def _warmed_up(self): return (self._loss_counts == self.history_per_term).all() ================================================ FILE: projects/uncleaned_train/motionrep/diffusion/respace.py ================================================ import numpy as np import torch as th from .gaussian_diffusion import GaussianDiffusion def space_timesteps(num_timesteps, section_counts): """ Create a list of timesteps to use from an original diffusion process, given the number of timesteps we want to take from equally-sized portions of the original process. For example, if there's 300 timesteps and the section counts are [10,15,20] then the first 100 timesteps are strided to be 10 timesteps, the second 100 are strided to be 15 timesteps, and the final 100 are strided to be 20. If the stride is a string starting with "ddim", then the fixed striding from the DDIM paper is used, and only one section is allowed. :param num_timesteps: the number of diffusion steps in the original process to divide up. :param section_counts: either a list of numbers, or a string containing comma-separated numbers, indicating the step count per section. As a special case, use "ddimN" where N is a number of steps to use the striding from the DDIM paper. :return: a set of diffusion steps from the original process to use. 
""" if isinstance(section_counts, str): if section_counts.startswith("ddim"): desired_count = int(section_counts[len("ddim") :]) for i in range(1, num_timesteps): if len(range(0, num_timesteps, i)) == desired_count: return set(range(0, num_timesteps, i)) raise ValueError( f"cannot create exactly {num_timesteps} steps with an integer stride" ) section_counts = [int(x) for x in section_counts.split(",")] size_per = num_timesteps // len(section_counts) extra = num_timesteps % len(section_counts) start_idx = 0 all_steps = [] for i, section_count in enumerate(section_counts): size = size_per + (1 if i < extra else 0) if size < section_count: raise ValueError( f"cannot divide section of {size} steps into {section_count}" ) if section_count <= 1: frac_stride = 1 else: frac_stride = (size - 1) / (section_count - 1) cur_idx = 0.0 taken_steps = [] for _ in range(section_count): taken_steps.append(start_idx + round(cur_idx)) cur_idx += frac_stride all_steps += taken_steps start_idx += size return set(all_steps) class SpacedDiffusion(GaussianDiffusion): """ A diffusion process which can skip steps in a base diffusion process. :param use_timesteps: a collection (sequence or set) of timesteps from the original diffusion process to retain. :param kwargs: the kwargs to create the base diffusion process. 
    """

    def __init__(self, use_timesteps, **kwargs):
        self.use_timesteps = set(use_timesteps)
        self.timestep_map = []
        self.original_num_steps = len(kwargs["betas"])

        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
        # Recompute betas so that the retained subset of timesteps produces the
        # same cumulative alphas as the original schedule at those steps.
        last_alpha_cumprod = 1.0
        new_betas = []
        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
            if i in self.use_timesteps:
                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
                last_alpha_cumprod = alpha_cumprod
                self.timestep_map.append(i)
        kwargs["betas"] = np.array(new_betas)
        super().__init__(**kwargs)

    def p_mean_variance(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        # Wrap the model so respaced timesteps map back to original ones.
        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)

    def training_losses(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        return super().training_losses(self._wrap_model(model), *args, **kwargs)

    def condition_mean(self, cond_fn, *args, **kwargs):
        return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)

    def condition_score(self, cond_fn, *args, **kwargs):
        return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)

    def _wrap_model(self, model):
        # Idempotent: an already-wrapped model is returned unchanged.
        if isinstance(model, _WrappedModel):
            return model
        return _WrappedModel(
            model, self.timestep_map, self.rescale_timesteps, self.original_num_steps
        )

    def _scale_timesteps(self, t):
        # Scaling is done by the wrapped model.
        return t


class _WrappedModel:
    """Adapter that translates respaced timestep indices back to the original
    schedule (and optionally rescales them to [0, 1000)) before calling the
    underlying model."""

    def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps):
        self.model = model
        self.timestep_map = timestep_map
        self.rescale_timesteps = rescale_timesteps
        self.original_num_steps = original_num_steps

    def __call__(self, x, ts, **kwargs):
        # ts indexes the respaced schedule; map it to original timesteps.
        map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
        new_ts = map_tensor[ts]
        if self.rescale_timesteps:
            new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
        return self.model(x, new_ts, **kwargs)


================================================
FILE: projects/uncleaned_train/motionrep/diffusion/sigma_sampling.py
================================================
import torch

from inspect import isfunction

# import sgm


def exists(x):
    """Return True if x is not None."""
    return x is not None


def default(val, d):
    """Return val if it is not None, else d (calling d if it is a function)."""
    if exists(val):
        return val
    return d() if isfunction(d) else d


class EDMSamplingWithResShift:
    """EDM-style log-normal sigma sampler with a multiplicative resolution
    shift applied to the sampled sigmas."""

    def __init__(self, p_mean=-1.2, p_std=1.2, scale_shift=320.0 / 576):
        self.p_mean = p_mean
        self.p_std = p_std
        self.scale_shift = scale_shift

    def __call__(self, n_samples, rand=None):
        # sigma = exp(N(p_mean, p_std^2)) * scale_shift; `rand` lets the caller
        # supply the standard-normal draw for reproducibility.
        log_sigma = self.p_mean + self.p_std * default(rand, torch.randn((n_samples,)))
        sigma = log_sigma.exp() * self.scale_shift
        return sigma


================================================
FILE: projects/uncleaned_train/motionrep/diffusion/sv_diffusion_engine.py
================================================
import math
from contextlib import contextmanager
from typing import Any, Dict, List, Optional, Tuple, Union

import pytorch_lightning as pl
import torch
from omegaconf import ListConfig, OmegaConf
from safetensors.torch import load_file as load_safetensors
from torch.optim.lr_scheduler import LambdaLR

from sgm.modules import UNCONDITIONAL_CONFIG
from sgm.modules.autoencoding.temporal_ae import VideoDecoder
from sgm.modules.diffusionmodules.wrappers import OPENAIUNETWRAPPER
from sgm.modules.ema import LitEma
from sgm.util import (
    default,
    disabled_train,
    get_obj_from_str,
    instantiate_from_config,
    log_txt_as_img,
)
class SVDiffusionEngine(pl.LightningModule):
    """
    stable video diffusion engine

    Lightning module wiring together a denoising network, denoiser wrapper,
    conditioner, first-stage autoencoder, optional sampler/loss/scheduler, and
    optional EMA weights — all instantiated from OmegaConf-style configs.
    """

    def __init__(
        self,
        network_config,
        denoiser_config,
        first_stage_config,
        conditioner_config: Union[None, Dict, ListConfig, OmegaConf] = None,
        sampler_config: Union[None, Dict, ListConfig, OmegaConf] = None,
        optimizer_config: Union[None, Dict, ListConfig, OmegaConf] = None,
        scheduler_config: Union[None, Dict, ListConfig, OmegaConf] = None,
        loss_fn_config: Union[None, Dict, ListConfig, OmegaConf] = None,
        network_wrapper: Union[None, str] = None,
        ckpt_path: Union[None, str] = None,
        use_ema: bool = False,
        ema_decay_rate: float = 0.9999,
        scale_factor: float = 1.0,
        disable_first_stage_autocast=False,
        input_key: str = "jpg",
        log_keys: Union[List, None] = None,
        no_cond_log: bool = False,
        compile_model: bool = False,
        en_and_decode_n_samples_a_time: Optional[int] = None,
    ):
        super().__init__()
        self.log_keys = log_keys
        self.input_key = input_key
        # Defaults to AdamW when no optimizer config is provided.
        self.optimizer_config = default(
            optimizer_config, {"target": "torch.optim.AdamW"}
        )
        model = instantiate_from_config(network_config)
        self.model = get_obj_from_str(default(network_wrapper, OPENAIUNETWRAPPER))(
            model, compile_model=compile_model
        )

        # TODO
        # add lora to the model if lora input
        # change forward
        # print(self.model)
        # NOTE(review): this loop has no effect — left over from module
        # inspection during development.
        for name, child in self.model.named_modules():
            # print(name, "named child")
            pass

        self.denoiser = instantiate_from_config(denoiser_config)
        self.sampler = (
            instantiate_from_config(sampler_config)
            if sampler_config is not None
            else None
        )
        self.conditioner = instantiate_from_config(
            default(conditioner_config, UNCONDITIONAL_CONFIG)
        )
        self.scheduler_config = scheduler_config
        self._init_first_stage(first_stage_config)

        self.loss_fn = (
            instantiate_from_config(loss_fn_config)
            if loss_fn_config is not None
            else None
        )

        self.use_ema = use_ema
        if self.use_ema:
            self.model_ema = LitEma(self.model, decay=ema_decay_rate)
            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

        self.scale_factor = scale_factor
        self.disable_first_stage_autocast = disable_first_stage_autocast
        self.no_cond_log = no_cond_log

        # Load weights last so they land on the fully constructed module tree.
        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path)

        self.en_and_decode_n_samples_a_time = en_and_decode_n_samples_a_time

    def init_from_ckpt(
        self,
        path: str,
    ) -> None:
        """Load a checkpoint (.ckpt / .safetensors / .bin) non-strictly and
        report missing/unexpected keys."""
        print("init svd engine from", path)
        if path.endswith("ckpt"):
            sd = torch.load(path, map_location="cpu")["state_dict"]
        elif path.endswith("safetensors"):
            sd = load_safetensors(path)
        elif path.endswith("bin"):
            sd = torch.load(path, map_location="cpu")
        else:
            raise NotImplementedError

        missing, unexpected = self.load_state_dict(sd, strict=False)
        print(
            f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys"
        )
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
        if len(unexpected) > 0:
            print(f"Unexpected Keys: {unexpected}")

    def _init_first_stage(self, config):
        # The first-stage autoencoder is frozen: eval mode, train() disabled,
        # and all parameters excluded from optimization.
        model = instantiate_from_config(config).eval()
        model.train = disabled_train
        for param in model.parameters():
            param.requires_grad = False
        self.first_stage_model = model

    def get_input(self, batch):
        # assuming unified data format, dataloader returns a dict.
        # image tensors should be scaled to -1 ... 1 and in bchw format
        return batch[self.input_key]

    @torch.no_grad()
    def decode_first_stage(self, z):
        """Decode latents to pixel space in chunks of
        en_and_decode_n_samples_a_time to bound peak memory."""
        z = 1.0 / self.scale_factor * z
        n_samples = default(self.en_and_decode_n_samples_a_time, z.shape[0])

        n_rounds = math.ceil(z.shape[0] / n_samples)
        all_out = []
        with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
            for n in range(n_rounds):
                # VideoDecoder needs to know how many frames are in the chunk.
                if isinstance(self.first_stage_model.decoder, VideoDecoder):
                    kwargs = {"timesteps": len(z[n * n_samples : (n + 1) * n_samples])}
                else:
                    kwargs = {}
                out = self.first_stage_model.decode(
                    z[n * n_samples : (n + 1) * n_samples], **kwargs
                )
                all_out.append(out)
        out = torch.cat(all_out, dim=0)
        return out

    @torch.no_grad()
    def encode_first_stage(self, x):
        """Encode pixels to scaled latents, chunked like decode_first_stage."""
        n_samples = default(self.en_and_decode_n_samples_a_time, x.shape[0])
        n_rounds = math.ceil(x.shape[0] / n_samples)
        all_out = []
        with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast):
            for n in range(n_rounds):
                out = self.first_stage_model.encode(
                    x[n * n_samples : (n + 1) * n_samples]
                )
                all_out.append(out)
        z = torch.cat(all_out, dim=0)
        z = self.scale_factor * z
        return z

    def forward(self, batch, training=True):
        assert training, "DiffusionEngine forward function is only for training."
x = self.get_input(batch) x = self.encode_first_stage(x) batch["global_step"] = self.global_step x.requires_grad = True loss = self.loss_fn(self.model, self.denoiser, self.conditioner, x, batch) loss_mean = loss.mean() loss_dict = {"loss": loss_mean} return loss_mean, loss_dict def shared_step(self, batch: Dict) -> Any: x = self.get_input(batch) x = self.encode_first_stage(x) batch["global_step"] = self.global_step loss, loss_dict = self(x, batch) return loss, loss_dict def training_step(self, batch, batch_idx): loss, loss_dict = self.shared_step(batch) self.log_dict( loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=False ) self.log( "global_step", self.global_step, prog_bar=True, logger=True, on_step=True, on_epoch=False, ) if self.scheduler_config is not None: lr = self.optimizers().param_groups[0]["lr"] self.log( "lr_abs", lr, prog_bar=True, logger=True, on_step=True, on_epoch=False ) return loss def on_train_start(self, *args, **kwargs): if self.sampler is None or self.loss_fn is None: raise ValueError("Sampler and loss function need to be set for training.") def on_train_batch_end(self, *args, **kwargs): if self.use_ema: self.model_ema(self.model) @contextmanager def ema_scope(self, context=None): if self.use_ema: self.model_ema.store(self.model.parameters()) self.model_ema.copy_to(self.model) if context is not None: print(f"{context}: Switched to EMA weights") try: yield None finally: if self.use_ema: self.model_ema.restore(self.model.parameters()) if context is not None: print(f"{context}: Restored training weights") def instantiate_optimizer_from_config(self, params, lr, cfg): return get_obj_from_str(cfg["target"])( params, lr=lr, **cfg.get("params", dict()) ) def configure_optimizers(self): lr = self.learning_rate params = list(self.model.parameters()) for embedder in self.conditioner.embedders: if embedder.is_trainable: params = params + list(embedder.parameters()) opt = self.instantiate_optimizer_from_config(params, lr, 
            self.optimizer_config)
        if self.scheduler_config is not None:
            scheduler = instantiate_from_config(self.scheduler_config)
            print("Setting up LambdaLR scheduler...")
            # Lightning expects (optimizers, schedulers); the schedule is
            # stepped every optimizer step ("interval": "step").
            scheduler = [
                {
                    "scheduler": LambdaLR(opt, lr_lambda=scheduler.schedule),
                    "interval": "step",
                    "frequency": 1,
                }
            ]
            return [opt], scheduler
        return opt

    def get_trainable_parameters(self):
        params = list(self.model.parameters())
        embedder_params = []
        for embedder in self.conditioner.embedders:
            if embedder.is_trainable:
                embedder_params = embedder_params + list(embedder.parameters())
        # NOTE(review): len() here counts parameter *tensors*, not elements.
        print(
            "number of trainable parameters: {} - from embeder: {} ".format(
                len(params), len(embedder_params)
            )
        )
        params = params + embedder_params
        return params

    @torch.no_grad()
    def sample(
        self,
        cond: Dict,
        uc: Union[Dict, None] = None,
        batch_size: int = 16,
        shape: Union[None, Tuple, List] = None,
        **kwargs,
    ):
        """Draw samples from pure noise with the configured sampler; `uc` is
        the unconditional conditioning for classifier-free guidance."""
        randn = torch.randn(batch_size, *shape).to(self.device)

        denoiser = lambda input, sigma, c: self.denoiser(
            self.model, input, sigma, c, **kwargs
        )
        samples = self.sampler(denoiser, randn, cond, uc=uc)
        return samples

    @torch.no_grad()
    def log_conditionings(self, batch: Dict, n: int) -> Dict:
        """
        Defines heuristics to log different conditionings.
        These can be lists of strings (text-to-image), tensors, ints, ...
        """
        image_h, image_w = batch[self.input_key].shape[2:]
        log = dict()

        for embedder in self.conditioner.embedders:
            if (
                (self.log_keys is None) or (embedder.input_key in self.log_keys)
            ) and not self.no_cond_log:
                x = batch[embedder.input_key][:n]
                if isinstance(x, torch.Tensor):
                    if x.dim() == 1:
                        # class-conditional, convert integer to string
                        x = [str(x[i].item()) for i in range(x.shape[0])]
                        xc = log_txt_as_img((image_h, image_w), x, size=image_h // 4)
                    elif x.dim() == 2:
                        # size and crop cond and the like
                        x = [
                            "x".join([str(xx) for xx in x[i].tolist()])
                            for i in range(x.shape[0])
                        ]
                        xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
                    else:
                        raise NotImplementedError()
                elif isinstance(x, (List, ListConfig)):
                    if isinstance(x[0], str):
                        # strings
                        xc = log_txt_as_img((image_h, image_w), x, size=image_h // 20)
                    else:
                        raise NotImplementedError()
                else:
                    raise NotImplementedError()
                log[embedder.input_key] = xc
        return log

    @torch.no_grad()
    def log_images(
        self,
        batch: Dict,
        N: int = 8,
        sample: bool = True,
        ucg_keys: List[str] = None,
        **kwargs,
    ) -> Dict:
        """Log inputs, autoencoder reconstructions, conditionings, and (with
        EMA weights active) model samples for up to N batch elements."""
        conditioner_input_keys = [e.input_key for e in self.conditioner.embedders]
        if ucg_keys:
            assert all(map(lambda x: x in conditioner_input_keys, ucg_keys)), (
                "Each defined ucg key for sampling must be in the provided conditioner input keys,"
                f"but we have {ucg_keys} vs. {conditioner_input_keys}"
            )
        else:
            ucg_keys = conditioner_input_keys
        log = dict()

        x = self.get_input(batch)

        c, uc = self.conditioner.get_unconditional_conditioning(
            batch,
            force_uc_zero_embeddings=ucg_keys
            if len(self.conditioner.embedders) > 0
            else [],
        )

        sampling_kwargs = {}

        N = min(x.shape[0], N)
        x = x.to(self.device)[:N]
        log["inputs"] = x
        z = self.encode_first_stage(x)
        log["reconstructions"] = self.decode_first_stage(z)
        log.update(self.log_conditionings(batch, N))

        # Truncate tensor conditionings to the logged batch size.
        for k in c:
            if isinstance(c[k], torch.Tensor):
                c[k], uc[k] = map(lambda y: y[k][:N].to(self.device), (c, uc))

        if sample:
            with self.ema_scope("Plotting"):
                samples = self.sample(
                    c, shape=z.shape[1:], uc=uc, batch_size=N, **sampling_kwargs
                )
            samples = self.decode_first_stage(samples)
            log["samples"] = samples
        return log


================================================
FILE: projects/uncleaned_train/motionrep/diffusion/svd_conditioner.py
================================================
"""
Modified from https://github.com/Stability-AI/generative-models/blob/main/sgm/modules/encoders/modules.py
"""
import math
from contextlib import nullcontext
from functools import partial
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn
from einops import rearrange, repeat
import numpy as np

from omegaconf import ListConfig
from sgm.util import append_dims, instantiate_from_config

from sgm.modules.encoders.modules import GeneralConditioner
import random


class SVDConditioner(GeneralConditioner):
    """GeneralConditioner variant that, during training, jointly drops the
    image conditionings ("cond_frames_without_noise" and "cond_frames") with
    the embedder's ucg_rate instead of dropping each independently."""

    # Embedding tensor rank -> output dict key.
    OUTPUT_DIM2KEYS = {2: "vector", 3: "crossattn", 4: "concat", 5: "concat"}
    # Output key -> dimension along which multiple embeddings are concatenated.
    KEY2CATDIM = {"vector": 1, "crossattn": 2, "concat": 1}

    def __init__(self, emb_models: Union[List, ListConfig]):
        super().__init__(emb_models)

    def forward(
        self, batch: Dict, force_zero_embeddings: Optional[List] = None
    ) -> Dict:
        output = dict()
        if force_zero_embeddings is None:
            force_zero_embeddings = []

        if self.training:
            # Use the ucg_rate configured on the image-conditioning embedder
            # to decide (once per forward) whether to zero out both image
            # conditionings together.
            img_ucg_rate = 0
            for embedder in self.embedders:
                if embedder.input_key == "cond_frames_without_noise":
                    img_ucg_rate = embedder.ucg_rate
                    break
            if img_ucg_rate > 0:
                if random.random() < img_ucg_rate:
                    force_zero_embeddings.append("cond_frames_without_noise")
                    force_zero_embeddings.append("cond_frames")

        for embedder in self.embedders:
            # Frozen embedders run under no_grad; trainable ones keep grads.
            embedding_context = nullcontext if embedder.is_trainable else torch.no_grad
            with embedding_context():
                if hasattr(embedder, "input_key") and (embedder.input_key is not None):
                    if embedder.legacy_ucg_val is not None:
                        batch = self.possibly_get_ucg_val(embedder, batch)
                    emb_out = embedder(batch[embedder.input_key])
                elif hasattr(embedder, "input_keys"):
                    emb_out = embedder(*[batch[k] for k in embedder.input_keys])
            assert isinstance(
                emb_out, (torch.Tensor, list, tuple)
            ), f"encoder outputs must be tensors or a sequence, but got {type(emb_out)}"
            if not isinstance(emb_out, (list, tuple)):
                emb_out = [emb_out]
            for emb in emb_out:
                out_key = self.OUTPUT_DIM2KEYS[emb.dim()]
                if (
                    hasattr(embedder, "input_key")
                    and embedder.input_key in force_zero_embeddings
                ):
                    # Classifier-free-guidance dropout: zero the embedding.
                    emb = torch.zeros_like(emb)
                if out_key in output:
                    output[out_key] = torch.cat(
                        (output[out_key], emb), self.KEY2CATDIM[out_key]
                    )
                else:
                    output[out_key] = emb
        return output


================================================
FILE: projects/uncleaned_train/motionrep/diffusion/svd_sds_engine.py
================================================
import math
from contextlib import contextmanager
from typing import Any, Dict, List, Optional, Tuple, Union

import pytorch_lightning as pl
import torch
from omegaconf import ListConfig, OmegaConf
from safetensors.torch import load_file as load_safetensors

from sgm.modules import UNCONDITIONAL_CONFIG
from sgm.modules.diffusionmodules.wrappers import OPENAIUNETWRAPPER
from sgm.util import (
    default,
    disabled_train,
    get_obj_from_str,
    instantiate_from_config,
    append_dims,
)

from motionrep.utils.svd_helpper import (
    get_batch,
    get_unique_embedder_keys_from_conditioner,
)
from einops import rearrange, repeat
import torch.nn.functional as F
import numpy as np class SVDSDSEngine(pl.LightningModule): """ stable video diffusion engine """ def __init__( self, network_config, denoiser_config, first_stage_config, conditioner_config: Union[None, Dict, ListConfig, OmegaConf] = None, sampler_config: Union[None, Dict, ListConfig, OmegaConf] = None, loss_fn_config: Union[None, Dict, ListConfig, OmegaConf] = None, discretization_config: Union[None, Dict, ListConfig, OmegaConf] = None, # Added network_wrapper: Union[None, str] = None, ckpt_path: Union[None, str] = None, scale_factor: float = 1.0, disable_first_stage_autocast=False, input_key: str = "jpg", compile_model: bool = False, en_and_decode_n_samples_a_time: Optional[int] = None, ): super().__init__() self.input_key = input_key model = instantiate_from_config(network_config) self.model = get_obj_from_str(default(network_wrapper, OPENAIUNETWRAPPER))( model, compile_model=compile_model ) self.model.eval() self.denoiser = instantiate_from_config(denoiser_config) assert self.denoiser is not None, "need denoiser" self.sampler = ( instantiate_from_config(sampler_config) if sampler_config is not None else None ) self.conditioner = instantiate_from_config( default(conditioner_config, UNCONDITIONAL_CONFIG) ) self._init_first_stage(first_stage_config) self.loss_fn = ( instantiate_from_config(loss_fn_config) if loss_fn_config is not None else None ) self.scale_factor = scale_factor self.disable_first_stage_autocast = disable_first_stage_autocast if ckpt_path is not None: self.init_from_ckpt(ckpt_path) self.en_and_decode_n_samples_a_time = en_and_decode_n_samples_a_time assert discretization_config is not None, "need discretizer" self.discretizer = instantiate_from_config(discretization_config) # [1000] sigmas_all = self.discretizer.get_sigmas(1000) self.register_buffer("sigmas_all", sigmas_all) def init_from_ckpt( self, path: str, ) -> None: print("init svd engine from", path) if path.endswith("ckpt"): sd = torch.load(path, map_location="cpu")["state_dict"] elif 
path.endswith("safetensors"): sd = load_safetensors(path) elif path.endswith("bin"): sd = torch.load(path, map_location="cpu") else: raise NotImplementedError missing, unexpected = self.load_state_dict(sd, strict=False) print( f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" ) if len(missing) > 0: print(f"Missing Keys: {missing}") if len(unexpected) > 0: # print(f"Unexpected Keys: {unexpected}") pass def _init_first_stage(self, config): model = instantiate_from_config(config).eval() model.train = disabled_train for param in model.parameters(): param.requires_grad = False self.first_stage_model = model del self.first_stage_model.decoder self.first_stage_model.decoder = None def get_input(self, batch): # assuming unified data format, dataloader returns a dict. # image tensors should be scaled to -1 ... 1 and in bchw format return batch[self.input_key] def encode_first_stage(self, x): n_samples = default(self.en_and_decode_n_samples_a_time, x.shape[0]) n_rounds = math.ceil(x.shape[0] / n_samples) all_out = [] with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast): for n in range(n_rounds): out = self.first_stage_model.encode( x[n * n_samples : (n + 1) * n_samples] ) all_out.append(out) z = torch.cat(all_out, dim=0) z = self.scale_factor * z return z def forward(self, batch, sample_time_range=[0.02, 0.98]): """ Args: batch["jpg"]: [BT, 3, H, W]. Videos range in [-1, 1]? TODO Dec 16. Check batch["cond_image"]: [B, 3, H, W]. in [-1, 1]? 
TODO: check shape """ x = self.get_input(batch) # [BT, 3, H, W] T = batch["num_video_frames"] batch_size = x.shape[0] // T z = self.encode_first_stage(x) # [BT, C, H_latent, W_latent] batch["global_step"] = self.global_step with torch.no_grad(): sds_grad = self.edm_sds(z, batch, sample_time_range) target = (z - sds_grad).detach() loss_sds = 0.5 * F.mse_loss(z, target, reduction="sum") / batch_size log_loss_dict = { "loss_sds_video": loss_sds.item(), "sds_delta_norm": sds_grad.norm().item(), } return loss_sds, log_loss_dict def forward_with_encoder_chunk( self, batch, chunk_size=2, sample_time_range=[0.02, 0.98] ): with torch.no_grad(): x = self.get_input(batch) # [BT, 3, H, W] T = batch["num_video_frames"] batch_size = x.shape[0] // T z = self.encode_first_stage(x) # [BT, C, H_latent, W_latent] batch["global_step"] = self.global_step sds_grad, denoised_latent = self.edm_sds(z, batch, sample_time_range) num_chunks = math.ceil(z.shape[0] / chunk_size) for n in range(num_chunks): end_ind = min((n + 1) * chunk_size, z.shape[0]) x_chunk = x[n * chunk_size : end_ind] z_chunk_recompute = self.encode_first_stage(x_chunk) target_chunk = ( z_chunk_recompute - sds_grad[n * chunk_size : end_ind] ).detach() this_chunk_size = x_chunk.shape[0] assert this_chunk_size > 0 # loss_sds_chunk = ( # 0.5 # * F.mse_loss(z_chunk_recompute, target_chunk, reduction="mean") # * this_chunk_size # / z.shape[0] # / batch_size # ) loss_sds_chunk = 0.5 * F.mse_loss(z_chunk_recompute, target_chunk, reduction="sum") / batch_size loss_sds_chunk.backward() with torch.no_grad(): target = (z - sds_grad).detach() loss_sds = 0.5 * F.mse_loss(z, target, reduction="sum") / batch_size log_loss_dict = { "latent_loss_sds": loss_sds.item(), "latent_sds_norm": sds_grad.norm().item(), "latent_sds_max": sds_grad.max().item(), "latent_sds_mean": sds_grad.mean().item(), } video_space_sds_grad = x.grad return video_space_sds_grad, log_loss_dict, denoised_latent @torch.no_grad() def edm_sds(self, input_x, extra_input, 
sample_time_range=[0.02, 0.98]): """ Args: input_x: [BT, C, H, W] in latent extra_input: dict "fps_id": [B] "motion_bucket_id": [B] "cond_aug": [B] "cond_frames_without_noise": [B, C, H, W] "cond_frames": [B, C, H, W] sample_time_range: [t_min, t_max] """ # step-1: prepare inputs num_frames = extra_input["num_video_frames"] batch_size = input_x.shape[0] // num_frames device = input_x.device # video = video.contiguous() extra_input["num_video_frames"] = num_frames # prepare c and uc batch, batch_uc = get_batch( get_unique_embedder_keys_from_conditioner(self.conditioner), extra_input, [1, num_frames], T=num_frames, device=device, ) # keys would be be ['crossattn', 'vector', 'concat'] c, uc = self.conditioner.get_unconditional_conditioning( batch, batch_uc=batch_uc, force_uc_zero_embeddings=[ "cond_frames", "cond_frames_without_noise", ], ) for k in ["crossattn", "concat"]: uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames) uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames) c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames) c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames) # after this should be # crossattn [14, 1, 1024]; vector [14, 768]; concat [14, 4, 72, 128] additional_model_inputs = {} additional_model_inputs["image_only_indicator"] = torch.zeros( int(2 * batch_size), num_frames ).to(device) additional_model_inputs["num_video_frames"] = batch["num_video_frames"] # step-2: sample t and sigmas, then noise sampled_t = np.random.randint( low=int(sample_time_range[0] * self.sigmas_all.shape[0]), high=int(sample_time_range[1] * self.sigmas_all.shape[0]), size=(batch_size), ).tolist() # list of index time t [B] sigmas = self.sigmas_all[sampled_t] # sigmas = self.loss_fn.sigma_sampler(batch_size).to(input_x) sigmas = repeat(sigmas, "b ... -> b t ...", t=num_frames) sigmas = rearrange(sigmas, "b t ... 
-> (b t) ...", t=num_frames) noise = torch.randn_like(input_x) # [BT, C, H, W] sigmas_bc = append_dims(sigmas, input_x.ndim) # [14, 1, 1, 1] noised_input = self.loss_fn.get_noised_input( sigmas_bc, noise, input_x ) # [BT, C, H, W] # step-3: prepare conditional and unconditional inputs # [2BT, C, H, W], [2BT] bathced_xt, bathced_sigmas, bathched_c = self.sampler.guider.prepare_inputs( noised_input, sigmas, c, uc ) # bathched_c["crossattn"] => [2BT, 1, C] ; bathched_c["concat"] => [2BT, C, H, W]; bathched_c["vector"] => [2BT, C_feat] # output shape [2BT, C, H, W] denoised = self.denoiser( self.model, bathced_xt, bathced_sigmas, bathched_c, **additional_model_inputs, ) # step-4: cfg guidance and compute sds_grad # [BT, C, H, W] denoised = self.sampler.guider(denoised, bathced_sigmas) sds_grad = (input_x - denoised) / sigmas_bc return sds_grad, denoised @torch.no_grad() def edm_sds_multistep(self, input_x, extra_input, sample_time_range=[0.02, 0.84], num_step=4, total_steps=25): """ From t = 20 sample to t = 980. Args: input_x: [BT, C, H, W] in latent extra_input: dict "fps_id": [B] "motion_bucket_id": [B] "cond_aug": [B] "cond_frames_without_noise": [B, C, H, W] "cond_frames": [B, C, H, W] sample_time_range: [t_min, t_max] """ # step-1: prepare inputs num_frames = extra_input["num_video_frames"] batch_size = input_x.shape[0] // num_frames device = input_x.device # video = video.contiguous() extra_input["num_video_frames"] = num_frames # prepare c and uc batch, batch_uc = get_batch( get_unique_embedder_keys_from_conditioner(self.conditioner), extra_input, [1, num_frames], T=num_frames, device=device, ) # keys would be be ['crossattn', 'vector', 'concat'] c, uc = self.conditioner.get_unconditional_conditioning( batch, batch_uc=batch_uc, force_uc_zero_embeddings=[ "cond_frames", "cond_frames_without_noise", ], ) for k in ["crossattn", "concat"]: uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames) uc[k] = rearrange(uc[k], "b t ... 
-> (b t) ...", t=num_frames) c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames) c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames) # after this should be # crossattn [14, 1, 1024]; vector [14, 768]; concat [14, 4, 72, 128] additional_model_inputs = {} additional_model_inputs["image_only_indicator"] = torch.zeros( int(2 * batch_size), num_frames ).to(device) additional_model_inputs["num_video_frames"] = batch["num_video_frames"] # step-2: sample t and sigmas, then noise sampled_t = np.random.randint( low=int(sample_time_range[0] * self.sigmas_all.shape[0]), high=int(sample_time_range[1] * self.sigmas_all.shape[0]), size=(batch_size), ) # np.array of index time t [B] step_stride = len(self.sigmas_all) // total_steps sigma_sum = 0.0 for i in range(num_step): sampled_t += step_stride * i sampled_t = np.clip(sampled_t, 0, len(self.sigmas_all) - 2) # [B] sigmas = self.sigmas_all[sampled_t] # sigmas = self.loss_fn.sigma_sampler(batch_size).to(input_x) sigmas = repeat(sigmas, "b ... -> b t ...", t=num_frames) sigmas = rearrange(sigmas, "b t ... -> (b t) ...", t=num_frames) sigmas_bc = append_dims(sigmas, input_x.ndim) # [14, 1, 1, 1] if i == 0: noise = torch.randn_like(input_x) # [BT, C, H, W] noised_input = self.loss_fn.get_noised_input( sigmas_bc, noise, input_x ) # [BT, C, H, W] else: # dt is negative dt = append_dims(sigmas - prev_sigmas, input_x.ndim) dx = (noised_input - denoised) / append_dims(prev_sigmas, input_x.ndim) noised_input = noised_input + dt * dx denoised = self.sampler_step(sigmas, noised_input, c, uc, num_frames=num_frames, additional_model_inputs=additional_model_inputs) prev_sigmas = sigmas sigma_sum += sigmas_bc # TODO, so many sigmas, which to use? 
# sds_grad = (input_x - denoised) / sigmas_bc sds_grad = (input_x - denoised) / sigma_sum return sds_grad, denoised def sampler_step(self, sigma, noised_input, c, uc=None, num_frames=None, additional_model_inputs=None): # step-3: prepare conditional and unconditional inputs # [2BT, C, H, W], [2BT] bathced_xt, bathced_sigmas, bathched_c = self.sampler.guider.prepare_inputs( noised_input, sigma, c, uc ) # bathched_c["crossattn"] => [2BT, 1, C] ; bathched_c["concat"] => [2BT, C, H, W]; bathched_c["vector"] => [2BT, C_feat] # output shape [2BT, C, H, W] denoised = self.denoiser( self.model, bathced_xt, bathced_sigmas, bathched_c, **additional_model_inputs, ) # step-4: cfg guidance and compute sds_grad # [BT, C, H, W] denoised = self.sampler.guider(denoised, bathced_sigmas) return denoised ================================================ FILE: projects/uncleaned_train/motionrep/diffusion/svd_sds_engine_backup.py ================================================ import math from contextlib import contextmanager from typing import Any, Dict, List, Optional, Tuple, Union import pytorch_lightning as pl import torch from omegaconf import ListConfig, OmegaConf from safetensors.torch import load_file as load_safetensors from sgm.modules import UNCONDITIONAL_CONFIG from sgm.modules.diffusionmodules.wrappers import OPENAIUNETWRAPPER from sgm.util import ( default, disabled_train, get_obj_from_str, instantiate_from_config, append_dims, ) from motionrep.utils.svd_helpper import ( get_batch, get_unique_embedder_keys_from_conditioner, ) from einops import rearrange, repeat import torch.nn.functional as F class SVDSDSEngine(pl.LightningModule): """ stable video diffusion engine """ def __init__( self, network_config, denoiser_config, first_stage_config, conditioner_config: Union[None, Dict, ListConfig, OmegaConf] = None, sampler_config: Union[None, Dict, ListConfig, OmegaConf] = None, loss_fn_config: Union[None, Dict, ListConfig, OmegaConf] = None, network_wrapper: Union[None, str] = 
None, ckpt_path: Union[None, str] = None, scale_factor: float = 1.0, disable_first_stage_autocast=False, input_key: str = "jpg", compile_model: bool = False, en_and_decode_n_samples_a_time: Optional[int] = None, ): super().__init__() self.input_key = input_key model = instantiate_from_config(network_config) self.model = get_obj_from_str(default(network_wrapper, OPENAIUNETWRAPPER))( model, compile_model=compile_model ) self.model.eval() self.denoiser = instantiate_from_config(denoiser_config) self.sampler = ( instantiate_from_config(sampler_config) if sampler_config is not None else None ) self.conditioner = instantiate_from_config( default(conditioner_config, UNCONDITIONAL_CONFIG) ) self._init_first_stage(first_stage_config) self.loss_fn = ( instantiate_from_config(loss_fn_config) if loss_fn_config is not None else None ) self.scale_factor = scale_factor self.disable_first_stage_autocast = disable_first_stage_autocast if ckpt_path is not None: self.init_from_ckpt(ckpt_path) self.en_and_decode_n_samples_a_time = en_and_decode_n_samples_a_time def init_from_ckpt( self, path: str, ) -> None: print("init svd engine from", path) if path.endswith("ckpt"): sd = torch.load(path, map_location="cpu")["state_dict"] elif path.endswith("safetensors"): sd = load_safetensors(path) elif path.endswith("bin"): sd = torch.load(path, map_location="cpu") else: raise NotImplementedError missing, unexpected = self.load_state_dict(sd, strict=False) print( f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" ) if len(missing) > 0: print(f"Missing Keys: {missing}") if len(unexpected) > 0: # print(f"Unexpected Keys: {unexpected}") pass def _init_first_stage(self, config): model = instantiate_from_config(config).eval() model.train = disabled_train for param in model.parameters(): param.requires_grad = False self.first_stage_model = model del self.first_stage_model.decoder self.first_stage_model.decoder = None def get_input(self, batch): # assuming unified 
data format, dataloader returns a dict. # image tensors should be scaled to -1 ... 1 and in bchw format return batch[self.input_key] def encode_first_stage(self, x): n_samples = default(self.en_and_decode_n_samples_a_time, x.shape[0]) n_rounds = math.ceil(x.shape[0] / n_samples) all_out = [] with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast): for n in range(n_rounds): out = self.first_stage_model.encode( x[n * n_samples : (n + 1) * n_samples] ) all_out.append(out) z = torch.cat(all_out, dim=0) z = self.scale_factor * z return z def forward(self, batch, training=True): assert training, "DiffusionEngine forward function is only for training." x = self.get_input(batch) # [BT, 3, H, W] T = batch["num_video_frames"] batch_size = x.shape[0] // T z = self.encode_first_stage(x) # [BT, C, H_latent, W_latent] batch["global_step"] = self.global_step sds_grad = self.emd_sds(z, batch) target = (z - sds_grad).detach() loss_sds = 0.5 * F.mse_loss(z, target, reduction="mean") / batch_size log_loss_dict = { "loss_sds_video": loss_sds.item(), "grad_norm": sds_grad.norm().item(), } return loss_sds, log_loss_dict def forward_with_encoder_chunk(self, batch, chunk_size=2): with torch.no_grad(): x = self.get_input(batch) # [BT, 3, H, W] T = batch["num_video_frames"] batch_size = x.shape[0] // T z = self.encode_first_stage(x) # [BT, C, H_latent, W_latent] batch["global_step"] = self.global_step sds_grad = self.emd_sds(z, batch) num_chunks = math.ceil(z.shape[0] / chunk_size) for n in range(num_chunks): end_ind = min((n + 1) * chunk_size, z.shape[0]) x_chunk = x[n * chunk_size : end_ind] z_chunk_recompute = self.encode_first_stage(x_chunk) target_chunk = ( z_chunk_recompute - sds_grad[n * chunk_size : end_ind] ).detach() this_chunk_size = x_chunk.shape[0] loss_sds_chunk = ( 0.5 * F.mse_loss(z_chunk_recompute, target_chunk, reduction="mean") * this_chunk_size / z.shape[0] / batch_size ) loss_sds_chunk.backward() with torch.no_grad(): target = (z - sds_grad).detach() 
loss_sds = 0.5 * F.mse_loss(z, target, reduction="mean") / batch_size log_loss_dict = { "loss_sds_video": loss_sds.item(), "grad_norm": sds_grad.norm().item(), } video_space_sds_grad = x.grad return video_space_sds_grad, loss_sds, log_loss_dict @torch.no_grad() def emd_sds(self, input_x, extra_input): """ Args: input_x: [BT, C, H, W] in latent extra_input: dict "fps_id": [B] "motion_bucket_id": [B] "cond_aug": [B] "cond_frames_without_noise": [B, C, H, W] "cond_frames": [B, C, H, W] """ num_frames = extra_input["num_video_frames"] batch_size = input_x.shape[0] // num_frames device = input_x.device # video = video.contiguous() extra_input["num_video_frames"] = num_frames # prepare c and uc batch, batch_uc = get_batch( get_unique_embedder_keys_from_conditioner(self.conditioner), extra_input, [1, num_frames], T=num_frames, device=device, ) # keys would be be ['crossattn', 'vector', 'concat'] c, uc = self.conditioner.get_unconditional_conditioning( batch, batch_uc=batch_uc, force_uc_zero_embeddings=[ "cond_frames", "cond_frames_without_noise", ], ) for k in ["crossattn", "concat"]: uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames) uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames) c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames) c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames) # after this should be # crossattn torch.Size([14, 1, 1024]) # vector torch.Size([14, 768]) # concat torch.Size([14, 4, 72, 128]) additional_model_inputs = {} additional_model_inputs["image_only_indicator"] = torch.zeros( int(2 * batch_size), num_frames ).to(device) additional_model_inputs["num_video_frames"] = batch["num_video_frames"] sigmas = self.loss_fn.sigma_sampler(batch_size).to(input_x) sigmas = repeat(sigmas, "b ... -> b t ...", t=num_frames) sigmas = rearrange(sigmas, "b t ... 
-> (b t) ...", t=num_frames) noise = torch.randn_like(input_x) # [BT, C, H, W] sigmas_bc = append_dims(sigmas, input_x.ndim) # [14, 1, 1, 1] noised_input = self.loss_fn.get_noised_input( sigmas_bc, noise, input_x ) # [BT, C, H, W] # [2BT, C, H, W], [2BT] bathced_xt, bathced_sigmas, bathched_c = self.sampler.guider.prepare_inputs( noised_input, sigmas, c, uc ) # bathched_c[crossattn] => [2BT, 1, C] ; bathched_c["concat"] => [2BT, C, H, W]; bathched_c["vector"] => [2BT, C_feat] # output shape [2BT, C, H, W] denoised = self.denoiser( self.model, bathced_xt, bathced_sigmas, bathched_c, **additional_model_inputs, ) # [BT, C, H, W] denoised = self.sampler.guider(denoised, bathced_sigmas) sds_grad = (denoised - input_x) / sigmas_bc return sds_grad ================================================ FILE: projects/uncleaned_train/motionrep/diffusion/svd_sds_wdecoder_engine.py ================================================ import math from contextlib import contextmanager from typing import Any, Dict, List, Optional, Tuple, Union import pytorch_lightning as pl import torch from omegaconf import ListConfig, OmegaConf from safetensors.torch import load_file as load_safetensors from sgm.modules import UNCONDITIONAL_CONFIG from sgm.modules.diffusionmodules.wrappers import OPENAIUNETWRAPPER from sgm.util import ( default, disabled_train, get_obj_from_str, instantiate_from_config, append_dims, ) from motionrep.utils.svd_helpper import ( get_batch, get_unique_embedder_keys_from_conditioner, ) from einops import rearrange, repeat import torch.nn.functional as F from sgm.modules.autoencoding.temporal_ae import VideoDecoder import numpy as np class SVDWDecSDSEngine(pl.LightningModule): """ stable video diffusion engine """ def __init__( self, network_config, denoiser_config, first_stage_config, conditioner_config: Union[None, Dict, ListConfig, OmegaConf] = None, sampler_config: Union[None, Dict, ListConfig, OmegaConf] = None, loss_fn_config: Union[None, Dict, ListConfig, OmegaConf] = 
None, discretization_config: Union[None, Dict, ListConfig, OmegaConf] = None, # Added network_wrapper: Union[None, str] = None, ckpt_path: Union[None, str] = None, scale_factor: float = 1.0, disable_first_stage_autocast=False, input_key: str = "jpg", compile_model: bool = False, en_and_decode_n_samples_a_time: Optional[int] = None, ): super().__init__() self.input_key = input_key model = instantiate_from_config(network_config) self.model = get_obj_from_str(default(network_wrapper, OPENAIUNETWRAPPER))( model, compile_model=compile_model ) self.model.eval() self.denoiser = instantiate_from_config(denoiser_config) assert self.denoiser is not None, "need denoiser" self.sampler = ( instantiate_from_config(sampler_config) if sampler_config is not None else None ) self.conditioner = instantiate_from_config( default(conditioner_config, UNCONDITIONAL_CONFIG) ) self._init_first_stage(first_stage_config) self.loss_fn = ( instantiate_from_config(loss_fn_config) if loss_fn_config is not None else None ) self.scale_factor = scale_factor self.disable_first_stage_autocast = disable_first_stage_autocast if ckpt_path is not None: self.init_from_ckpt(ckpt_path) self.en_and_decode_n_samples_a_time = en_and_decode_n_samples_a_time assert discretization_config is not None, "need discretizer" self.discretizer = instantiate_from_config(discretization_config) # [1000] sigmas_all = self.discretizer.get_sigmas(1000) self.register_buffer("sigmas_all", sigmas_all) def init_from_ckpt( self, path: str, ) -> None: print("init svd engine from", path) if path.endswith("ckpt"): sd = torch.load(path, map_location="cpu")["state_dict"] elif path.endswith("safetensors"): sd = load_safetensors(path) elif path.endswith("bin"): sd = torch.load(path, map_location="cpu") else: raise NotImplementedError missing, unexpected = self.load_state_dict(sd, strict=False) print( f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" ) if len(missing) > 0: print(f"Missing Keys: 
{missing}") if len(unexpected) > 0: # print(f"Unexpected Keys: {unexpected}") pass def _init_first_stage(self, config): model = instantiate_from_config(config).eval() model.train = disabled_train for param in model.parameters(): param.requires_grad = False self.first_stage_model = model def get_input(self, batch): # assuming unified data format, dataloader returns a dict. # image tensors should be scaled to -1 ... 1 and in bchw format return batch[self.input_key] def encode_first_stage(self, x): n_samples = default(self.en_and_decode_n_samples_a_time, x.shape[0]) n_rounds = math.ceil(x.shape[0] / n_samples) all_out = [] with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast): for n in range(n_rounds): out = self.first_stage_model.encode( x[n * n_samples : (n + 1) * n_samples] ) all_out.append(out) z = torch.cat(all_out, dim=0) z = self.scale_factor * z return z @torch.no_grad() def decode_first_stage(self, z): z = 1.0 / self.scale_factor * z n_samples = default(self.en_and_decode_n_samples_a_time, z.shape[0]) n_rounds = math.ceil(z.shape[0] / n_samples) all_out = [] with torch.autocast("cuda", enabled=not self.disable_first_stage_autocast): for n in range(n_rounds): if isinstance(self.first_stage_model.decoder, VideoDecoder): kwargs = {"timesteps": len(z[n * n_samples : (n + 1) * n_samples])} else: kwargs = {} out = self.first_stage_model.decode( z[n * n_samples : (n + 1) * n_samples], **kwargs ) all_out.append(out) out = torch.cat(all_out, dim=0) return out def forward(self, batch, sample_time_range=[0.02, 0.98]): """ Args: batch["jpg"]: [BT, 3, H, W]. Videos range in [-1, 1]? TODO Dec 16. Check batch["cond_image"]: [B, 3, H, W]. in [-1, 1]? 
TODO: check shape """ x = self.get_input(batch) # [BT, 3, H, W] T = batch["num_video_frames"] batch_size = x.shape[0] // T z = self.encode_first_stage(x) # [BT, C, H_latent, W_latent] batch["global_step"] = self.global_step with torch.no_grad(): sds_grad = self.edm_sds(z, batch, sample_time_range) target = (z - sds_grad).detach() loss_sds = 0.5 * F.mse_loss(z, target, reduction="mean") / batch_size log_loss_dict = { "loss_sds_video": loss_sds.item(), "sds_delta_norm": sds_grad.norm().item(), } return loss_sds, log_loss_dict def forward_with_encoder_chunk( self, batch, chunk_size=2, sample_time_range=[0.02, 0.98] ): with torch.no_grad(): x = self.get_input(batch) # [BT, 3, H, W] T = batch["num_video_frames"] batch_size = x.shape[0] // T z = self.encode_first_stage(x) # [BT, C, H_latent, W_latent] batch["global_step"] = self.global_step sds_grad, denoised_latent = self.edm_sds(z, batch, sample_time_range) num_chunks = math.ceil(z.shape[0] / chunk_size) for n in range(num_chunks): end_ind = min((n + 1) * chunk_size, z.shape[0]) x_chunk = x[n * chunk_size : end_ind] z_chunk_recompute = self.encode_first_stage(x_chunk) target_chunk = ( z_chunk_recompute - sds_grad[n * chunk_size : end_ind] ).detach() this_chunk_size = x_chunk.shape[0] assert this_chunk_size > 0 # loss_sds_chunk = ( # 0.5 # * F.mse_loss(z_chunk_recompute, target_chunk, reduction="mean") # * this_chunk_size # / z.shape[0] # / batch_size # ) loss_sds_chunk = 0.5 * F.mse_loss(z_chunk_recompute, target_chunk, reduction="sum") / batch_size loss_sds_chunk.backward() with torch.no_grad(): target = (z - sds_grad).detach() loss_sds = 0.5 * F.mse_loss(z, target, reduction="sum") / batch_size log_loss_dict = { "latent_loss_sds": loss_sds.item(), "latent_sds_norm": sds_grad.norm().item(), "latent_sds_max": sds_grad.abs().max().item(), "latent_sds_mean": sds_grad.abs().mean().item(), } video_space_sds_grad = x.grad return video_space_sds_grad, log_loss_dict, denoised_latent @torch.no_grad() def edm_sds(self, input_x, 
extra_input, sample_time_range=[0.02, 0.98]): """ Args: input_x: [BT, C, H, W] in latent extra_input: dict "fps_id": [B] "motion_bucket_id": [B] "cond_aug": [B] "cond_frames_without_noise": [B, C, H, W] "cond_frames": [B, C, H, W] sample_time_range: [t_min, t_max] """ # step-1: prepare inputs num_frames = extra_input["num_video_frames"] batch_size = input_x.shape[0] // num_frames device = input_x.device # video = video.contiguous() extra_input["num_video_frames"] = num_frames # prepare c and uc batch, batch_uc = get_batch( get_unique_embedder_keys_from_conditioner(self.conditioner), extra_input, [1, num_frames], T=num_frames, device=device, ) # keys would be be ['crossattn', 'vector', 'concat'] c, uc = self.conditioner.get_unconditional_conditioning( batch, batch_uc=batch_uc, force_uc_zero_embeddings=[ "cond_frames", "cond_frames_without_noise", ], ) for k in ["crossattn", "concat"]: uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames) uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames) c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames) c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames) # after this should be # crossattn [14, 1, 1024]; vector [14, 768]; concat [14, 4, 72, 128] additional_model_inputs = {} additional_model_inputs["image_only_indicator"] = torch.zeros( int(2 * batch_size), num_frames ).to(device) additional_model_inputs["num_video_frames"] = batch["num_video_frames"] # step-2: sample t and sigmas, then noise sampled_t = np.random.randint( low=int(sample_time_range[0] * self.sigmas_all.shape[0]), high=int(sample_time_range[1] * self.sigmas_all.shape[0]), size=(batch_size), ).tolist() # list of index time t [B] sigmas = self.sigmas_all[sampled_t] # sigmas = self.loss_fn.sigma_sampler(batch_size).to(input_x) sigmas = repeat(sigmas, "b ... -> b t ...", t=num_frames) sigmas = rearrange(sigmas, "b t ... 
-> (b t) ...", t=num_frames) noise = torch.randn_like(input_x) # [BT, C, H, W] sigmas_bc = append_dims(sigmas, input_x.ndim) # [14, 1, 1, 1] noised_input = self.loss_fn.get_noised_input( sigmas_bc, noise, input_x ) # [BT, C, H, W] # step-3: prepare conditional and unconditional inputs # [2BT, C, H, W], [2BT] bathced_xt, bathced_sigmas, bathched_c = self.sampler.guider.prepare_inputs( noised_input, sigmas, c, uc ) # bathched_c["crossattn"] => [2BT, 1, C] ; bathched_c["concat"] => [2BT, C, H, W]; bathched_c["vector"] => [2BT, C_feat] # output shape [2BT, C, H, W] denoised = self.denoiser( self.model, bathced_xt, bathced_sigmas, bathched_c, **additional_model_inputs, ) # step-4: cfg guidance and compute sds_grad # [BT, C, H, W] denoised = self.sampler.guider(denoised, bathced_sigmas) # sds_grad = (input_x - denoised) / sigmas_bc sds_grad = (input_x - denoised) / torch.norm((input_x - denoised)) return sds_grad, denoised @torch.no_grad() def edm_sds_multistep(self, input_x, extra_input, sample_time_range=[0.02, 0.84], num_step=4, total_steps=25): """ From t = 20 sample to t = 980. Args: input_x: [BT, C, H, W] in latent extra_input: dict "fps_id": [B] "motion_bucket_id": [B] "cond_aug": [B] "cond_frames_without_noise": [B, C, H, W] "cond_frames": [B, C, H, W] sample_time_range: [t_min, t_max] """ # step-1: prepare inputs num_frames = extra_input["num_video_frames"] batch_size = input_x.shape[0] // num_frames device = input_x.device # video = video.contiguous() extra_input["num_video_frames"] = num_frames # prepare c and uc batch, batch_uc = get_batch( get_unique_embedder_keys_from_conditioner(self.conditioner), extra_input, [1, num_frames], T=num_frames, device=device, ) # keys would be be ['crossattn', 'vector', 'concat'] c, uc = self.conditioner.get_unconditional_conditioning( batch, batch_uc=batch_uc, force_uc_zero_embeddings=[ "cond_frames", "cond_frames_without_noise", ], ) for k in ["crossattn", "concat"]: uc[k] = repeat(uc[k], "b ... 
-> b t ...", t=num_frames) uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames) c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames) c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames) # after this should be # crossattn [14, 1, 1024]; vector [14, 768]; concat [14, 4, 72, 128] additional_model_inputs = {} additional_model_inputs["image_only_indicator"] = torch.zeros( int(2 * batch_size), num_frames ).to(device) additional_model_inputs["num_video_frames"] = batch["num_video_frames"] # step-2: sample t and sigmas, then noise sampled_t = np.random.randint( low=int(sample_time_range[0] * self.sigmas_all.shape[0]), high=int(sample_time_range[1] * self.sigmas_all.shape[0]), size=(batch_size), ) # np.array of index time t [B] step_stride = len(self.sigmas_all) // total_steps sigma_sum = 0.0 for i in range(num_step): sampled_t += step_stride * i sampled_t = np.clip(sampled_t, 0, len(self.sigmas_all) - 2) # [B] sigmas = self.sigmas_all[sampled_t] # sigmas = self.loss_fn.sigma_sampler(batch_size).to(input_x) sigmas = repeat(sigmas, "b ... -> b t ...", t=num_frames) sigmas = rearrange(sigmas, "b t ... -> (b t) ...", t=num_frames) sigmas_bc = append_dims(sigmas, input_x.ndim) # [14, 1, 1, 1] if i == 0: noise = torch.randn_like(input_x) # [BT, C, H, W] noised_input = self.loss_fn.get_noised_input( sigmas_bc, noise, input_x ) # [BT, C, H, W] else: # dt is negative dt = append_dims(sigmas - prev_sigmas, input_x.ndim) dx = (noised_input - denoised) / append_dims(prev_sigmas, input_x.ndim) noised_input = noised_input + dt * dx denoised = self.sampler_step(sigmas, noised_input, c, uc, num_frames=num_frames, additional_model_inputs=additional_model_inputs) prev_sigmas = sigmas sigma_sum += sigmas_bc # TODO, so many sigmas, which to use? 
# sds_grad = (input_x - denoised) / sigmas_bc # sds_grad = (input_x - denoised) / sigma_sum sds_grad = (input_x - denoised) / torch.norm((input_x - denoised)) return sds_grad, denoised @torch.no_grad() def resample_multistep(self, input_x, extra_input, sample_time_range=[0.02, 0.84], num_step=4): """ From t = 20 sample to t = 980. Args: input_x: [BT, C, H, W] in latent extra_input: dict "fps_id": [B] "motion_bucket_id": [B] "cond_aug": [B] "cond_frames_without_noise": [B, C, H, W] "cond_frames": [B, C, H, W] sample_time_range: [t_min, t_max] """ # step-1: prepare inputs num_frames = extra_input["num_video_frames"] batch_size = input_x.shape[0] // num_frames device = input_x.device # video = video.contiguous() extra_input["num_video_frames"] = num_frames # prepare c and uc batch, batch_uc = get_batch( get_unique_embedder_keys_from_conditioner(self.conditioner), extra_input, [1, num_frames], T=num_frames, device=device, ) # keys would be be ['crossattn', 'vector', 'concat'] c, uc = self.conditioner.get_unconditional_conditioning( batch, batch_uc=batch_uc, force_uc_zero_embeddings=[ "cond_frames", "cond_frames_without_noise", ], ) for k in ["crossattn", "concat"]: uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames) uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames) c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames) c[k] = rearrange(c[k], "b t ... 
-> (b t) ...", t=num_frames) # after this should be # crossattn [14, 1, 1024]; vector [14, 768]; concat [14, 4, 72, 128] additional_model_inputs = {} additional_model_inputs["image_only_indicator"] = torch.zeros( int(2 * batch_size), num_frames ).to(device) additional_model_inputs["num_video_frames"] = batch["num_video_frames"] # step-2: sample t and sigmas, then noise sampled_t = np.random.randint( low=int(sample_time_range[0] * self.sigmas_all.shape[0]), high=int(sample_time_range[1] * self.sigmas_all.shape[0]), size=(batch_size), ) # np.array of index time t [B] sampled_steps = np.linspace(sampled_t[0], len(self.sigmas_all) - 1, num_step, dtype=int) sigma_sum = 0.0 for i in range(len(sampled_steps)): sampled_t = sampled_steps[[i]] # sampled_t = np.clip(sampled_t, 0, len(self.sigmas_all) - 2) # [B] sigmas = self.sigmas_all[sampled_t] # sigmas = self.loss_fn.sigma_sampler(batch_size).to(input_x) sigmas = repeat(sigmas, "b ... -> b t ...", t=num_frames) sigmas = rearrange(sigmas, "b t ... -> (b t) ...", t=num_frames) sigmas_bc = append_dims(sigmas, input_x.ndim) # [14, 1, 1, 1] if i == 0: noise = torch.randn_like(input_x) # [BT, C, H, W] noised_input = self.loss_fn.get_noised_input( sigmas_bc, noise, input_x ) # [BT, C, H, W] else: # dt is negative dt = append_dims(sigmas - prev_sigmas, input_x.ndim) dx = (noised_input - denoised) / append_dims(prev_sigmas, input_x.ndim) noised_input = noised_input + dt * dx denoised = self.sampler_step(sigmas, noised_input, c, uc, num_frames=num_frames, additional_model_inputs=additional_model_inputs) prev_sigmas = sigmas sigma_sum += sigmas_bc # TODO, so many sigmas, which to use? 
# sds_grad = (input_x - denoised) / sigmas_bc # sds_grad = (input_x - denoised) / sigma_sum # sds_grad = (input_x - denoised) / torch.norm((input_x - denoised)) return denoised def sampler_step(self, sigma, noised_input, c, uc=None, num_frames=None, additional_model_inputs=None): # step-3: prepare conditional and unconditional inputs # [2BT, C, H, W], [2BT] bathced_xt, bathced_sigmas, bathched_c = self.sampler.guider.prepare_inputs( noised_input, sigma, c, uc ) # bathched_c["crossattn"] => [2BT, 1, C] ; bathched_c["concat"] => [2BT, C, H, W]; bathched_c["vector"] => [2BT, C_feat] # output shape [2BT, C, H, W] denoised = self.denoiser( self.model, bathced_xt, bathced_sigmas, bathched_c, **additional_model_inputs, ) # step-4: cfg guidance and compute sds_grad # [BT, C, H, W] denoised = self.sampler.guider(denoised, bathced_sigmas) return denoised ================================================ FILE: projects/uncleaned_train/motionrep/diffusion/video_diffusion_loss.py ================================================ from typing import Dict, List, Optional, Tuple, Union import torch import torch.nn as nn from sgm.modules.autoencoding.lpips.loss.lpips import LPIPS from sgm.modules.encoders.modules import GeneralConditioner from sgm.util import append_dims, instantiate_from_config from sgm.modules.diffusionmodules.denoiser import Denoiser from einops import rearrange, repeat class StandardVideoDiffusionLoss(nn.Module): def __init__( self, sigma_sampler_config: dict, loss_weighting_config: dict, loss_type: str = "l2", offset_noise_level: float = 0.0, batch2model_keys: Optional[Union[str, List[str]]] = None, ): super().__init__() assert loss_type in ["l2", "l1", "lpips"] self.sigma_sampler = instantiate_from_config(sigma_sampler_config) self.loss_weighting = instantiate_from_config(loss_weighting_config) self.loss_type = loss_type self.offset_noise_level = offset_noise_level if loss_type == "lpips": self.lpips = LPIPS().eval() if not batch2model_keys: batch2model_keys = 
class StandardVideoDiffusionLoss(nn.Module):
    """EDM-style denoising loss for video diffusion training.

    Samples one sigma per clip, broadcasts it to every frame, noises the
    latents, runs the denoiser, and weights the reconstruction error.

    Args:
        sigma_sampler_config: config instantiated (via sgm's
            `instantiate_from_config`) into the sigma sampler.
        loss_weighting_config: config for the per-sigma loss weighting.
        loss_type: "l2", "l1" or "lpips".
        offset_noise_level: strength of shared (offset) noise added on top of
            the i.i.d. noise; 0 disables it.
        batch2model_keys: batch keys forwarded verbatim to the denoiser.
    """

    def __init__(
        self,
        sigma_sampler_config: dict,
        loss_weighting_config: dict,
        loss_type: str = "l2",
        offset_noise_level: float = 0.0,
        batch2model_keys: Optional[Union[str, List[str]]] = None,
    ):
        super().__init__()

        assert loss_type in ["l2", "l1", "lpips"]

        self.sigma_sampler = instantiate_from_config(sigma_sampler_config)
        self.loss_weighting = instantiate_from_config(loss_weighting_config)

        self.loss_type = loss_type
        self.offset_noise_level = offset_noise_level

        # LPIPS network is only built when actually needed.
        if loss_type == "lpips":
            self.lpips = LPIPS().eval()

        if not batch2model_keys:
            batch2model_keys = []
        if isinstance(batch2model_keys, str):
            batch2model_keys = [batch2model_keys]
        self.batch2model_keys = set(batch2model_keys)

    def get_noised_input(
        self, sigmas_bc: torch.Tensor, noise: torch.Tensor, input: torch.Tensor
    ) -> torch.Tensor:
        """Forward diffusion: input + sigma * noise (sigmas pre-broadcast)."""
        noised_input = input + noise * sigmas_bc
        return noised_input

    def forward(
        self,
        network: nn.Module,
        denoiser: Denoiser,
        conditioner: GeneralConditioner,
        input: torch.Tensor,
        batch: Dict,
    ) -> torch.Tensor:
        """Compute conditioning, expand it per frame, and delegate to _forward."""
        cond = conditioner(batch)
        num_frames = batch["num_video_frames"]
        # per-clip conditioning -> per-frame: [B, ...] -> [B*T, ...]
        for k in ["crossattn", "concat"]:
            cond[k] = repeat(cond[k], "b ... -> b t ...", t=num_frames)
            cond[k] = rearrange(cond[k], "b t ... -> (b t) ...", t=num_frames)
        return self._forward(network, denoiser, cond, input, batch)

    def _forward(
        self,
        network: nn.Module,
        denoiser: Denoiser,
        cond: Dict,
        input: torch.Tensor,
        batch: Dict,
    ) -> Tuple[torch.Tensor, Dict]:
        """Noise latents at a per-clip sigma and score the denoiser output.

        Args:
            input: [B*T, C, H, W] latents, with T = batch["num_video_frames"].
        """
        additional_model_inputs = {
            key: batch[key] for key in self.batch2model_keys.intersection(batch)
        }
        # print("pre check additional inputs", additional_model_inputs.keys())
        num_frames = batch["num_video_frames"]
        batch_size = input.shape[0] // num_frames
        additional_model_inputs["image_only_indicator"] = torch.zeros(
            batch_size, num_frames
        ).to(input.device)
        additional_model_inputs["num_video_frames"] = batch["num_video_frames"]

        # one sigma per clip, repeated across its frames
        # sigmas = self.sigma_sampler(input.shape[0]).to(input)
        sigmas = self.sigma_sampler(batch_size).to(input)
        sigmas = repeat(sigmas, "b ... -> b t ...", t=num_frames)
        sigmas = rearrange(sigmas, "b t ... -> (b t) ...", t=num_frames)

        noise = torch.randn_like(input)
        if self.offset_noise_level > 0.0:
            # BUG FIX: this class never sets `self.n_frames`, so the original
            # attribute access raised AttributeError whenever offset noise was
            # enabled; getattr keeps the original two-branch shape selection.
            offset_shape = (
                (input.shape[0], 1, input.shape[2])
                if getattr(self, "n_frames", None) is not None
                else (input.shape[0], input.shape[1])
            )
            # shared noise per sample, broadcast over the remaining dims
            noise = noise + self.offset_noise_level * append_dims(
                torch.randn(offset_shape, device=input.device),
                input.ndim,
            )

        sigmas_bc = append_dims(sigmas, input.ndim)
        noised_input = self.get_noised_input(sigmas_bc, noise, input)

        model_output = denoiser(
            network, noised_input, sigmas, cond, **additional_model_inputs
        )
        w = append_dims(self.loss_weighting(sigmas), input.ndim)
        return self.get_loss(model_output, input, w)

    def get_loss(self, model_output, target, w):
        """Per-sample weighted loss, shape [B*T] ("lpips" returns its own)."""
        if self.loss_type == "l2":
            return torch.mean(
                (w * (model_output - target) ** 2).reshape(target.shape[0], -1), 1
            )
        elif self.loss_type == "l1":
            return torch.mean(
                (w * (model_output - target).abs()).reshape(target.shape[0], -1), 1
            )
        elif self.loss_type == "lpips":
            loss = self.lpips(model_output, target).reshape(-1)
            return loss
        else:
            raise NotImplementedError(f"Unknown loss type {self.loss_type}")
""" def __init__( self, resolutions: Sequence[int], feat_dim: int = 32, init_a: float = 0.1, init_b: float = 0.5, reduce="sum", # Literal["sum", "product", "cat"] = "sum", ): super().__init__() self.resolutions = resolutions if reduce == "cat": feat_dim = feat_dim // 3 self.feat_dim = feat_dim self.reduce = reduce self.in_dim = 4 self.plane_coefs = nn.ParameterList() self.coo_combs = [[0, 3], [1, 3], [2, 3]] # [(x, t), (y, t), (z, t)] for coo_comb in self.coo_combs: # [feat_dim, time_resolution, spatial_resolution] new_plane_coef = nn.Parameter( torch.empty( [ self.feat_dim, resolutions[coo_comb[1]], resolutions[coo_comb[0]], # flip? ] ) ) # when init to ones? nn.init.uniform_(new_plane_coef, a=init_a, b=init_b) self.plane_coefs.append(new_plane_coef) def forward(self, inp: Float[Tensor, "*bs 4"]): output = 1.0 if self.reduce == "product" else 0.0 if self.reduce == "cat": output = [] for ci, coo_comb in enumerate(self.coo_combs): grid = self.plane_coefs[ci].unsqueeze(0) # [1, feature_dim, reso1, reso2] coords = inp[..., coo_comb].view(1, 1, -1, 2) # [1, 1, flattened_bs, 2] interp = F.grid_sample( grid, coords, align_corners=True, padding_mode="border" ) # [1, output_dim, 1, flattened_bs] interp = interp.view(self.feat_dim, -1).T # [flattened_bs, output_dim] if self.reduce == "product": output = output * interp elif self.reduce == "sum": output = output + interp elif self.reduce == "cat": output.append(interp) if self.reduce == "cat": # [flattened_bs, output_dim * 3] output = torch.cat(output, dim=-1) return output def compute_temporal_smoothness( self, ): ret_loss = 0.0 for plane_coef in self.plane_coefs: ret_loss += compute_plane_smoothness(plane_coef) return ret_loss def compute_plane_tv( self, ): ret_loss = 0.0 for plane_coef in self.plane_coefs: ret_loss += compute_plane_tv(plane_coef) return ret_loss def visualize( self, ) -> Tuple[Float[Tensor, "3 H W"]]: """Visualize the encoding as a RGB images Returns: Tuple[Float[Tensor, "3 H W"]] """ pass @staticmethod 
def functional_forward( plane_coefs: List[Float[Tensor, "feat_dim H W"]], inp: Float[Tensor, "*bs 4"], reduce: str = "sum", coo_combs: Optional[List[List[int]]] = [[0, 3], [1, 3], [2, 3]], ): assert reduce in ["sum", "product", "cat"] output = 1.0 if reduce == "product" else 0.0 if reduce == "cat": output = [] for ci, coo_comb in enumerate(coo_combs): grid = plane_coefs[ci].unsqueeze(0) # [1, feature_dim, reso1, reso2] feat_dim = grid.shape[1] coords = inp[..., coo_comb].view(1, 1, -1, 2) # [1, 1, flattened_bs, 2] interp = F.grid_sample( grid, coords, align_corners=True, padding_mode="border" ) # [1, output_dim, 1, flattened_bs] interp = interp.view(feat_dim, -1).T # [flattened_bs, output_dim] if reduce == "product": output = output * interp elif reduce == "sum": output = output + interp elif reduce == "cat": output.append(interp) if reduce == "cat": # [flattened_bs, output_dim * 3] output = torch.cat(output, dim=-1) return output class TriplanesEncoding(nn.Module): """ Args: resolutions (Sequence[int]): xyz resolutions. """ def __init__( self, resolutions: Sequence[int], feat_dim: int = 32, init_a: float = 0.1, init_b: float = 0.5, reduce="sum", # Literal["sum", "product", "cat"] = "sum", ): super().__init__() self.resolutions = resolutions if reduce == "cat": feat_dim = feat_dim# // 3 self.feat_dim = feat_dim self.reduce = reduce self.in_dim = 3 self.plane_coefs = nn.ParameterList() self.coo_combs = [[0, 1], [0, 2], [1, 2]] # [(x, t), (y, t), (z, t)] for coo_comb in self.coo_combs: new_plane_coef = nn.Parameter( torch.empty( [ self.feat_dim, resolutions[coo_comb[1]], resolutions[coo_comb[0]], ] ) ) # when init to ones? 
nn.init.uniform_(new_plane_coef, a=init_a, b=init_b) self.plane_coefs.append(new_plane_coef) def forward(self, inp: Float[Tensor, "*bs 3"]): output = 1.0 if self.reduce == "product" else 0.0 if self.reduce == "cat": output = [] for ci, coo_comb in enumerate(self.coo_combs): grid = self.plane_coefs[ci].unsqueeze(0) # [1, feature_dim, reso1, reso2] coords = inp[..., coo_comb].view(1, 1, -1, 2) # [1, 1, flattened_bs, 2] interp = F.grid_sample( grid, coords, align_corners=True, padding_mode="border" ) # [1, output_dim, 1, flattened_bs] interp = interp.view(self.feat_dim, -1).T # [flattened_bs, output_dim] if self.reduce == "product": output = output * interp elif self.reduce == "sum": output = output + interp elif self.reduce == "cat": output.append(interp) if self.reduce == "cat": # [flattened_bs, output_dim * 3] output = torch.cat(output, dim=-1) return output def compute_plane_tv( self, ): ret_loss = 0.0 for plane_coef in self.plane_coefs: ret_loss += compute_plane_tv(plane_coef) return ret_loss class PlaneEncoding(nn.Module): """ Args: resolutions (Sequence[int]): xyz resolutions. """ def __init__( self, resolutions: Sequence[int], # [y_res, x_res] feat_dim: int = 32, init_a: float = 0.1, init_b: float = 0.5, ): super().__init__() self.resolutions = resolutions self.feat_dim = feat_dim self.in_dim = 2 self.plane_coefs = nn.ParameterList() self.coo_combs = [[0, 1]] for coo_comb in self.coo_combs: new_plane_coef = nn.Parameter( torch.empty( [ self.feat_dim, resolutions[coo_comb[1]], resolutions[coo_comb[0]], ] ) ) # when init to ones? 
nn.init.uniform_(new_plane_coef, a=init_a, b=init_b) self.plane_coefs.append(new_plane_coef) def forward(self, inp: Float[Tensor, "*bs 2"]): for ci, coo_comb in enumerate(self.coo_combs): grid = self.plane_coefs[ci].unsqueeze(0) # [1, feature_dim, reso1, reso2] coords = inp[..., coo_comb].view(1, 1, -1, 2) # [1, 1, flattened_bs, 2] interp = F.grid_sample( grid, coords, align_corners=True, padding_mode="border" ) # [1, output_dim, 1, flattened_bs] interp = interp.view(self.feat_dim, -1).T # [flattened_bs, output_dim] output = interp return output def compute_plane_tv( self, ): ret_loss = 0.0 for plane_coef in self.plane_coefs: ret_loss += compute_plane_tv(plane_coef) return ret_loss class TemporalNeRFEncoding(nn.Module): def __init__( self, in_dim, # : int, num_frequencies: int, min_freq_exp: float, max_freq_exp: float, log_scale: bool = False, include_input: bool = False, ) -> None: super().__init__() self.in_dim = in_dim self.num_frequencies = num_frequencies self.min_freq = min_freq_exp self.max_freq = max_freq_exp self.log_scale = log_scale self.include_input = include_input def get_out_dim(self) -> int: if self.in_dim is None: raise ValueError("Input dimension has not been set") out_dim = self.in_dim * self.num_frequencies * 2 if self.include_input: out_dim += self.in_dim return out_dim def forward( self, in_tensor: Float[Tensor, "*bs input_dim"], ) -> Float[Tensor, "*bs output_dim"]: """Calculates NeRF encoding. If covariances are provided the encodings will be integrated as proposed in mip-NeRF. Args: in_tensor: For best performance, the input tensor should be between 0 and 1. covs: Covariances of input points. 
Returns: Output values will be between -1 and 1 """ scaled_in_tensor = 2 * torch.pi * in_tensor # scale to [0, 2pi] # freqs = 2 ** torch.linspace( freqs = torch.linspace( self.min_freq, self.max_freq, self.num_frequencies, device=in_tensor.device ) if self.log_scale: freqs = 2 ** freqs scaled_inputs = ( scaled_in_tensor[..., None] * freqs ) # [..., "input_dim", "num_scales"] scaled_inputs = scaled_inputs.view( *scaled_inputs.shape[:-2], -1 ) # [..., "input_dim" * "num_scales"] encoded_inputs = torch.sin( torch.cat([scaled_inputs, scaled_inputs + torch.pi / 2.0], dim=-1) ) return encoded_inputs ================================================ FILE: projects/uncleaned_train/motionrep/field_components/mlp.py ================================================ """ Mostly from nerfstudio: https://github.com/nerfstudio-project/nerfstudio/blob/main/nerfstudio/field_components/mlp.py """ from typing import Optional, Set, Tuple, Union import torch from jaxtyping import Float from torch import Tensor, nn class MLP(nn.Module): def __init__( self, in_dim: int, num_layers: int, layer_width: int, out_dim: Optional[int] = None, skip_connections: Optional[Tuple[int]] = None, activation: Optional[nn.Module] = nn.ReLU(), out_activation: Optional[nn.Module] = None, zero_init = False, ) -> None: super().__init__() self.in_dim = in_dim assert self.in_dim > 0 self.out_dim = out_dim if out_dim is not None else layer_width self.num_layers = num_layers self.layer_width = layer_width self.skip_connections = skip_connections self._skip_connections: Set[int] = ( set(skip_connections) if skip_connections else set() ) self.activation = activation self.out_activation = out_activation self.net = None self.zero_init = zero_init self.build_nn_modules() def build_nn_modules(self) -> None: """Initialize multi-layer perceptron.""" layers = [] if self.num_layers == 1: layers.append(nn.Linear(self.in_dim, self.out_dim)) else: for i in range(self.num_layers - 1): if i == 0: assert ( i not in 
self._skip_connections ), "Skip connection at layer 0 doesn't make sense." layers.append(nn.Linear(self.in_dim, self.layer_width)) elif i in self._skip_connections: layers.append( nn.Linear(self.layer_width + self.in_dim, self.layer_width) ) else: layers.append(nn.Linear(self.layer_width, self.layer_width)) layers.append(nn.Linear(self.layer_width, self.out_dim)) self.layers = nn.ModuleList(layers) if self.zero_init: torch.nn.init.zeros_(self.layers[-1].weight) torch.nn.init.zeros_(self.layers[-1].bias) def pytorch_fwd( self, in_tensor: Float[Tensor, "*bs in_dim"] ) -> Float[Tensor, "*bs out_dim"]: """Process input with a multilayer perceptron. Args: in_tensor: Network input Returns: MLP network output """ x = in_tensor for i, layer in enumerate(self.layers): # as checked in `build_nn_modules`, 0 should not be in `_skip_connections` if i in self._skip_connections: x = torch.cat([in_tensor, x], -1) x = layer(x) if self.activation is not None and i < len(self.layers) - 1: x = self.activation(x) if self.out_activation is not None: x = self.out_activation(x) return x def forward( self, in_tensor: Float[Tensor, "*bs in_dim"] ) -> Float[Tensor, "*bs out_dim"]: return self.pytorch_fwd(in_tensor) ================================================ FILE: projects/uncleaned_train/motionrep/fields/dct_trajectory_field.py ================================================ import torch import torch.nn as nn from motionrep.utils.dct import dct, idct, dct3d, idct_3d class DCTTrajctoryField(nn.Module): def __init__( self, ): super().__init__() pass def forward(self, x): pass def query_points_at_time(self, x, t): pass ================================================ FILE: projects/uncleaned_train/motionrep/fields/discrete_field.py ================================================ import torch import torch.nn.functional as F from jaxtyping import Float, Int, Shaped from torch import Tensor, nn from typing import Literal, Optional, Sequence, Tuple from motionrep.field_components.encoding 
class PointSetMotionSE3(nn.Module):
    """Per-frame, per-point SE(3) motion for a fixed point set.

    Unlike the field-based variants, this stores an explicit rotation and
    translation parameter for every (frame, point) pair and regularizes them
    with ARAP (as-rigid-as-possible) and isometry losses over a precomputed
    k-nearest-neighbor graph.

    Args:
        inp_x: [N, 3] rest-pose point positions (detached, stored as buffer).
        aabb: axis-aligned bounding box. aabb[0] is the minimum (x,y,z)
            point. aabb[1] is the maximum (x,y,z) point.
        rotation_type: "6d" or "quaternion" rotation parameterization.
        num_frames: number of animated frames (parameters hold num_frames + 1).
        distance_lamba: decay rate of the distance-based neighbor weights.
        topk_nn: neighborhood size of the KNN graph.
    """

    def __init__(
        self,
        inp_x: Float[Tensor, "*bs 3"],
        aabb: Float[Tensor, "2 3"],
        rotation_type: Literal["quaternion", "6d"] = "6d",
        num_frames: int = 20,
        distance_lamba=100.0,
        topk_nn: int = 20,  # the same neighboor size as dynamic gaussian
    ):
        super().__init__()
        self.register_buffer("aabb", aabb)
        # rotation params: 4 for quaternion, 6 for 6d; plus 3 translation
        output_dim_dict = {"quaternion": 4 + 3, "6d": 6 + 3}
        self.output_dim = output_dim_dict[rotation_type]
        self.rotation_type = rotation_type
        self.register_buffer("inp_x", inp_x.detach())
        self.num_frames = num_frames

        # init parameters:
        # [num_frames + 1, N, 3] translations, zero-initialized
        translation = nn.Parameter(
            torch.zeros(num_frames + 1, inp_x.shape[0], 3).requires_grad_(True)
        )
        # [num_frames + 1, N, output_dim - 3] rotation coefficients
        rotation = nn.Parameter(
            torch.ones(
                (num_frames + 1, inp_x.shape[0], self.output_dim - 3)
            ).requires_grad_(True)
        )
        self.register_parameter("translation", translation)
        self.register_parameter("rotation", rotation)

        # [num_points, topk]
        print(inp_x.shape, "input shape gaussian")
        knn_dist, knn_ind = self.construct_knn(inp_x, topk=topk_nn)
        # [num_points, topk]
        # NOTE(review): topk with largest=False includes each point itself
        # (distance 0, weight 1) in its own neighborhood — confirm intended.
        self.distance_weight = torch.exp(-1.0 * distance_lamba * knn_dist)
        self.knn_index = knn_ind  # torch.long
        # rest-pose neighbor distances, used as the isometry reference
        self.precompute_isometry = self.prepare_isometry(inp_x, knn_ind)
        # rolling window of recently queried frame indices; the regularizers
        # below iterate over it
        self.inp_time_list = []

    def construct_knn(self, inpx: Float[Tensor, "*bs 3"], topk=10, chunk_size=5000):
        """Chunked exact KNN over the point set.

        Returns:
            (knn_dist [N, topk], knn_ind [N, topk]); each point appears in
            its own neighbor list at distance 0.
        """
        # compute topk nearest neighbors for each point, and the distance
        knn_dist_list, knn_ind_list = [], []
        # chunk the query side to bound the [chunk_size, N] distance matrix
        num_step = inpx.shape[0] // chunk_size + 1
        with torch.no_grad():
            for i in range(num_step):
                end_ind = min((i + 1) * chunk_size, inpx.shape[0])
                src_points = inpx[i * chunk_size : end_ind]
                # compute the distance matrix
                cdist = torch.cdist(src_points, inpx)
                print(cdist.shape, "cdist")
                # get the topk nearest neighbors
                knn_dist, knn_ind = torch.topk(cdist, topk, dim=1, largest=False)
                knn_dist_list.append(knn_dist)
                knn_ind_list.append(knn_ind)
        knn_dist = torch.cat(knn_dist_list, dim=0)
        knn_ind = torch.cat(knn_ind_list, dim=0)
        return knn_dist, knn_ind

    def prepare_isometry(self, points, knn_ind):
        """Per-edge Euclidean lengths of the KNN graph: [num_points, topk]."""
        # [num_points, topk, 3]
        p_nn = points[knn_ind]
        dsp = points[:, None, :] - p_nn
        distance = torch.norm(dsp, dim=-1)  # [num_points, topk]
        return distance

    def _forward_single_time(self, time_ind: int):
        """Rotation matrices [N, 3, 3] and translations [N, 3] at one frame."""
        if self.rotation_type == "6d":
            rotation_6d, translation = (
                self.rotation[time_ind],
                self.translation[time_ind],
            )
            R_mat = rotation_6d_to_matrix(rotation_6d)
        elif self.rotation_type == "quaternion":
            quat, translation = self.rotation[time_ind], self.translation[time_ind]
            # bound raw values to [-1, 1] before building the matrix
            quat = torch.tanh(quat)
            R_mat = quaternion_to_matrix(quat)
        return R_mat, translation

    def forward(
        self,
        inp: Float[Tensor, "*bs 4"],
        **kwargs,
    ) -> Tuple[Float[Tensor, "*bs 3 3"], Float[Tensor, "*bs 3"]]:
        """Return (R, t) for the frame encoded in the queries' time column.

        All rows are assumed to share one timestamp; only inp[0, 3] is used.
        The queried frame index is recorded for the regularizers (bounded
        FIFO of the 20 most recent queries).
        """
        inpx, inpt = inp[:, :3], inp[:, 3:]
        # t in [0, 1] -> integer frame index
        time_ind = torch.round(inpt * (self.num_frames)).long()[0].item()
        R_mat, translation = self._forward_single_time(time_ind)

        self.inp_time_list.append(time_ind)
        if len(self.inp_time_list) > 20:
            self.inp_time_list.pop(0)
        return R_mat, translation

    def compute_smoothess_loss(
        self,
    ):
        """Return (isometry_loss, arap_loss) over recently queried frames."""
        # temporal_smoothness_loss = torch.tensor([0.0]).cuda()
        temporal_smoothness_loss = self.compute_isometry_loss()
        smothness_loss = self.compute_arap_loss()
        return temporal_smoothness_loss, smothness_loss

    def compute_arap_loss(
        self,
    ):
        """As-rigid-as-possible loss between consecutive frames.

        For each recently queried frame i, neighbor displacements at frame
        i+1 are rotated back into frame i and compared with the frame-i
        displacements, weighted by the precomputed distance weights.

        NOTE(review): querying at t = 1.0 gives i = num_frames, so i + 1
        indexes past the [num_frames + 1] parameter range — confirm callers
        never hit the last frame.
        """
        arap_loss = 0.0
        # random sample 16 frames
        # NOTE(review): random_frame_ind_list is computed but unused; the
        # loop runs over self.inp_time_list instead — confirm intended.
        random_frame_ind_list = torch.randint(0, self.num_frames - 1, (16,))
        for i in self.inp_time_list:
            r1, t1 = self._forward_single_time(i)
            r2, t2 = self._forward_single_time(i + 1)
            # [num_points, topk, 3, 3], [num_points, topk, 3]
            r1_nn, t1_nn = r1[self.knn_index], t1[self.knn_index]
            r2_nn, t2_nn = r2[self.knn_index], t2[self.knn_index]
            # displacement between neighboor points
            # shape of [num_points, topk, 3]
            dsp_t0 = t1_nn - t1[:, None, :]
            dsp_t1 = t2_nn - t2[:, None, :]
            # rotation matrix from frame-1 to frame-0
            r_mat_1to0 = torch.bmm(r1, r2.transpose(1, 2))  # [N, 3, 3]
            # [N, 3, 3] => [N, topk, 3, 3]
            r_mat_1to0 = r_mat_1to0.unsqueeze(1).repeat(
                1, self.knn_index.shape[1], 1, 1
            )
            dsp_t1_to_0 = torch.matmul(r_mat_1to0, dsp_t1[:, :, :, None]).squeeze(-1)
            # compute the arap loss
            arap_loss += torch.mean(
                torch.norm(dsp_t0 - dsp_t1_to_0, dim=-1) * self.distance_weight
            )
        return arap_loss

    def compute_isometry_loss(
        self,
    ):
        """Penalize changes of KNN edge lengths w.r.t. the rest pose.

        Uses translations only (rotations do not move point centers).
        """
        iso_loss = 0.0
        # random sample 16 frames
        # NOTE(review): random_frame_ind_list unused here as well.
        random_frame_ind_list = torch.randint(0, self.num_frames - 1, (16,))
        for i in self.inp_time_list:
            r1, t1 = self._forward_single_time(i)
            # deformed positions at frame i
            points = self.inp_x + t1
            distance_mat = self.prepare_isometry(points, self.knn_index)
            iso_loss += torch.mean(
                torch.abs(distance_mat - self.precompute_isometry)
                * self.distance_weight
            )
        return iso_loss

    def compute_loss(
        self,
        inp: Float[Tensor, "*bs 4"],
        trajectory: Float[Tensor, "*bs 3"],
        loss_func,
    ):
        """Reconstruction loss: apply the predicted SE(3) to the query points
        and compare against the target trajectory.

        NOTE(review): R/t are per stored point ([N, ...]) while inpx is the
        query batch; bmm requires the two to line up — confirm callers always
        query the full point set in order.
        """
        inpx, inpt = inp[:, :3], inp[:, 3:]
        R, t = self(inp)
        rec_traj = torch.bmm(R, inpx.unsqueeze(-1)).squeeze(-1) + t
        rec_loss = loss_func(rec_traj, trajectory)
        return rec_loss
class MulTemporalKplanesOffsetfields(nn.Module):
    """Multiple temporal k-planes offset fields with a shared decoder.

    One TemporalKplanesEncoding (and optionally one spatial TriplanesEncoding)
    is kept per dataset/scene; a single MLP decodes the features into a 3-D
    offset. `dataset_indx` selects which encoding set a query goes through.

    Args:
        aabb: axis-aligned bounding box; aabb[0] is the minimum (x,y,z)
            point, aabb[1] the maximum.
        resolutions_list: one (x, y, z, t) resolution tuple per dataset.
        feat_dim: encoding feature width.
        init_a / init_b: uniform init range for the plane parameters.
        reduce: how per-plane features are merged inside each encoding.
        num_decoder_layers / decoder_hidden_size: shared MLP decoder shape.
        add_spatial_triplane: also encode (x, y, z) with a tri-plane and
            concatenate, doubling the decoder input width.
    """

    def __init__(
        self,
        aabb: Float[Tensor, "2 3"],
        resolutions_list: Sequence[int],
        feat_dim: int = 64,
        init_a: float = 0.1,
        init_b: float = 0.5,
        reduce: Literal["sum", "product", "cat"] = "sum",
        num_decoder_layers=2,
        decoder_hidden_size=64,
        add_spatial_triplane: bool = True,
    ):
        super().__init__()
        self.register_buffer("aabb", aabb)
        self.output_dim = 3

        self.temporal_kplanes_encoding_list = nn.ModuleList(
            [
                TemporalKplanesEncoding(resolutions, feat_dim, init_a, init_b, reduce)
                for resolutions in resolutions_list
            ]
        )
        self.add_spatial_triplane = add_spatial_triplane
        if add_spatial_triplane:
            self.spatial_kplanes_encoding_list = nn.ModuleList(
                [
                    TriplanesEncoding(
                        resolutions[:-1], feat_dim, init_a, init_b, reduce
                    )
                    for resolutions in resolutions_list
                ]
            )
            # decoder consumes temporal + spatial features
            feat_dim = feat_dim * 2

        self.decoder = MLP(
            feat_dim,
            num_decoder_layers,
            layer_width=decoder_hidden_size,
            out_dim=self.output_dim,
            skip_connections=None,
            activation=nn.ReLU(),
            out_activation=None,
        )

    def forward(
        self, inp: Float[Tensor, "*bs 4"], dataset_indx: Int[Tensor, "1"]
    ) -> Tuple[Float[Tensor, "*bs 3 3"], Float[Tensor, "*bs 3"]]:
        """Decode a 3-D offset per (x, y, z, t) query of dataset `dataset_indx`.

        Coordinates are normalized to [-1, 1] via `aabb`; t is assumed in
        [0, 1] and shifted likewise.
        """
        inpx, inpt = inp[:, :3], inp[:, 3:]
        # shift to [-1, 1]
        inpx = SceneBox.get_normalized_positions(inpx, self.aabb) * 2.0 - 1.0
        inpt = inpt * 2.0 - 1.0
        inp = torch.cat([inpx, inpt], dim=-1)

        output = self.temporal_kplanes_encoding_list[dataset_indx](inp)
        if self.add_spatial_triplane:
            spatial_output = self.spatial_kplanes_encoding_list[dataset_indx](inp)
            output = torch.cat([output, spatial_output], dim=-1)
        output = self.decoder(output)
        return output

    def compute_smoothess_loss(
        self,
    ):
        """Return (plane TV loss, temporal smoothness loss), summed over all
        datasets (and spatial tri-planes when enabled)."""
        temporal_smoothness_loss = 0.0
        for temporal_kplanes_encoding in self.temporal_kplanes_encoding_list:
            temporal_smoothness_loss += (
                temporal_kplanes_encoding.compute_temporal_smoothness()
            )
        smothness_loss = 0.0
        for temporal_kplanes_encoding in self.temporal_kplanes_encoding_list:
            smothness_loss += temporal_kplanes_encoding.compute_plane_tv()
        if self.add_spatial_triplane:
            for spatial_kplanes_encoding in self.spatial_kplanes_encoding_list:
                smothness_loss += spatial_kplanes_encoding.compute_plane_tv()
        return smothness_loss, temporal_smoothness_loss

    def compute_loss(
        self,
        inp: Float[Tensor, "*bs 4"],
        trajectory: Float[Tensor, "*bs 3"],
        loss_func,
        dataset_indx: int = 0,
    ):
        """Trajectory reconstruction loss: query points + predicted offsets
        compared against the target trajectory.

        BUG FIX: forward() requires `dataset_indx`, but this method used to
        call `self(inp)` without it, raising a TypeError. The index is now a
        backward-compatible keyword argument (default 0).
        """
        inpx, inpt = inp[:, :3], inp[:, 3:]
        output = self(inp, dataset_indx)
        rec_traj = inpx + output
        rec_loss = loss_func(rec_traj, trajectory)
        return rec_loss

    def arap_loss(self, inp):
        pass
""" def __init__( self, aabb: Float[Tensor, "2 3"], resolutions_list: Sequence[int], feat_dim: int = 64, init_a: float = 0.1, init_b: float = 0.5, reduce: Literal["sum", "product", "cat"] = "sum", num_decoder_layers=2, decoder_hidden_size=64, rotation_type: Literal["quaternion", "6d"] = "6d", add_spatial_triplane: bool = True, ): super().__init__() self.register_buffer("aabb", aabb) output_dim_dict = {"quaternion": 4 + 3, "6d": 6 + 3} self.output_dim = output_dim_dict[rotation_type] self.rotation_type = rotation_type self.temporal_kplanes_encoding_list = nn.ModuleList( [ TemporalKplanesEncoding(resolutions, feat_dim, init_a, init_b, reduce) for resolutions in resolutions_list ] ) self.add_spatial_triplane = add_spatial_triplane if add_spatial_triplane: self.spatial_kplanes_encoding_list = nn.ModuleList( [ TriplanesEncoding( resolutions[:-1], feat_dim, init_a, init_b, reduce ) for resolutions in resolutions_list ] ) feat_dim = feat_dim * 2 self.decoder = MLP( feat_dim, num_decoder_layers, layer_width=decoder_hidden_size, out_dim=self.output_dim, skip_connections=None, activation=nn.ReLU(), out_activation=None, ) def forward( self, inp: Float[Tensor, "*bs 4"], dataset_indx: Int[Tensor, "1"] ) -> Tuple[Float[Tensor, "*bs 3 3"], Float[Tensor, "*bs 3"]]: inpx, inpt = inp[:, :3], inp[:, 3:] # shift to [-1, 1] inpx = SceneBox.get_normalized_positions(inpx, self.aabb) * 2.0 - 1.0 inpt = inpt * 2.0 - 1.0 inp = torch.cat([inpx, inpt], dim=-1) # for loop in batch dimension output = self.temporal_kplanes_encoding_list[dataset_indx](inp) if self.add_spatial_triplane: spatial_output = self.spatial_kplanes_encoding_list[dataset_indx](inp) output = torch.cat([output, spatial_output], dim=-1) output = self.decoder(output) if self.rotation_type == "6d": rotation_6d, translation = output[:, :6], output[:, 6:] R_mat = rotation_6d_to_matrix(rotation_6d) elif self.rotation_type == "quaternion": quat, translation = output[:, :4], output[:, 4:] # tanh and normalize quat = torch.tanh(quat) 
R_mat = quaternion_to_matrix(quat) return R_mat, translation def compute_smoothess_loss( self, ): temporal_smoothness_loss = 0.0 for temporal_kplanes_encoding in self.temporal_kplanes_encoding_list: temporal_smoothness_loss += ( temporal_kplanes_encoding.compute_temporal_smoothness() ) smothness_loss = 0.0 for temporal_kplanes_encoding in self.temporal_kplanes_encoding_list: smothness_loss += temporal_kplanes_encoding.compute_plane_tv() if self.add_spatial_triplane: for spatial_kplanes_encoding in self.spatial_kplanes_encoding_list: smothness_loss += spatial_kplanes_encoding.compute_plane_tv() return smothness_loss, temporal_smoothness_loss def compute_loss( self, inp: Float[Tensor, "*bs 4"], trajectory: Float[Tensor, "*bs 3"], loss_func, ): inpx, inpt = inp[:, :3], inp[:, 3:] R, t = self(inp) rec_traj = torch.bmm(R, inpx.unsqueeze(-1)).squeeze(-1) + t rec_loss = loss_func(rec_traj, trajectory) return rec_loss ================================================ FILE: projects/uncleaned_train/motionrep/fields/offset_field.py ================================================ import torch import torch.nn.functional as F from jaxtyping import Float, Int, Shaped from torch import Tensor, nn from typing import Literal, Optional, Sequence, Tuple, List from motionrep.field_components.encoding import ( TemporalKplanesEncoding, TriplanesEncoding, ) from motionrep.field_components.mlp import MLP from motionrep.operators.rotation import rotation_6d_to_matrix, quaternion_to_matrix from motionrep.data.scene_box import SceneBox class TemporalKplanesOffsetfields(nn.Module): """Temporal Offsets fields. Args: aabb: axis-aligned bounding box. aabb[0] is the minimum (x,y,z) point. aabb[1] is the maximum (x,y,z) point. resolutions: resolutions of the kplanes. in an order of [x, y, z ,t]. 
""" def __init__( self, aabb: Float[Tensor, "2 3"], resolutions: Sequence[int], feat_dim: int = 64, init_a: float = 0.1, init_b: float = 0.5, reduce: Literal["sum", "product", "cat"] = "sum", num_decoder_layers=2, decoder_hidden_size=64, add_spatial_triplane: bool = True, zero_init: bool = True, ): super().__init__() self.register_buffer("aabb", aabb) self.output_dim = 3 self.temporal_kplanes_encoding = TemporalKplanesEncoding( resolutions, feat_dim, init_a, init_b, reduce ) self.add_spatial_triplane = add_spatial_triplane if add_spatial_triplane: self.spatial_kplanes_encoding = TriplanesEncoding( resolutions[:-1], feat_dim, init_a, init_b, reduce ) feat_dim = feat_dim * 2 self.decoder = MLP( feat_dim, num_decoder_layers, layer_width=decoder_hidden_size, out_dim=self.output_dim, skip_connections=None, activation=nn.ReLU(), out_activation=None, zero_init=zero_init, ) def forward( self, inp: Float[Tensor, "*bs 4"] ) -> Tuple[Float[Tensor, "*bs 3 3"], Float[Tensor, "*bs 3"]]: inpx, inpt = inp[:, :3], inp[:, 3:] # shift to [-1, 1] inpx = SceneBox.get_normalized_positions(inpx, self.aabb) * 2.0 - 1.0 inpt = inpt * 2.0 - 1.0 inp = torch.cat([inpx, inpt], dim=-1) output = self.temporal_kplanes_encoding(inp) if self.add_spatial_triplane: spatial_output = self.spatial_kplanes_encoding(inpx) output = torch.cat([output, spatial_output], dim=-1) output = self.decoder(output) return output def compute_smoothess_loss( self, ): smothness_loss = self.temporal_kplanes_encoding.compute_plane_tv() temporal_smoothness_loss = ( self.temporal_kplanes_encoding.compute_temporal_smoothness() ) if self.add_spatial_triplane: smothness_loss += self.spatial_kplanes_encoding.compute_plane_tv() return smothness_loss + temporal_smoothness_loss def compute_loss( self, inp: Float[Tensor, "*bs 4"], trajectory: Float[Tensor, "*bs 3"], loss_func, ): inpx, inpt = inp[:, :3], inp[:, 3:] output = self(inp) rec_traj = inpx + output rec_loss = loss_func(rec_traj, trajectory) return rec_loss def 
arap_loss(self, inp): pass def forward_with_plane_coefs( self, plane_coefs: List[Float[Tensor, "feat_dim H W"]], inp: Float[Tensor, "*bs 4"], ): """ Args: pass """ inpx, inpt = inp[:, :3], inp[:, 3:] # shift to [-1, 1] inpx = SceneBox.get_normalized_positions(inpx, self.aabb) * 2.0 - 1.0 inpt = inpt * 2.0 - 1.0 inp = torch.cat([inpx, inpt], dim=-1) output = self.temporal_kplanes_encoding.functional_forward( plane_coefs, inp, reduce=self.temporal_kplanes_encoding.reduce ) if self.add_spatial_triplane: spatial_output = self.spatial_kplanes_encoding(inpx) output = torch.cat([output, spatial_output], dim=-1) output = self.decoder(output) return output ================================================ FILE: projects/uncleaned_train/motionrep/fields/se3_field.py ================================================ import torch import torch.nn.functional as F from jaxtyping import Float, Int, Shaped from torch import Tensor, nn from typing import Literal, Optional, Sequence, Tuple from motionrep.field_components.encoding import ( TemporalKplanesEncoding, TriplanesEncoding, ) from motionrep.field_components.mlp import MLP from motionrep.operators.rotation import rotation_6d_to_matrix, quaternion_to_matrix from motionrep.data.scene_box import SceneBox class TemporalKplanesSE3fields(nn.Module): """Temporal Kplanes SE(3) fields. Args: aabb: axis-aligned bounding box. aabb[0] is the minimum (x,y,z) point. aabb[1] is the maximum (x,y,z) point. resolutions: resolutions of the kplanes. in an order of [x, y, z ,t]. 
""" def __init__( self, aabb: Float[Tensor, "2 3"], resolutions: Sequence[int], feat_dim: int = 64, init_a: float = 0.1, init_b: float = 0.5, reduce: Literal["sum", "product", "cat"] = "sum", num_decoder_layers=2, decoder_hidden_size=64, rotation_type: Literal["quaternion", "6d"] = "6d", add_spatial_triplane: bool = True, zero_init: bool = True, ): super().__init__() self.register_buffer("aabb", aabb) output_dim_dict = {"quaternion": 4 + 3, "6d": 6 + 3} self.output_dim = output_dim_dict[rotation_type] self.rotation_type = rotation_type self.temporal_kplanes_encoding = TemporalKplanesEncoding( resolutions, feat_dim, init_a, init_b, reduce ) self.add_spatial_triplane = add_spatial_triplane if add_spatial_triplane: self.spatial_kplanes_encoding = TriplanesEncoding( resolutions[:-1], feat_dim, init_a, init_b, reduce ) feat_dim = feat_dim * 2 self.decoder = MLP( feat_dim, num_decoder_layers, layer_width=decoder_hidden_size, out_dim=self.output_dim, skip_connections=None, activation=nn.ReLU(), out_activation=None, zero_init=zero_init, ) def forward( self, inp: Float[Tensor, "*bs 4"], compute_smoothess_loss: bool = False, ) -> Tuple[Float[Tensor, "*bs 3 3"], Float[Tensor, "*bs 3"]]: if compute_smoothess_loss: smothness_loss, temporal_smoothness_loss = self.compute_smoothess_loss() return smothness_loss + temporal_smoothness_loss inpx, inpt = inp[:, :3], inp[:, 3:] # shift to [-1, 1] inpx = SceneBox.get_normalized_positions(inpx, self.aabb) * 2.0 - 1.0 inpt = inpt * 2.0 - 1.0 inp = torch.cat([inpx, inpt], dim=-1) output = self.temporal_kplanes_encoding(inp) if self.add_spatial_triplane: spatial_output = self.spatial_kplanes_encoding(inpx) output = torch.cat([output, spatial_output], dim=-1) output = self.decoder(output) if self.rotation_type == "6d": rotation_6d, translation = output[:, :6], output[:, 6:] R_mat = rotation_6d_to_matrix(rotation_6d) elif self.rotation_type == "quaternion": quat, translation = output[:, :4], output[:, 4:] # tanh and normalize quat = 
torch.tanh(quat) R_mat = quaternion_to_matrix(quat) # --------------- remove below --------------- # # add normalization # r = quat # norm = torch.sqrt( # r[:, 0] * r[:, 0] # + r[:, 1] * r[:, 1] # + r[:, 2] * r[:, 2] # + r[:, 3] * r[:, 3] # ) # q = r / norm[:, None] # R_mat = q # --------------- remove above --------------- # return R_mat, translation def compute_smoothess_loss( self, ): smothness_loss = self.temporal_kplanes_encoding.compute_plane_tv() temporal_smoothness_loss = ( self.temporal_kplanes_encoding.compute_temporal_smoothness() ) if self.add_spatial_triplane: smothness_loss += self.spatial_kplanes_encoding.compute_plane_tv() return smothness_loss, temporal_smoothness_loss def compute_loss( self, inp: Float[Tensor, "*bs 4"], trajectory: Float[Tensor, "*bs 3"], loss_func, ): inpx, inpt = inp[:, :3], inp[:, 3:] R, t = self(inp) rec_traj = torch.bmm(R, inpx.unsqueeze(-1)).squeeze(-1) + t rec_loss = loss_func(rec_traj, trajectory) return rec_loss ================================================ FILE: projects/uncleaned_train/motionrep/fields/triplane_field.py ================================================ import torch import torch.nn.functional as F from jaxtyping import Float, Int, Shaped from torch import Tensor, nn from typing import Optional, Sequence, Tuple, List from motionrep.field_components.encoding import TriplanesEncoding from motionrep.field_components.mlp import MLP from motionrep.data.scene_box import SceneBox class TriplaneFields(nn.Module): """Temporal Kplanes SE(3) fields. Args: aabb: axis-aligned bounding box. aabb[0] is the minimum (x,y,z) point. aabb[1] is the maximum (x,y,z) point. resolutions: resolutions of the kplanes. 
in an order of [x, y, z] """ def __init__( self, aabb: Float[Tensor, "2 3"], resolutions: Sequence[int], feat_dim: int = 64, init_a: float = 0.1, init_b: float = 0.5, reduce="sum", #: Literal["sum", "product", "cat"] = "sum", num_decoder_layers=2, decoder_hidden_size=64, output_dim: int = 96, zero_init: bool = False, ): super().__init__() self.register_buffer("aabb", aabb) self.output_dim = output_dim self.kplanes_encoding = TriplanesEncoding( resolutions, feat_dim, init_a, init_b, reduce ) self.decoder = MLP( feat_dim, num_decoder_layers, layer_width=decoder_hidden_size, out_dim=self.output_dim, skip_connections=None, activation=nn.ReLU(), out_activation=None, zero_init=zero_init, ) def forward( self, inp: Float[Tensor, "*bs 3"] ) -> Tuple[Float[Tensor, "*bs 3 3"], Float[Tensor, "*bs 3"]]: # shift to [-1, 1] inpx = SceneBox.get_normalized_positions(inp, self.aabb) * 2.0 - 1.0 output = self.kplanes_encoding(inpx) output = self.decoder(output) # split_size = output.shape[-1] // 3 # output = torch.stack(torch.split(output, split_size, dim=-1), dim=-1) return output def compute_smoothess_loss( self, ): smothness_loss = self.kplanes_encoding.compute_plane_tv() return smothness_loss def compute_entropy(p): return -torch.sum(p * torch.log(p + 1e-5), dim=1).mean() # Adding a small constant to prevent log(0) class TriplaneFieldsWithEntropy(nn.Module): """Temporal Kplanes SE(3) fields. Args: aabb: axis-aligned bounding box. aabb[0] is the minimum (x,y,z) point. aabb[1] is the maximum (x,y,z) point. resolutions: resolutions of the kplanes. 
in an order of [x, y, z] """ def __init__( self, aabb: Float[Tensor, "2 3"], resolutions: Sequence[int], feat_dim: int = 64, init_a: float = 0.1, init_b: float = 0.5, reduce="sum", #: Literal["sum", "product", "cat"] = "sum", num_decoder_layers=2, decoder_hidden_size=64, output_dim: int = 96, zero_init: bool = False, num_cls: int = 3, ): super().__init__() self.register_buffer("aabb", aabb) self.output_dim = output_dim self.num_cls = num_cls self.kplanes_encoding = TriplanesEncoding( resolutions, feat_dim, init_a, init_b, reduce ) self.decoder = MLP( feat_dim, num_decoder_layers, layer_width=decoder_hidden_size, out_dim=self.num_cls, skip_connections=None, activation=nn.ReLU(), out_activation=None, zero_init=zero_init, ) self.cls_embedding = torch.nn.Embedding(num_cls, output_dim) def forward( self, inp: Float[Tensor, "*bs 3"] ) -> Tuple[Float[Tensor, "*bs 3 3"], Float[Tensor, "1"]]: # shift to [-1, 1] inpx = SceneBox.get_normalized_positions(inp, self.aabb) * 2.0 - 1.0 output = self.kplanes_encoding(inpx) output = self.decoder(output) prob = F.softmax(output, dim=-1) entropy = compute_entropy(prob) cls_index = torch.tensor([0, 1, 2]).to(inp.device) cls_emb = self.cls_embedding(cls_index) output = torch.matmul(prob, cls_emb) return output, entropy def compute_smoothess_loss( self, ): smothness_loss = self.kplanes_encoding.compute_plane_tv() return smothness_loss ================================================ FILE: projects/uncleaned_train/motionrep/fields/video_triplane_disp_field.py ================================================ import torch import torch.nn.functional as F from jaxtyping import Float, Int, Shaped from torch import Tensor, nn from typing import Optional, Sequence, Tuple, List from motionrep.field_components.encoding import ( TriplanesEncoding, PlaneEncoding, TemporalNeRFEncoding, ) from motionrep.field_components.mlp import MLP from motionrep.data.scene_box import SceneBox from einops import rearrange, repeat class 
TriplaneDispFields(nn.Module): """Kplanes Displacement fields. [x, t, t] => [dx, dy] Args: aabb: axis-aligned bounding box. aabb[0] is the minimum (x,y,t) point. aabb[1] is the maximum (x,y,t) point. resolutions: resolutions of the kplanes. in an order of [x, y, t] """ def __init__( self, aabb: Float[Tensor, "2 3"], resolutions: Sequence[int], feat_dim: int = 64, init_a: float = 0.1, init_b: float = 0.5, reduce="cat", #: Literal["sum", "product", "cat"] = "sum", num_decoder_layers=2, decoder_hidden_size=64, output_dim: int = 2, zero_init: bool = False, ): super().__init__() if aabb is None: aabb = ( torch.tensor([[-1.0, -1.0, -1.0], [1.0, 1.0, 1.0]], dtype=torch.float32) * 1.1 ) self.register_buffer("aabb", aabb) self.output_dim = output_dim self.canonical_encoding = PlaneEncoding( resolutions[:2], feat_dim, init_a, init_b ) self.canonical_decoder = MLP( feat_dim, num_decoder_layers, layer_width=decoder_hidden_size, out_dim=3, skip_connections=None, activation=nn.ReLU(), out_activation=None, ) self.kplanes_encoding = TriplanesEncoding( resolutions, feat_dim, init_a, init_b, reduce ) if reduce == "cat": feat_dim = int(feat_dim * 3) self.decoder = MLP( feat_dim, int(num_decoder_layers * 3), layer_width=decoder_hidden_size, out_dim=self.output_dim, skip_connections=(2, 4), activation=nn.ReLU(), out_activation=None, zero_init=zero_init, ) def forward( self, inp: Float[Tensor, "*bs 3"] ) -> Tuple[Float[Tensor, "*bs 2"], Float[Tensor, "*bs 3"]]: # shift to [-1, 1] inp_norm = SceneBox.get_normalized_positions(inp, self.aabb) * 2.0 - 1.0 output = self.kplanes_encoding(inp_norm) # [*bs, 2] output = self.decoder(output) inpyx = inp_norm[..., :2].reshape(-1, 2) canonical_yx = inpyx + output ret_rgb_feat = self.canonical_encoding(canonical_yx) ret_rgb = self.canonical_decoder(ret_rgb_feat) return output, ret_rgb def compute_smoothess_loss( self, ): smothness_loss = self.kplanes_encoding.compute_plane_tv() smothness_canonical = self.canonical_encoding.compute_plane_tv() return 
smothness_loss + smothness_canonical def get_canonical( self, canonical_grid: Float[Tensor, "*bs 2"] ) -> Float[Tensor, "*bs 3"]: pad_can_grid = torch.cat( [canonical_grid, torch.zeros_like(canonical_grid[..., :1])], dim=-1 ) pad_can_norm = ( SceneBox.get_normalized_positions(pad_can_grid, self.aabb) * 2.0 - 1.0 ) inp_can_grid = pad_can_norm[..., :2] ret_rgb_feat = self.canonical_encoding(inp_can_grid) ret_rgb = self.canonical_decoder(ret_rgb_feat) return ret_rgb def sample_canonical( self, inp: Float[Tensor, "bs hw 3"], canonical_frame: Float[Tensor, "1 H W 3"], canonical_grid_yx: Float[Tensor, "bs hw 2"], ) -> Float[Tensor, "bs h w 3"]: # inp_norm = SceneBox.get_normalized_positions(inp, self.aabb) * 2.0 - 1.0 output = self.kplanes_encoding(inp_norm) # [-1, 2] output = self.decoder(output) inpyx = inp_norm[..., :2].reshape(-1, 2) canonical_yx = inpyx + output canonical_yx = canonical_yx * 1.1 can_ymin, can_ymax = ( canonical_grid_yx[..., 0].min(), canonical_grid_yx[..., 0].max(), ) can_xmin, can_xmax = ( canonical_grid_yx[..., 1].min(), canonical_grid_yx[..., 1].max(), ) canonical_yx[..., 0] = (canonical_yx[..., 0] - can_ymin) / ( can_ymax - can_ymin ) * 2.0 - 1.0 canonical_yx[..., 1] = (canonical_yx[..., 1] - can_xmin) / ( can_xmax - can_xmin ) * 2.0 - 1.0 canonical_xy = torch.cat( [canonical_yx[..., 1:2], canonical_yx[..., 0:1]], dim=-1 ) # use grid sample to sample the canonical frame # [B, C, H, W] canonical_frame = canonical_frame.permute(0, 3, 1, 2).expand( inp.shape[0], -1, -1, -1 ) H, W = canonical_frame.shape[-2:] canonical_xy = canonical_xy.reshape(-1, H, W, 2) rec = F.grid_sample(canonical_frame, canonical_xy, align_corners=True) rec = rearrange(rec, "b c h w -> b h w c") return rec class PlaneDynamicDispFields(nn.Module): """Plane Displacement fields. [x, t, t] => [dx, dy] Args: aabb: axis-aligned bounding box. aabb[0] is the minimum (x,y,t) point. aabb[1] is the maximum (x,y,t) point. resolutions: resolutions of the kplanes. 
in an order of [x, y, t] """ def __init__( self, aabb: Float[Tensor, "2 3"], resolutions: Sequence[int], feat_dim: int = 64, init_a: float = 0.1, init_b: float = 0.5, reduce="cat", #: Literal["sum", "product", "cat"] = "sum", num_decoder_layers=2, decoder_hidden_size=64, output_dim: int = 2, zero_init: bool = False, num_temporal_freq: int = 20, freq_min: float = 0.0, freq_max: float = 20, ): super().__init__() if aabb is None: aabb = ( torch.tensor([[-1.0, -1.0, -1.0], [1.0, 1.0, 1.0]], dtype=torch.float32) * 1.1 ) self.register_buffer("aabb", aabb) self.output_dim = output_dim self.canonical_encoding = PlaneEncoding( resolutions[:2], feat_dim, init_a, init_b ) self.canonical_decoder = MLP( feat_dim, num_decoder_layers, layer_width=decoder_hidden_size, out_dim=3, skip_connections=None, activation=nn.ReLU(), out_activation=None, ) self.deform_planes_encoding = PlaneEncoding( resolutions[:2], feat_dim, init_a, init_b ) self.num_temporal_freq = num_temporal_freq self.temporal_pos_encoding = TemporalNeRFEncoding( 1, num_temporal_freq, freq_min, freq_max, log_scale=False, ) self.decoder = MLP( feat_dim + self.temporal_pos_encoding.get_out_dim(), int(num_decoder_layers * 3), layer_width=decoder_hidden_size, out_dim=self.output_dim, skip_connections=(2, 4), activation=nn.ReLU(), out_activation=None, zero_init=zero_init, ) def forward( self, inp: Float[Tensor, "*bs 3"] ) -> Tuple[Float[Tensor, "*bs 2"], Float[Tensor, "*bs 3"]]: # shift to [-1, 1] inp_norm = SceneBox.get_normalized_positions(inp, self.aabb) * 2.0 - 1.0 inp_yx, inp_t = inp_norm[..., 0:2], inp_norm[..., 2:3] spatial_feat = self.deform_planes_encoding(inp_yx) temporal_enc = self.temporal_pos_encoding(inp_t) # [*bs, 2] output = self.decoder( torch.cat( [spatial_feat, temporal_enc.view(-1, temporal_enc.shape[-1])], dim=-1 ) ) canonical_yx = inp_yx.reshape(-1, 2) + output ret_rgb_feat = self.canonical_encoding(canonical_yx) ret_rgb = self.canonical_decoder(ret_rgb_feat) return output, ret_rgb def 
compute_smoothess_loss( self, ): smothness_loss = self.deform_planes_encoding.compute_plane_tv() smothness_canonical = self.canonical_encoding.compute_plane_tv() return smothness_loss + smothness_canonical def get_canonical( self, canonical_grid: Float[Tensor, "*bs 2"] ) -> Float[Tensor, "*bs 3"]: pad_can_grid = torch.cat( [canonical_grid, torch.zeros_like(canonical_grid[..., :1])], dim=-1 ) pad_can_norm = ( SceneBox.get_normalized_positions(pad_can_grid, self.aabb) * 2.0 - 1.0 ) inp_can_grid = pad_can_norm[..., :2] ret_rgb_feat = self.canonical_encoding(inp_can_grid) ret_rgb = self.canonical_decoder(ret_rgb_feat) return ret_rgb def sample_canonical( self, inp: Float[Tensor, "bs hw 3"], canonical_frame: Float[Tensor, "1 H W 3"], canonical_grid_yx: Float[Tensor, "bs hw 2"], ) -> Float[Tensor, "bs h w 3"]: inp_norm = SceneBox.get_normalized_positions(inp, self.aabb) * 2.0 - 1.0 inp_yx, inp_t = inp_norm[..., 0:2], inp_norm[..., 2:3] inp_yx = inp_yx.reshape(-1, 2) spatial_feat = self.deform_planes_encoding(inp_yx) temporal_enc = self.temporal_pos_encoding(inp_t.view(-1, 1)) # [*bs, 2] output = self.decoder(torch.cat([spatial_feat, temporal_enc], dim=-1)) canonical_yx = inp_yx + output canonical_yx = canonical_yx * 1.1 can_ymin, can_ymax = ( canonical_grid_yx[..., 0].min(), canonical_grid_yx[..., 0].max(), ) can_xmin, can_xmax = ( canonical_grid_yx[..., 1].min(), canonical_grid_yx[..., 1].max(), ) canonical_yx[..., 0] = (canonical_yx[..., 0] - can_ymin) / ( can_ymax - can_ymin ) * 2.0 - 1.0 canonical_yx[..., 1] = (canonical_yx[..., 1] - can_xmin) / ( can_xmax - can_xmin ) * 2.0 - 1.0 canonical_xy = torch.cat( [canonical_yx[..., 1:2], canonical_yx[..., 0:1]], dim=-1 ) # use grid sample to sample the canonical frame # [B, C, H, W] canonical_frame = canonical_frame.permute(0, 3, 1, 2).expand( inp.shape[0], -1, -1, -1 ) H, W = canonical_frame.shape[-2:] canonical_xy = canonical_xy.reshape(-1, H, W, 2) rec = F.grid_sample(canonical_frame, canonical_xy, align_corners=True) 
rec = rearrange(rec, "b c h w -> b h w c") return rec ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/arguments/__init__.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. # # For inquiries contact george.drettakis@inria.fr # from argparse import ArgumentParser, Namespace import sys import os class GroupParams: pass class ParamGroup: def __init__(self, parser: ArgumentParser, name : str, fill_none = False): group = parser.add_argument_group(name) for key, value in vars(self).items(): shorthand = False if key.startswith("_"): shorthand = True key = key[1:] t = type(value) value = value if not fill_none else None if shorthand: if t == bool: group.add_argument("--" + key, ("-" + key[0:1]), default=value, action="store_true") else: group.add_argument("--" + key, ("-" + key[0:1]), default=value, type=t) else: if t == bool: group.add_argument("--" + key, default=value, action="store_true") else: group.add_argument("--" + key, default=value, type=t) def extract(self, args): group = GroupParams() for arg in vars(args).items(): if arg[0] in vars(self) or ("_" + arg[0]) in vars(self): setattr(group, arg[0], arg[1]) return group class ModelParams(ParamGroup): def __init__(self, parser, sentinel=False): self.sh_degree = 3 self._source_path = "" self._model_path = "" self._images = "images" self._resolution = -1 self._white_background = False self.data_device = "cuda" self.eval = False super().__init__(parser, "Loading Parameters", sentinel) def extract(self, args): g = super().extract(args) g.source_path = os.path.abspath(g.source_path) return g class PipelineParams(ParamGroup): def __init__(self, parser): self.convert_SHs_python = False self.compute_cov3D_python = False self.debug = False 
super().__init__(parser, "Pipeline Parameters") class OptimizationParams(ParamGroup): def __init__(self, parser): self.iterations = 30_000 self.position_lr_init = 0.00016 self.position_lr_final = 0.0000016 self.position_lr_delay_mult = 0.01 self.position_lr_max_steps = 30_000 self.feature_lr = 0.0025 self.opacity_lr = 0.05 self.scaling_lr = 0.005 self.rotation_lr = 0.001 self.percent_dense = 0.01 self.lambda_dssim = 0.2 self.densification_interval = 100 self.opacity_reset_interval = 3000 self.densify_from_iter = 500 self.densify_until_iter = 15_000 self.densify_grad_threshold = 0.0002 super().__init__(parser, "Optimization Parameters") def get_combined_args(parser : ArgumentParser): cmdlne_string = sys.argv[1:] cfgfile_string = "Namespace()" args_cmdline = parser.parse_args(cmdlne_string) try: cfgfilepath = os.path.join(args_cmdline.model_path, "cfg_args") print("Looking for config file in", cfgfilepath) with open(cfgfilepath) as cfg_file: print("Config file found: {}".format(cfgfilepath)) cfgfile_string = cfg_file.read() except TypeError: print("Config file not found at") pass args_cfgfile = eval(cfgfile_string) merged_dict = vars(args_cfgfile).copy() for k,v in vars(args_cmdline).items(): if v != None: merged_dict[k] = v return Namespace(**merged_dict) ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/gaussian_renderer/__init__.py ================================================ ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/gaussian_renderer/depth_uv_render.py ================================================ import torch from motionrep.gaussian_3d.scene.gaussian_model import GaussianModel import math from diff_gaussian_rasterization import ( GaussianRasterizationSettings, GaussianRasterizer, ) from typing import Callable def render_uv_depth_w_gaussian( viewpoint_camera, pc: GaussianModel, pipe, bg_color: torch.Tensor, scaling_modifier=1.0, ): """ Render the 
scene. Background tensor (bg_color) must be on GPU! Args: point_disp: [N, 3] """ # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means screenspace_points = ( torch.zeros_like( pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda" ) + 0 ) try: screenspace_points.retain_grad() except: pass # Set up rasterization configuration tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) raster_settings = GaussianRasterizationSettings( image_height=int(viewpoint_camera.image_height), image_width=int(viewpoint_camera.image_width), tanfovx=tanfovx, tanfovy=tanfovy, bg=bg_color, scale_modifier=scaling_modifier, viewmatrix=viewpoint_camera.world_view_transform, projmatrix=viewpoint_camera.full_proj_transform, sh_degree=pc.active_sh_degree, campos=viewpoint_camera.camera_center, prefiltered=False, debug=pipe.debug, ) rasterizer = GaussianRasterizer(raster_settings=raster_settings) means3D = pc.get_xyz means2D = screenspace_points opacity = pc.get_opacity # If precomputed 3d covariance is provided, use it. If not, then it will be computed from # scaling / rotation by the rasterizer. scales = None rotations = None cov3D_precomp = None if pipe.compute_cov3D_python: cov3D_precomp = pc.get_covariance(scaling_modifier) else: scales = pc.get_scaling rotations = pc.get_rotation # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer. 
shs = None colors_precomp = None # project point motion to 2D using camera: w2c = viewpoint_camera.world_view_transform.transpose(0, 1) cam_plane_2_img = viewpoint_camera.cam_plane_2_img # [2, 2] R = w2c[:3, :3].unsqueeze(0) # [1, 3, 3] t = w2c[:3, 3].unsqueeze(0) # [1, 3] # [N, 3, 1] pts = torch.cat([pc._xyz, torch.ones_like(pc._xyz[:, 0:1])], dim=-1) pts_cam = w2c.unsqueeze(0) @ pts.unsqueeze(-1) # [N, 4, 1] # pts_cam = R @ (pc._xyz.unsqueeze(-1)) + t[:, None] depth = pts_cam[:, 2, 0] # [N] # print("depth", depth.shape, depth.max(), depth.mean(), depth.min()) # [N, 2] pts_cam_xy = pts_cam[:, :2, 0] / depth.unsqueeze(-1) pts_cam_xy_pixel = cam_plane_2_img.unsqueeze(0) @ pts_cam_xy.unsqueeze(-1) # [N, 2, 1] pts_cam_xy_pixel = pts_cam_xy_pixel.squeeze(-1) # [N, 2] colors_precomp = torch.cat( [pts_cam_xy_pixel, depth.unsqueeze(dim=-1)], dim=-1 ) # [N, 3] # print("converted 2D motion precompute: ", colors_precomp.shape, shs, colors_precomp.max(), colors_precomp.min(), colors_precomp.mean()) # Rasterize visible Gaussians to image, obtain their radii (on screen). rendered_image, radii = rasterizer( means3D=means3D, means2D=means2D, shs=shs, colors_precomp=colors_precomp, opacities=opacity, scales=scales, rotations=rotations, cov3D_precomp=cov3D_precomp, ) # Those Gaussians that were frustum culled or had a radius of 0 were not visible. # They will be excluded from value updates used in the splitting criteria. 
return { "render": rendered_image, "visibility_filter": radii > 0, "radii": radii, "pts_depth": depth, "pts_cam_xy_pixel": pts_cam_xy_pixel, } ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/gaussian_renderer/feat_render.py ================================================ import torch from motionrep.gaussian_3d.scene.gaussian_model import GaussianModel import math from diff_gaussian_rasterization import ( GaussianRasterizationSettings, GaussianRasterizer, ) from typing import Callable def render_feat_gaussian( viewpoint_camera, pc: GaussianModel, pipe, bg_color: torch.Tensor, points_feat: torch.Tensor, scaling_modifier=1.0, ): """ Render the scene. Background tensor (bg_color) must be on GPU! Args: point_disp: [N, 3] """ # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means screenspace_points = ( torch.zeros_like( pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda" ) + 0 ) try: screenspace_points.retain_grad() except: pass # Set up rasterization configuration tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) raster_settings = GaussianRasterizationSettings( image_height=int(viewpoint_camera.image_height), image_width=int(viewpoint_camera.image_width), tanfovx=tanfovx, tanfovy=tanfovy, bg=bg_color, scale_modifier=scaling_modifier, viewmatrix=viewpoint_camera.world_view_transform, projmatrix=viewpoint_camera.full_proj_transform, sh_degree=pc.active_sh_degree, campos=viewpoint_camera.camera_center, prefiltered=False, debug=pipe.debug, ) rasterizer = GaussianRasterizer(raster_settings=raster_settings) means3D = pc.get_xyz means2D = screenspace_points opacity = pc.get_opacity # If precomputed 3d covariance is provided, use it. If not, then it will be computed from # scaling / rotation by the rasterizer. 
scales = None rotations = None cov3D_precomp = None if pipe.compute_cov3D_python: cov3D_precomp = pc.get_covariance(scaling_modifier) else: scales = pc.get_scaling rotations = pc.get_rotation # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer. shs = None colors_precomp = points_feat assert (points_feat.shape[1] == 3) and (points_feat.shape[0] == means3D.shape[0]) # print("converted 2D motion precompute: ", colors_precomp.shape, shs, colors_precomp.max(), colors_precomp.min(), colors_precomp.mean()) # Rasterize visible Gaussians to image, obtain their radii (on screen). rendered_image, radii = rasterizer( means3D=means3D, means2D=means2D, shs=shs, colors_precomp=colors_precomp, opacities=opacity, scales=scales, rotations=rotations, cov3D_precomp=cov3D_precomp, ) # Those Gaussians that were frustum culled or had a radius of 0 were not visible. # They will be excluded from value updates used in the splitting criteria. return { "render": rendered_image, "visibility_filter": radii > 0, "radii": radii, } ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/gaussian_renderer/flow_depth_render.py ================================================ import torch from motionrep.gaussian_3d.scene.gaussian_model import GaussianModel import math from diff_gaussian_rasterization import ( GaussianRasterizationSettings, GaussianRasterizer, ) from typing import Callable def render_flow_depth_w_gaussian( viewpoint_camera, pc: GaussianModel, pipe, point_disp: torch.Tensor, bg_color: torch.Tensor, scaling_modifier=1.0, ): """ Render the scene. Background tensor (bg_color) must be on GPU! Args: point_disp: [N, 3] """ # Create zero tensor. 
We will use it to make pytorch return gradients of the 2D (screen-space) means screenspace_points = ( torch.zeros_like( pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda" ) + 0 ) try: screenspace_points.retain_grad() except: pass # Set up rasterization configuration tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) raster_settings = GaussianRasterizationSettings( image_height=int(viewpoint_camera.image_height), image_width=int(viewpoint_camera.image_width), tanfovx=tanfovx, tanfovy=tanfovy, bg=bg_color, scale_modifier=scaling_modifier, viewmatrix=viewpoint_camera.world_view_transform, projmatrix=viewpoint_camera.full_proj_transform, sh_degree=pc.active_sh_degree, campos=viewpoint_camera.camera_center, prefiltered=False, debug=pipe.debug, ) rasterizer = GaussianRasterizer(raster_settings=raster_settings) means3D = pc.get_xyz means2D = screenspace_points opacity = pc.get_opacity # If precomputed 3d covariance is provided, use it. If not, then it will be computed from # scaling / rotation by the rasterizer. scales = None rotations = None cov3D_precomp = None if pipe.compute_cov3D_python: cov3D_precomp = pc.get_covariance(scaling_modifier) else: scales = pc.get_scaling rotations = pc.get_rotation # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer. 
shs = None colors_precomp = None # project point motion to 2D using camera: w2c = viewpoint_camera.world_view_transform.transpose(0, 1) cam_plane_2_img = viewpoint_camera.cam_plane_2_img # [2, 2] R = w2c[:3, :3].unsqueeze(0) # [1, 3, 3] t = w2c[:3, 3].unsqueeze(0) # [1, 3] # [N, 3, 1] pts = torch.cat([pc._xyz, torch.ones_like(pc._xyz[:, 0:1])], dim=-1) pts_cam = w2c.unsqueeze(0) @ pts.unsqueeze(-1) # [N, 4, 1] # pts_cam = R @ (pc._xyz.unsqueeze(-1)) + t[:, None] depth = pts_cam[:, 2, 0] # [N] # print("depth", depth.shape, depth.max(), depth.mean(), depth.min()) point_disp_pad = torch.cat( [point_disp, torch.zeros_like(point_disp[:, 0:1])], dim=-1 ) # [N, 4] pts_motion = w2c.unsqueeze(0) @ point_disp_pad.unsqueeze(-1) # [N, 4, 1] # [N, 2] pts_motion_xy = pts_motion[:, :2, 0] / depth.unsqueeze(-1) pts_motion_xy_pixel = cam_plane_2_img.unsqueeze(0) @ pts_motion_xy.unsqueeze( -1 ) # [N, 2, 1] pts_motion_xy_pixel = pts_motion_xy_pixel.squeeze(-1) # [N, 2] colors_precomp = torch.cat( [pts_motion_xy_pixel, depth.unsqueeze(dim=-1)], dim=-1 ) # [N, 3] # print("converted 2D motion precompute: ", colors_precomp.shape, shs, colors_precomp.max(), colors_precomp.min(), colors_precomp.mean()) # Rasterize visible Gaussians to image, obtain their radii (on screen). rendered_image, radii = rasterizer( means3D=means3D, means2D=means2D, shs=shs, colors_precomp=colors_precomp, opacities=opacity, scales=scales, rotations=rotations, cov3D_precomp=cov3D_precomp, ) # Those Gaussians that were frustum culled or had a radius of 0 were not visible. # They will be excluded from value updates used in the splitting criteria. 
# return { # "render": rendered_image, # "viewspace_points": screenspace_points, # "visibility_filter": radii > 0, # "radii": radii, # } return {"render": rendered_image} ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/gaussian_renderer/motion_renderer.py ================================================ import torch from motionrep.gaussian_3d.scene.gaussian_model import GaussianModel import math from diff_gaussian_rasterization_wmotion import GaussianRasterizationWMotionSettings as GaussianRasterizationSettings_wmotion from diff_gaussian_rasterization_wmotion import GaussianRasterizerWMotion as GaussianRasterizer_wmotion from typing import Callable def render_motion_w_gaussian( viewpoint_camera, pc: GaussianModel, motion_fields: Callable, pipe, bg_color: torch.Tensor, scaling_modifier=1.0, point_motion=None, ): """ Render the scene. Background tensor (bg_color) must be on GPU! Args: point_motion: [N, num_feat, 3] or None if None. motion_fields will be called to sample point motion """ # Create zero tensor. 
We will use it to make pytorch return gradients of the 2D (screen-space) means screenspace_points = ( torch.zeros_like( pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda" ) + 0 ) try: screenspace_points.retain_grad() except: pass # Set up rasterization configuration tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) raster_settings_wmotion = GaussianRasterizationSettings_wmotion( image_height=int(viewpoint_camera.image_height), image_width=int(viewpoint_camera.image_width), tanfovx=tanfovx, tanfovy=tanfovy, bg=bg_color, scale_modifier=scaling_modifier, viewmatrix=viewpoint_camera.world_view_transform, projmatrix=viewpoint_camera.full_proj_transform, sh_degree=pc.active_sh_degree, campos=viewpoint_camera.camera_center, prefiltered=False, debug=pipe.debug, ) rasterizer_wm = GaussianRasterizer_wmotion(raster_settings=raster_settings_wmotion) means3D = pc.get_xyz means2D = screenspace_points opacity = pc.get_opacity # If precomputed 3d covariance is provided, use it. If not, then it will be computed from # scaling / rotation by the rasterizer. scales = None rotations = None cov3D_precomp = None if pipe.compute_cov3D_python: cov3D_precomp = pc.get_covariance(scaling_modifier) else: scales = pc.get_scaling rotations = pc.get_rotation # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer. 
shs = None colors_precomp = None if point_motion is None: xyz = pc._xyz # [N, num_feat, 3] point_motion = motion_fields(xyz) # project point motion to 2D using camera: w2c = viewpoint_camera.world_view_transform.transpose(0, 1) cam_plane_2_img = viewpoint_camera.cam_plane_2_img # [2, 2] R = w2c[:3, :3].unsqueeze(0) # [1, 3, 3] t = w2c[:3, 3].unsqueeze(0) # [1, 3] # [N, 3, 1] pts = torch.cat([pc._xyz, torch.ones_like(pc._xyz[:, 0:1])], dim=-1) pts_cam = w2c.unsqueeze(0) @ pts.unsqueeze(-1) # [N, 4, 1] # pts_cam = R @ (pc._xyz.unsqueeze(-1)) + t[:, None] depth = pts_cam[:, 2, 0] # [N] # print("depth", depth.shape, depth.max(), depth.mean(), depth.min()) # pts = torch.cat([pc._xyz, torch.ones_like(pc._xyz[:, 0:1])], dim=-1) # cam_pts = pts.unsqueeze(1) @ viewpoint_camera.full_proj_transform.unsqueeze(0) # [N, 1, 4] @ [N, 1, 4] # cam_pts = cam_pts.squeeze(1) # [N, 4] # depth = cam_pts[:, 3] # [N] point_motion_pad = torch.cat([point_motion, torch.zeros_like(point_motion[:, :, 0:1])], dim=-1) # [N, num_feat, 4] pts_motion = w2c.unsqueeze(0).unsqueeze(0) @ point_motion_pad.unsqueeze(-1) # [N, num_feat, 4, 1] # pts_motion = R.unsqueeze(1) @ (point_motion.unsqueeze(-1)) # [N, num_feat, 3, 1] # [N, num_feat, 2] pts_motion_xy = pts_motion[:, :, :2, 0] / depth.unsqueeze(-1).unsqueeze(-1) # [N, num_feat, 2] pts_motion_xy_pixel = cam_plane_2_img.unsqueeze(0).unsqueeze(0) @ pts_motion_xy.unsqueeze(-1) # [N, num_feat, 2, 1] pts_motion_xy_pixel = pts_motion_xy_pixel.squeeze(-1) # [N, num_feat, 2] pts_motion = pts_motion_xy_pixel.flatten(1, 2) # [N, num_feat * 2] colors_precomp = pts_motion # print("converted 2D motion precompute: ", colors_precomp.shape, shs, colors_precomp.max(), colors_precomp.min(), colors_precomp.mean()) # Rasterize visible Gaussians to image, obtain their radii (on screen). 
rendered_image, radii = rasterizer_wm( means3D=means3D, means2D=means2D, shs=shs, colors_precomp=colors_precomp, opacities=opacity, scales=scales, rotations=rotations, cov3D_precomp=cov3D_precomp, ) # Those Gaussians that were frustum culled or had a radius of 0 were not visible. # They will be excluded from value updates used in the splitting criteria. # return { # "render": rendered_image, # "viewspace_points": screenspace_points, # "visibility_filter": radii > 0, # "radii": radii, # } return {"render": rendered_image} ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/gaussian_renderer/render.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. # # For inquiries contact george.drettakis@inria.fr # import torch import math from diff_gaussian_rasterization import ( GaussianRasterizationSettings, GaussianRasterizer, ) from motionrep.gaussian_3d.scene.gaussian_model import GaussianModel def render_gaussian( viewpoint_camera, pc: GaussianModel, pipe, bg_color: torch.Tensor, scaling_modifier=1.0, override_color=None, cov3D_precomp=None, ): """ Render the scene. Background tensor (bg_color) must be on GPU! """ # Create zero tensor. 
We will use it to make pytorch return gradients of the 2D (screen-space) means screenspace_points = ( torch.zeros_like( pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda" ) + 0 ) try: screenspace_points.retain_grad() except: pass # Set up rasterization configuration tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) raster_settings = GaussianRasterizationSettings( image_height=int(viewpoint_camera.image_height), image_width=int(viewpoint_camera.image_width), tanfovx=tanfovx, tanfovy=tanfovy, bg=bg_color, scale_modifier=scaling_modifier, viewmatrix=viewpoint_camera.world_view_transform, projmatrix=viewpoint_camera.full_proj_transform, sh_degree=pc.active_sh_degree, campos=viewpoint_camera.camera_center, prefiltered=False, debug=pipe.debug, ) rasterizer = GaussianRasterizer(raster_settings=raster_settings) means3D = pc.get_xyz means2D = screenspace_points opacity = pc.get_opacity # If precomputed 3d covariance is provided, use it. If not, then it will be computed from # scaling / rotation by the rasterizer. scales = None rotations = None if pipe.compute_cov3D_python or cov3D_precomp is None: cov3D_precomp = pc.get_covariance(scaling_modifier) elif cov3D_precomp is None: scales = pc.get_scaling rotations = pc.get_rotation # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer. 
shs = None colors_precomp = None if override_color is None: if pipe.convert_SHs_python: shs_view = pc.get_features.transpose(1, 2).view( -1, 3, (pc.max_sh_degree + 1) ** 2 ) dir_pp = pc.get_xyz - viewpoint_camera.camera_center.repeat( pc.get_features.shape[0], 1 ) dir_pp_normalized = dir_pp / dir_pp.norm(dim=1, keepdim=True) sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized) colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0) else: shs = pc.get_features else: colors_precomp = override_color # Rasterize visible Gaussians to image, obtain their radii (on screen). rendered_image, radii = rasterizer( means3D=means3D, means2D=means2D, shs=shs, colors_precomp=colors_precomp, opacities=opacity, scales=scales, rotations=rotations, cov3D_precomp=cov3D_precomp, ) # Those Gaussians that were frustum culled or had a radius of 0 were not visible. # They will be excluded from value updates used in the splitting criteria. return { "render": rendered_image, "viewspace_points": screenspace_points, "visibility_filter": radii > 0, "radii": radii, } # return {"render": rendered_image} def gaussian_intrin_scale(x_or_y: torch.Tensor, w_or_h: float): ret = ((x_or_y + 1.0) * w_or_h - 1.0) * 0.5 return ret def render_arrow_in_screen(viewpoint_camera, points_3d): # project point motion to 2D using camera: w2c = viewpoint_camera.world_view_transform.transpose(0, 1) cam_plane_2_img = viewpoint_camera.cam_plane_2_img # [2, 2] cam_plane_2_img = viewpoint_camera.projection_matrix.transpose(0, 1) # [4, 4] full_proj_mat = viewpoint_camera.full_proj_transform # [N, 4] pts = torch.cat([points_3d, torch.ones_like(points_3d[:, 0:1])], dim=-1) # [N, 1, 4] <- [N, 1, 4] @ [1, 4, 4] pts_cam = pts.unsqueeze(-2) @ full_proj_mat.unsqueeze(0) # [N, 1, 4] # start here # pts: [N, 4] # [1, 4, 4] @ [N, 4, 1] -> [N, 4, 1] # from IPython import embed # embed() # pts_cam = torch.bmm( # full_proj_mat.T.unsqueeze(0), pts.unsqueeze(-1) # ) # K*[R,T]*[x,y,z,1]^T to get 2D projection of Gaussians # end 
here pts_cam = full_proj_mat.T.unsqueeze(0) @ pts.unsqueeze(-1) # print(pts_cam.shape) pts_cam = pts_cam.squeeze(-1) # [N, 4] pts_cam = pts_cam[:, :3] / pts_cam[:, 3:] # [N, 1, 3] # print(pts_cam, "after proj") pts_cam_yx_pixel = pts_cam[:, :2] # [N, 2] yx => xy # pts_cam_xy_pixel = torch.cat( # [pts_cam_xy_pixel[:, [1]], pts_cam_xy_pixel[:, [0]]], dim=-1 # ) pts_cam_x, pts_cam_y = pts_cam_yx_pixel[:, 0], pts_cam_yx_pixel[:, 1] w, h = viewpoint_camera.image_width, viewpoint_camera.image_height pts_cam_x = gaussian_intrin_scale(pts_cam_x, w) pts_cam_y = gaussian_intrin_scale(pts_cam_y, h) ret_pts_cam_xy = torch.cat( [pts_cam_x.unsqueeze(-1), pts_cam_y.unsqueeze(-1)], dim=-1 ) # print(ret_pts_cam_xy) return ret_pts_cam_xy def render_arrow_in_screen_back(viewpoint_camera, points_3d): # project point motion to 2D using camera: w2c = viewpoint_camera.world_view_transform.transpose(0, 1) cam_plane_2_img = viewpoint_camera.cam_plane_2_img # [2, 2] cam_plane_2_img = viewpoint_camera.projection_matrix.transpose(0, 1) from IPython import embed embed() R = w2c[:3, :3].unsqueeze(0) # [1, 3, 3] t = w2c[:3, 3].unsqueeze(0) # [1, 3] # [N, 3, 1] pts = torch.cat([points_3d, torch.ones_like(points_3d[:, 0:1])], dim=-1) pts_cam = w2c.unsqueeze(0) @ pts.unsqueeze(-1) # [N, 4, 1] # pts_cam = R @ (pc._xyz.unsqueeze(-1)) + t[:, None] depth = pts_cam[:, 2, 0] # [N] # print("depth", depth.shape, depth.max(), depth.mean(), depth.min()) # [N, 2] pts_cam_xy = pts_cam[:, :2, 0] / depth.unsqueeze(-1) pts_cam_xy_pixel = cam_plane_2_img.unsqueeze(0) @ pts_cam_xy.unsqueeze( -1 ) # [N, 2, 1] pts_cam_xy_pixel = pts_cam_xy_pixel.squeeze(-1) # [N, 2] # [N, 2] yx => xy pts_cam_xy_pixel = torch.cat( [pts_cam_xy_pixel[:, [1]], pts_cam_xy_pixel[:, [0]]], dim=-1 ) return pts_cam_xy_pixel # for spherecal harmonics C0 = 0.28209479177387814 C1 = 0.4886025119029199 C2 = [ 1.0925484305920792, -1.0925484305920792, 0.31539156525252005, -1.0925484305920792, 0.5462742152960396, ] C3 = [ -0.5900435899266435, 
2.890611442640554, -0.4570457994644658, 0.3731763325901154, -0.4570457994644658, 1.445305721320277, -0.5900435899266435, ] C4 = [ 2.5033429417967046, -1.7701307697799304, 0.9461746957575601, -0.6690465435572892, 0.10578554691520431, -0.6690465435572892, 0.47308734787878004, -1.7701307697799304, 0.6258357354491761, ] def eval_sh(deg, sh, dirs): """ Evaluate spherical harmonics at unit directions using hardcoded SH polynomials. Works with torch/np/jnp. ... Can be 0 or more batch dimensions. Args: deg: int SH deg. Currently, 0-3 supported sh: jnp.ndarray SH coeffs [..., C, (deg + 1) ** 2] dirs: jnp.ndarray unit directions [..., 3] Returns: [..., C] """ assert deg <= 4 and deg >= 0 coeff = (deg + 1) ** 2 assert sh.shape[-1] >= coeff result = C0 * sh[..., 0] if deg > 0: x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3] result = ( result - C1 * y * sh[..., 1] + C1 * z * sh[..., 2] - C1 * x * sh[..., 3] ) if deg > 1: xx, yy, zz = x * x, y * y, z * z xy, yz, xz = x * y, y * z, x * z result = ( result + C2[0] * xy * sh[..., 4] + C2[1] * yz * sh[..., 5] + C2[2] * (2.0 * zz - xx - yy) * sh[..., 6] + C2[3] * xz * sh[..., 7] + C2[4] * (xx - yy) * sh[..., 8] ) if deg > 2: result = ( result + C3[0] * y * (3 * xx - yy) * sh[..., 9] + C3[1] * xy * z * sh[..., 10] + C3[2] * y * (4 * zz - xx - yy) * sh[..., 11] + C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12] + C3[4] * x * (4 * zz - xx - yy) * sh[..., 13] + C3[5] * z * (xx - yy) * sh[..., 14] + C3[6] * x * (xx - 3 * yy) * sh[..., 15] ) if deg > 3: result = ( result + C4[0] * xy * (xx - yy) * sh[..., 16] + C4[1] * yz * (3 * xx - yy) * sh[..., 17] + C4[2] * xy * (7 * zz - 1) * sh[..., 18] + C4[3] * yz * (7 * zz - 3) * sh[..., 19] + C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20] + C4[5] * xz * (7 * zz - 3) * sh[..., 21] + C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22] + C4[7] * xz * (xx - 3 * yy) * sh[..., 23] + C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24] ) return result def RGB2SH(rgb): return (rgb - 
        0.5) / C0


def SH2RGB(sh):
    # Inverse of RGB2SH: degree-0 (DC) SH coefficient -> RGB in [0, 1].
    return sh * C0 + 0.5


================================================
FILE: projects/uncleaned_train/motionrep/gaussian_3d/scene/__init__.py
================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact george.drettakis@inria.fr
#

import os
import random
import numpy as np
import json

from motionrep.gaussian_3d.utils.system_utils import searchForMaxIteration
from motionrep.gaussian_3d.scene.dataset_readers import sceneLoadTypeCallbacks
from motionrep.gaussian_3d.scene.gaussian_model import GaussianModel
from motionrep.gaussian_3d.arguments import ModelParams
from motionrep.gaussian_3d.utils.camera_utils import (
    cameraList_from_camInfos,
    camera_to_JSON,
)


class Scene:
    """Owns a GaussianModel plus the train/test camera lists for one scene."""

    gaussians: GaussianModel

    def __init__(
        self,
        args: ModelParams,
        gaussians: GaussianModel,
        load_iteration=None,
        shuffle=True,
        resolution_scales=[1.0],  # NOTE(review): mutable default; never mutated here, but fragile
    ):
        """Load a COLMAP or Blender scene and attach it to ``gaussians``.

        :param args: model params; ``source_path``/``model_path`` are read here.
        :param gaussians: GaussianModel to populate (from saved ply or point cloud).
        :param load_iteration: checkpoint iteration to load; -1 picks the latest.
        :param shuffle: shuffle train/test cameras once after loading.
        :param resolution_scales: image scales to build camera lists for.
        """
        self.model_path = args.model_path
        self.loaded_iter = None
        self.gaussians = gaussians

        # Resolve which saved iteration to load, if any (-1 = newest on disk).
        if load_iteration:
            if load_iteration == -1:
                self.loaded_iter = searchForMaxIteration(
                    os.path.join(self.model_path, "point_cloud")
                )
            else:
                self.loaded_iter = load_iteration
            print("Loading trained model at iteration {}".format(self.loaded_iter))

        self.train_cameras = {}
        self.test_cameras = {}

        # Detect the dataset layout: COLMAP has a "sparse" dir, Blender has
        # transforms_train.json.
        if os.path.exists(os.path.join(args.source_path, "sparse")):
            scene_info = sceneLoadTypeCallbacks["Colmap"](
                args.source_path, args.images, args.eval
            )
        elif os.path.exists(os.path.join(args.source_path, "transforms_train.json")):
            print("Found transforms_train.json file, assuming Blender data set!")
            scene_info = sceneLoadTypeCallbacks["Blender"](
                args.source_path, args.white_background, args.eval
            )
        else:
            assert False, "Could not recognize scene type!"

        # Fresh run (no checkpoint): copy the input ply and dump all cameras
        # to cameras.json next to the model.
        if not self.loaded_iter:
            with open(scene_info.ply_path, "rb") as src_file, open(
                os.path.join(self.model_path, "input.ply"), "wb"
            ) as dest_file:
                dest_file.write(src_file.read())
            json_cams = []
            camlist = []
            if scene_info.test_cameras:
                camlist.extend(scene_info.test_cameras)
            if scene_info.train_cameras:
                camlist.extend(scene_info.train_cameras)
            for id, cam in enumerate(camlist):
                json_cams.append(camera_to_JSON(id, cam))
            with open(os.path.join(self.model_path, "cameras.json"), "w") as file:
                json.dump(json_cams, file)

        # Shuffle once, BEFORE the per-resolution loop, so every resolution
        # scale sees the cameras in the same order.
        if shuffle:
            random.shuffle(
                scene_info.train_cameras
            )  # Multi-res consistent random shuffling
            random.shuffle(
                scene_info.test_cameras
            )  # Multi-res consistent random shuffling

        self.cameras_extent = scene_info.nerf_normalization["radius"]

        for resolution_scale in resolution_scales:
            print("Loading Training Cameras")
            self.train_cameras[resolution_scale] = cameraList_from_camInfos(
                scene_info.train_cameras, resolution_scale, args
            )
            print("Loading Test Cameras")
            self.test_cameras[resolution_scale] = cameraList_from_camInfos(
                scene_info.test_cameras, resolution_scale, args
            )

        # Either restore Gaussians from the chosen checkpoint or initialize
        # them from the scene's point cloud.
        if self.loaded_iter:
            self.gaussians.load_ply(
                os.path.join(
                    self.model_path,
                    "point_cloud",
                    "iteration_" + str(self.loaded_iter),
                    "point_cloud.ply",
                )
            )
        else:
            self.gaussians.create_from_pcd(scene_info.point_cloud, self.cameras_extent)

    def save(self, iteration):
        # Persist the Gaussians under point_cloud/iteration_<n>/point_cloud.ply.
        point_cloud_path = os.path.join(
            self.model_path, "point_cloud/iteration_{}".format(iteration)
        )
        self.gaussians.save_ply(os.path.join(point_cloud_path, "point_cloud.ply"))

    def getTrainCameras(self, scale=1.0):
        # Camera list for one resolution scale (built in __init__).
        return self.train_cameras[scale]

    def getTestCameras(self, scale=1.0):
        # Camera list for one resolution scale (built in __init__).
        return self.test_cameras[scale]


================================================
FILE: projects/uncleaned_train/motionrep/gaussian_3d/scene/cameras.py
================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
# # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. # # For inquiries contact george.drettakis@inria.fr # import torch from torch import nn import numpy as np from motionrep.gaussian_3d.utils.graphics_utils import ( getWorld2View2, getProjectionMatrix, ) class Camera(nn.Module): def __init__( self, colmap_id, R, T, FoVx, FoVy, image, gt_alpha_mask, image_name, uid, trans=np.array([0.0, 0.0, 0.0]), scale=1.0, data_device="cuda", ): super(Camera, self).__init__() self.uid = uid self.colmap_id = colmap_id self.R = R self.T = T self.FoVx = FoVx self.FoVy = FoVy self.image_name = image_name try: self.data_device = torch.device(data_device) except Exception as e: print(e) print( f"[Warning] Custom device {data_device} failed, fallback to default cuda device" ) self.data_device = torch.device("cuda") self.original_image = image.clamp(0.0, 1.0).to(self.data_device) self.image_width = self.original_image.shape[2] self.image_height = self.original_image.shape[1] if gt_alpha_mask is not None: self.original_image *= gt_alpha_mask.to(self.data_device) else: self.original_image *= torch.ones( (1, self.image_height, self.image_width), device=self.data_device ) self.zfar = 100.0 self.znear = 0.01 self.trans = trans self.scale = scale self.world_view_transform = ( torch.tensor(getWorld2View2(R, T, trans, scale)).transpose(0, 1).cuda() ) self.projection_matrix = ( getProjectionMatrix( znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy ) .transpose(0, 1) .cuda() ) self.full_proj_transform = ( self.world_view_transform.unsqueeze(0).bmm( self.projection_matrix.unsqueeze(0) ) ).squeeze(0) self.camera_center = self.world_view_transform.inverse()[3, :3] class MiniCam: def __init__( self, width, height, fovy, fovx, znear, zfar, world_view_transform, full_proj_transform, ): self.image_width = width self.image_height = height self.FoVy = fovy self.FoVx = fovx self.znear = znear self.zfar = zfar 
self.world_view_transform = world_view_transform self.full_proj_transform = full_proj_transform view_inv = torch.inverse(self.world_view_transform) self.camera_center = view_inv[3][:3] ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/scene/colmap_loader.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. # # For inquiries contact george.drettakis@inria.fr # import numpy as np import collections import struct CameraModel = collections.namedtuple( "CameraModel", ["model_id", "model_name", "num_params"]) Camera = collections.namedtuple( "Camera", ["id", "model", "width", "height", "params"]) BaseImage = collections.namedtuple( "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"]) Point3D = collections.namedtuple( "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"]) CAMERA_MODELS = { CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3), CameraModel(model_id=1, model_name="PINHOLE", num_params=4), CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4), CameraModel(model_id=3, model_name="RADIAL", num_params=5), CameraModel(model_id=4, model_name="OPENCV", num_params=8), CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8), CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12), CameraModel(model_id=7, model_name="FOV", num_params=5), CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4), CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5), CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12) } CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model) for camera_model in CAMERA_MODELS]) CAMERA_MODEL_NAMES = dict([(camera_model.model_name, 
camera_model) for camera_model in CAMERA_MODELS]) def qvec2rotmat(qvec): return np.array([ [1 - 2 * qvec[2]**2 - 2 * qvec[3]**2, 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3], 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]], [2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3], 1 - 2 * qvec[1]**2 - 2 * qvec[3]**2, 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]], [2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2], 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1], 1 - 2 * qvec[1]**2 - 2 * qvec[2]**2]]) def rotmat2qvec(R): Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat K = np.array([ [Rxx - Ryy - Rzz, 0, 0, 0], [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0], [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0], [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0 eigvals, eigvecs = np.linalg.eigh(K) qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)] if qvec[0] < 0: qvec *= -1 return qvec class Image(BaseImage): def qvec2rotmat(self): return qvec2rotmat(self.qvec) def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"): """Read and unpack the next bytes from a binary file. :param fid: :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc. :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}. :param endian_character: Any of {@, =, <, >, !} :return: Tuple of read and unpacked values. """ data = fid.read(num_bytes) return struct.unpack(endian_character + format_char_sequence, data) def read_points3D_text(path): """ see: src/base/reconstruction.cc void Reconstruction::ReadPoints3DText(const std::string& path) void Reconstruction::WritePoints3DText(const std::string& path) """ xyzs = None rgbs = None errors = None with open(path, "r") as fid: while True: line = fid.readline() if not line: break line = line.strip() if len(line) > 0 and line[0] != "#": elems = line.split() xyz = np.array(tuple(map(float, elems[1:4]))) rgb = np.array(tuple(map(int, elems[4:7]))) error = np.array(float(elems[7])) if xyzs is None: xyzs = xyz[None, ...] 
rgbs = rgb[None, ...] errors = error[None, ...] else: xyzs = np.append(xyzs, xyz[None, ...], axis=0) rgbs = np.append(rgbs, rgb[None, ...], axis=0) errors = np.append(errors, error[None, ...], axis=0) return xyzs, rgbs, errors def read_points3D_binary(path_to_model_file): """ see: src/base/reconstruction.cc void Reconstruction::ReadPoints3DBinary(const std::string& path) void Reconstruction::WritePoints3DBinary(const std::string& path) """ with open(path_to_model_file, "rb") as fid: num_points = read_next_bytes(fid, 8, "Q")[0] xyzs = np.empty((num_points, 3)) rgbs = np.empty((num_points, 3)) errors = np.empty((num_points, 1)) for p_id in range(num_points): binary_point_line_properties = read_next_bytes( fid, num_bytes=43, format_char_sequence="QdddBBBd") xyz = np.array(binary_point_line_properties[1:4]) rgb = np.array(binary_point_line_properties[4:7]) error = np.array(binary_point_line_properties[7]) track_length = read_next_bytes( fid, num_bytes=8, format_char_sequence="Q")[0] track_elems = read_next_bytes( fid, num_bytes=8*track_length, format_char_sequence="ii"*track_length) xyzs[p_id] = xyz rgbs[p_id] = rgb errors[p_id] = error return xyzs, rgbs, errors def read_intrinsics_text(path): """ Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py """ cameras = {} with open(path, "r") as fid: while True: line = fid.readline() if not line: break line = line.strip() if len(line) > 0 and line[0] != "#": elems = line.split() camera_id = int(elems[0]) model = elems[1] assert model == "PINHOLE", "While the loader support other types, the rest of the code assumes PINHOLE" width = int(elems[2]) height = int(elems[3]) params = np.array(tuple(map(float, elems[4:]))) cameras[camera_id] = Camera(id=camera_id, model=model, width=width, height=height, params=params) return cameras def read_extrinsics_binary(path_to_model_file): """ see: src/base/reconstruction.cc void Reconstruction::ReadImagesBinary(const std::string& path) void 
Reconstruction::WriteImagesBinary(const std::string& path) """ images = {} with open(path_to_model_file, "rb") as fid: num_reg_images = read_next_bytes(fid, 8, "Q")[0] for _ in range(num_reg_images): binary_image_properties = read_next_bytes( fid, num_bytes=64, format_char_sequence="idddddddi") image_id = binary_image_properties[0] qvec = np.array(binary_image_properties[1:5]) tvec = np.array(binary_image_properties[5:8]) camera_id = binary_image_properties[8] image_name = "" current_char = read_next_bytes(fid, 1, "c")[0] while current_char != b"\x00": # look for the ASCII 0 entry image_name += current_char.decode("utf-8") current_char = read_next_bytes(fid, 1, "c")[0] num_points2D = read_next_bytes(fid, num_bytes=8, format_char_sequence="Q")[0] x_y_id_s = read_next_bytes(fid, num_bytes=24*num_points2D, format_char_sequence="ddq"*num_points2D) xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])), tuple(map(float, x_y_id_s[1::3]))]) point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3]))) images[image_id] = Image( id=image_id, qvec=qvec, tvec=tvec, camera_id=camera_id, name=image_name, xys=xys, point3D_ids=point3D_ids) return images def read_intrinsics_binary(path_to_model_file): """ see: src/base/reconstruction.cc void Reconstruction::WriteCamerasBinary(const std::string& path) void Reconstruction::ReadCamerasBinary(const std::string& path) """ cameras = {} with open(path_to_model_file, "rb") as fid: num_cameras = read_next_bytes(fid, 8, "Q")[0] for _ in range(num_cameras): camera_properties = read_next_bytes( fid, num_bytes=24, format_char_sequence="iiQQ") camera_id = camera_properties[0] model_id = camera_properties[1] model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name width = camera_properties[2] height = camera_properties[3] num_params = CAMERA_MODEL_IDS[model_id].num_params params = read_next_bytes(fid, num_bytes=8*num_params, format_char_sequence="d"*num_params) cameras[camera_id] = Camera(id=camera_id, model=model_name, width=width, 
height=height, params=np.array(params)) assert len(cameras) == num_cameras return cameras def read_extrinsics_text(path): """ Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py """ images = {} with open(path, "r") as fid: while True: line = fid.readline() if not line: break line = line.strip() if len(line) > 0 and line[0] != "#": elems = line.split() image_id = int(elems[0]) qvec = np.array(tuple(map(float, elems[1:5]))) tvec = np.array(tuple(map(float, elems[5:8]))) camera_id = int(elems[8]) image_name = elems[9] elems = fid.readline().split() xys = np.column_stack([tuple(map(float, elems[0::3])), tuple(map(float, elems[1::3]))]) point3D_ids = np.array(tuple(map(int, elems[2::3]))) images[image_id] = Image( id=image_id, qvec=qvec, tvec=tvec, camera_id=camera_id, name=image_name, xys=xys, point3D_ids=point3D_ids) return images def read_colmap_bin_array(path): """ Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_dense.py :param path: path to the colmap binary file. :return: nd array with the floating point values in the value """ with open(path, "rb") as fid: width, height, channels = np.genfromtxt(fid, delimiter="&", max_rows=1, usecols=(0, 1, 2), dtype=int) fid.seek(0) num_delimiter = 0 byte = fid.read(1) while True: if byte == b"&": num_delimiter += 1 if num_delimiter >= 3: break byte = fid.read(1) array = np.fromfile(fid, np.float32) array = array.reshape((width, height, channels), order="F") return np.transpose(array, (1, 0, 2)).squeeze() ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/scene/dataset_readers.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. 
# # For inquiries contact george.drettakis@inria.fr # import os import sys from PIL import Image from typing import NamedTuple from motionrep.gaussian_3d.scene.colmap_loader import ( read_extrinsics_text, read_intrinsics_text, qvec2rotmat, read_extrinsics_binary, read_intrinsics_binary, read_points3D_binary, read_points3D_text, ) from motionrep.gaussian_3d.utils.graphics_utils import ( getWorld2View2, focal2fov, fov2focal, ) import numpy as np import math import json from pathlib import Path from plyfile import PlyData, PlyElement from motionrep.gaussian_3d.utils.sh_utils import SH2RGB from motionrep.gaussian_3d.scene.gaussian_model import BasicPointCloud import torch import torch.nn as nn from motionrep.gaussian_3d.utils.graphics_utils import ( getWorld2View2, getProjectionMatrix, ) class CameraInfo(NamedTuple): uid: int R: np.array T: np.array FovY: np.array FovX: np.array image: np.array image_path: str image_name: str width: int height: int class SceneInfo(NamedTuple): point_cloud: BasicPointCloud train_cameras: list test_cameras: list nerf_normalization: dict ply_path: str def getNerfppNorm(cam_info): def get_center_and_diag(cam_centers): cam_centers = np.hstack(cam_centers) avg_cam_center = np.mean(cam_centers, axis=1, keepdims=True) center = avg_cam_center dist = np.linalg.norm(cam_centers - center, axis=0, keepdims=True) diagonal = np.max(dist) return center.flatten(), diagonal cam_centers = [] for cam in cam_info: W2C = getWorld2View2(cam.R, cam.T) C2W = np.linalg.inv(W2C) cam_centers.append(C2W[:3, 3:4]) center, diagonal = get_center_and_diag(cam_centers) radius = diagonal * 1.1 translate = -center return {"translate": translate, "radius": radius} def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder): cam_infos = [] for idx, key in enumerate(cam_extrinsics): sys.stdout.write("\r") # the exact output you're looking for: sys.stdout.write("Reading camera {}/{}".format(idx + 1, len(cam_extrinsics))) sys.stdout.flush() extr = cam_extrinsics[key] 
intr = cam_intrinsics[extr.camera_id] height = intr.height width = intr.width uid = intr.id R = np.transpose(qvec2rotmat(extr.qvec)) T = np.array(extr.tvec) if intr.model == "SIMPLE_PINHOLE": focal_length_x = intr.params[0] FovY = focal2fov(focal_length_x, height) FovX = focal2fov(focal_length_x, width) elif intr.model == "PINHOLE": focal_length_x = intr.params[0] focal_length_y = intr.params[1] FovY = focal2fov(focal_length_y, height) FovX = focal2fov(focal_length_x, width) else: assert ( False ), "Colmap camera model not handled: only undistorted datasets (PINHOLE or SIMPLE_PINHOLE cameras) supported!" image_path = os.path.join(images_folder, os.path.basename(extr.name)) image_name = os.path.basename(image_path).split(".")[0] image = Image.open(image_path) cam_info = CameraInfo( uid=uid, R=R, T=T, FovY=FovY, FovX=FovX, image=image, image_path=image_path, image_name=image_name, width=width, height=height, ) cam_infos.append(cam_info) sys.stdout.write("\n") return cam_infos def fetchPly(path): plydata = PlyData.read(path) vertices = plydata["vertex"] positions = np.vstack([vertices["x"], vertices["y"], vertices["z"]]).T colors = np.vstack([vertices["red"], vertices["green"], vertices["blue"]]).T / 255.0 normals = np.vstack([vertices["nx"], vertices["ny"], vertices["nz"]]).T return BasicPointCloud(points=positions, colors=colors, normals=normals) def storePly(path, xyz, rgb): # Define the dtype for the structured array dtype = [ ("x", "f4"), ("y", "f4"), ("z", "f4"), ("nx", "f4"), ("ny", "f4"), ("nz", "f4"), ("red", "u1"), ("green", "u1"), ("blue", "u1"), ] normals = np.zeros_like(xyz) elements = np.empty(xyz.shape[0], dtype=dtype) attributes = np.concatenate((xyz, normals, rgb), axis=1) elements[:] = list(map(tuple, attributes)) # Create the PlyData object and write to file vertex_element = PlyElement.describe(elements, "vertex") ply_data = PlyData([vertex_element]) ply_data.write(path) def readColmapSceneInfo(path, images, eval, llffhold=8): try: 
cameras_extrinsic_file = os.path.join(path, "sparse/0", "images.bin") cameras_intrinsic_file = os.path.join(path, "sparse/0", "cameras.bin") cam_extrinsics = read_extrinsics_binary(cameras_extrinsic_file) cam_intrinsics = read_intrinsics_binary(cameras_intrinsic_file) except: cameras_extrinsic_file = os.path.join(path, "sparse/0", "images.txt") cameras_intrinsic_file = os.path.join(path, "sparse/0", "cameras.txt") cam_extrinsics = read_extrinsics_text(cameras_extrinsic_file) cam_intrinsics = read_intrinsics_text(cameras_intrinsic_file) reading_dir = "images" if images == None else images cam_infos_unsorted = readColmapCameras( cam_extrinsics=cam_extrinsics, cam_intrinsics=cam_intrinsics, images_folder=os.path.join(path, reading_dir), ) cam_infos = sorted(cam_infos_unsorted.copy(), key=lambda x: x.image_name) if eval: train_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold != 0] test_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold == 0] else: train_cam_infos = cam_infos test_cam_infos = [] nerf_normalization = getNerfppNorm(train_cam_infos) ply_path = os.path.join(path, "sparse/0/points3D.ply") bin_path = os.path.join(path, "sparse/0/points3D.bin") txt_path = os.path.join(path, "sparse/0/points3D.txt") if not os.path.exists(ply_path): print( "Converting point3d.bin to .ply, will happen only the first time you open the scene." 
) try: xyz, rgb, _ = read_points3D_binary(bin_path) except: xyz, rgb, _ = read_points3D_text(txt_path) storePly(ply_path, xyz, rgb) try: pcd = fetchPly(ply_path) except: pcd = None scene_info = SceneInfo( point_cloud=pcd, train_cameras=train_cam_infos, test_cameras=test_cam_infos, nerf_normalization=nerf_normalization, ply_path=ply_path, ) return scene_info def readCamerasFromTransforms(path, transformsfile, white_background, extension=".png"): cam_infos = [] with open(os.path.join(path, transformsfile)) as json_file: contents = json.load(json_file) # camera_angle_x is the horizontal field of view # frames.file_path is the image name # frame.transform_matrix is the camera-to-world transform fovx = contents["camera_angle_x"] frames = contents["frames"] for idx, frame in enumerate(frames): cam_name = os.path.join(path, frame["file_path"] + extension) # NeRF 'transform_matrix' is a camera-to-world transform c2w = np.array(frame["transform_matrix"]) # change from OpenGL/Blender camera axes (Y up, Z back) to COLMAP (Y down, Z forward) c2w[:3, 1:3] *= -1 # get the world-to-camera transform and set R, T w2c = np.linalg.inv(c2w) R = np.transpose( w2c[:3, :3] ) # R is stored transposed due to 'glm' in CUDA code T = w2c[:3, 3] image_path = os.path.join(path, cam_name) image_name = Path(cam_name).stem image = Image.open(image_path) im_data = np.array(image.convert("RGBA")) bg = np.array([1, 1, 1]) if white_background else np.array([0, 0, 0]) norm_data = im_data / 255.0 arr = norm_data[:, :, :3] * norm_data[:, :, 3:4] + bg * ( 1 - norm_data[:, :, 3:4] ) image = Image.fromarray(np.array(arr * 255.0, dtype=np.byte), "RGB") fovy = focal2fov(fov2focal(fovx, image.size[0]), image.size[1]) FovY = fovy FovX = fovx cam_infos.append( CameraInfo( uid=idx, R=R, T=T, FovY=FovY, FovX=FovX, image=image, image_path=image_path, image_name=image_name, width=image.size[0], height=image.size[1], ) ) return cam_infos def readNerfSyntheticInfo(path, white_background, eval, extension=".png"): 
print("Reading Training Transforms") train_cam_infos = readCamerasFromTransforms( path, "transforms_train.json", white_background, extension ) print("Reading Test Transforms") test_cam_infos = readCamerasFromTransforms( path, "transforms_test.json", white_background, extension ) if not eval: train_cam_infos.extend(test_cam_infos) test_cam_infos = [] nerf_normalization = getNerfppNorm(train_cam_infos) ply_path = os.path.join(path, "points3d.ply") if not os.path.exists(ply_path): # Since this data set has no colmap data, we start with random points num_pts = 100_000 print(f"Generating random point cloud ({num_pts})...") # We create random points inside the bounds of the synthetic Blender scenes xyz = np.random.random((num_pts, 3)) * 2.6 - 1.3 shs = np.random.random((num_pts, 3)) / 255.0 pcd = BasicPointCloud( points=xyz, colors=SH2RGB(shs), normals=np.zeros((num_pts, 3)) ) storePly(ply_path, xyz, SH2RGB(shs) * 255) try: pcd = fetchPly(ply_path) except: pcd = None scene_info = SceneInfo( point_cloud=pcd, train_cameras=train_cam_infos, test_cameras=test_cam_infos, nerf_normalization=nerf_normalization, ply_path=ply_path, ) return scene_info sceneLoadTypeCallbacks = { "Colmap": readColmapSceneInfo, "Blender": readNerfSyntheticInfo, } # below used for easy rendering class NoImageCamera(nn.Module): def __init__( self, colmap_id, R, T, FoVx, FoVy, width, height, uid, trans=np.array([0.0, 0.0, 0.0]), scale=1.0, data_device="cuda", img_path=None, # not needed ): super(NoImageCamera, self).__init__() self.uid = uid self.colmap_id = colmap_id self.R = R self.T = T self.FoVx = FoVx self.FoVy = FoVy self.img_path = img_path try: self.data_device = torch.device(data_device) except Exception as e: print(e) print( f"[Warning] Custom device {data_device} failed, fallback to default cuda device" ) self.data_device = torch.device("cuda") self.image_width = width self.image_height = height self.zfar = 100.0 self.znear = 0.01 self.trans = trans self.scale = scale # world to camera, then 
transpose. # [4, 4] # w2c.transpose self.world_view_transform = ( torch.tensor(getWorld2View2(R, T, trans, scale)).transpose(0, 1).cuda() ) # [4, 4] self.projection_matrix = ( getProjectionMatrix( znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy ) .transpose(0, 1) .cuda() ) # # [4, 4]. points @ full_proj_transform => screen space. self.full_proj_transform = ( self.world_view_transform.unsqueeze(0).bmm( self.projection_matrix.unsqueeze(0) ) ).squeeze(0) self.camera_center = self.world_view_transform.inverse()[3, :3] # [2, 2]. # (w2c @ p) / depth => cam_plane # (p_in_cam / depth)[:2] @ cam_plane_2_img => [pixel_x, pixel_y] cam_plane => img_plane self.cam_plane_2_img = torch.tensor( [[ 0.5 * width / math.tan(self.FoVx / 2.0), 0.0], [0.0, 0.5 * height / math.tan(self.FoVy / 2.0)]] ).cuda() def fast_read_cameras_from_transform_file(file_path, width=1080, height=720): cam_infos = [] dir_name = os.path.dirname(file_path) with open(file_path) as json_file: contents = json.load(json_file) # camera_angle_x is the horizontal field of view # frames.file_path is the image name # frame.transform_matrix is the camera-to-world transform fovx = contents["camera_angle_x"] frames = contents["frames"] for idx, frame in enumerate(frames): # NeRF 'transform_matrix' is a camera-to-world transform c2w = np.array(frame["transform_matrix"]) # change from OpenGL/Blender camera axes (Y up, Z back) to COLMAP (Y down, Z forward) c2w[:3, 1:3] *= -1 # get the world-to-camera transform and set R, T w2c = np.linalg.inv(c2w) R = np.transpose( w2c[:3, :3] ) # R is stored transposed due to 'glm' in CUDA code T = w2c[:3, 3] fovy = focal2fov(fov2focal(fovx, width), height) FovY = fovy FovX = fovx img_path = os.path.join(dir_name, frame["file_path"] + ".png") cam_ = NoImageCamera( colmap_id=idx, R=R, T=T, FoVx=FovX, FoVy=FovY, width=width, height=height, uid=id, data_device="cuda", img_path=img_path, ) cam_infos.append(cam_) return cam_infos 
================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/scene/gaussian_model.py ================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact george.drettakis@inria.fr
#

import torch
import numpy as np
from motionrep.gaussian_3d.utils.general_utils import (
    inverse_sigmoid,
    get_expon_lr_func,
    build_rotation,
)
from torch import nn
import os
from motionrep.gaussian_3d.utils.system_utils import mkdir_p
from plyfile import PlyData, PlyElement
from motionrep.gaussian_3d.utils.sh_utils import RGB2SH
from simple_knn._C import distCUDA2
from motionrep.gaussian_3d.utils.graphics_utils import BasicPointCloud
from motionrep.gaussian_3d.utils.general_utils import (
    strip_symmetric,
    build_scaling_rotation,
)
from motionrep.gaussian_3d.utils.rigid_body_utils import (
    get_rigid_transform,
    matrix_to_quaternion,
    quaternion_multiply,
)


class GaussianModel:
    """3D Gaussian point model: positions, SH colour features, scales,
    rotations and opacities, plus densification/pruning bookkeeping."""

    def setup_functions(self):
        # Bind the (inverse-)activations that map raw stored parameters to
        # their physically meaningful ranges.
        def build_covariance_from_scaling_rotation(scaling, scaling_modifier, rotation):
            # Covariance = L @ L^T with L = R @ diag(scale); strip_symmetric
            # keeps only the unique entries of the symmetric matrix.
            L = build_scaling_rotation(scaling_modifier * scaling, rotation)
            actual_covariance = L @ L.transpose(1, 2)
            symm = strip_symmetric(actual_covariance)
            return symm

        self.scaling_activation = torch.exp
        self.scaling_inverse_activation = torch.log

        self.covariance_activation = build_covariance_from_scaling_rotation

        self.opacity_activation = torch.sigmoid
        self.inverse_opacity_activation = inverse_sigmoid

        self.rotation_activation = torch.nn.functional.normalize

    def __init__(self, sh_degree: int = 3):
        self.active_sh_degree = 0  # raised one band at a time via oneupSHdegree()
        self.max_sh_degree = sh_degree
        self._xyz = torch.empty(0)
        self._features_dc = torch.empty(0)  # DC spherical-harmonics coefficients
        self._features_rest = torch.empty(0)  # higher-order SH coefficients
        self._scaling = torch.empty(0)  # log-scales (exp activation)
        self._rotation = torch.empty(0)  # quaternions (normalised on access)
        self._opacity = torch.empty(0)  # logits (sigmoid activation)
        self.max_radii2D = torch.empty(0)
        self.xyz_gradient_accum = torch.empty(0)  # accumulated view-space grad norms
        self.denom = torch.empty(0)  # accumulation counts per point
        self.optimizer = None
        self.percent_dense = 0
        self.spatial_lr_scale = 0
        self.setup_functions()
        self.matched_inds = None  # cached NN indices for apply_discrete_offset_filds_with_R

    def capture(self):
        """Snapshot all model state (plus optimizer state) as a tuple for
        checkpointing; inverse of restore()."""
        if self.optimizer is None:
            optim_state = None
        else:
            optim_state = self.optimizer.state_dict()
        return (
            self.active_sh_degree,
            self._xyz,
            self._features_dc,
            self._features_rest,
            self._scaling,
            self._rotation,
            self._opacity,
            self.max_radii2D,
            self.xyz_gradient_accum,
            self.denom,
            optim_state,
            self.spatial_lr_scale,
        )

    def restore(self, model_args, training_args):
        """Restore state captured by capture(); pass training_args=None to
        skip optimizer re-creation."""
        (
            self.active_sh_degree,
            self._xyz,
            self._features_dc,
            self._features_rest,
            self._scaling,
            self._rotation,
            self._opacity,
            self.max_radii2D,
            xyz_gradient_accum,
            denom,
            opt_dict,
            self.spatial_lr_scale,
        ) = model_args
        if training_args is not None:
            self.training_setup(training_args)
        self.xyz_gradient_accum = xyz_gradient_accum
        self.denom = denom
        if opt_dict is not None:
            self.optimizer.load_state_dict(opt_dict)

    def capture_training_args(
        self,
    ):
        # Placeholder; training args are not serialised yet.
        pass

    @property
    def get_scaling(self):
        # exp(log-scale) -> strictly positive scales.
        return self.scaling_activation(self._scaling)

    @property
    def get_rotation(self):
        # L2-normalised quaternions.
        return self.rotation_activation(self._rotation)

    @property
    def get_xyz(self):
        return self._xyz

    @property
    def get_features(self):
        # DC term concatenated with the higher-order SH coefficients along dim 1.
        features_dc = self._features_dc
        features_rest = self._features_rest
        return torch.cat((features_dc, features_rest), dim=1)

    @property
    def get_opacity(self):
        return self.opacity_activation(self._opacity)

    def get_covariance(self, scaling_modifier=1):
        return self.covariance_activation(
            self.get_scaling, scaling_modifier, self._rotation
        )

    def oneupSHdegree(self):
        # Unlock one more SH band, up to max_sh_degree.
        if self.active_sh_degree < self.max_sh_degree:
            self.active_sh_degree += 1

    def create_from_pcd(self, pcd: BasicPointCloud, spatial_lr_scale: float):
        """Initialise all Gaussian parameters from a point cloud (on CUDA)."""
        self.spatial_lr_scale = spatial_lr_scale
        fused_point_cloud = torch.tensor(np.asarray(pcd.points)).float().cuda()
        fused_color = RGB2SH(torch.tensor(np.asarray(pcd.colors)).float().cuda())
        features \
            = (
            torch.zeros((fused_color.shape[0], 3, (self.max_sh_degree + 1) ** 2))
            .float()
            .cuda()
        )
        features[:, :3, 0] = fused_color  # typo here? (inherited upstream comment)
        features[:, 3:, 1:] = 0.0

        print("Number of points at initialisation : ", fused_point_cloud.shape[0])

        # Initial log-scale from clamped nearest-neighbour distances, isotropic
        # across the 3 axes.
        dist2 = torch.clamp_min(
            distCUDA2(torch.from_numpy(np.asarray(pcd.points)).float().cuda()),
            0.0000001,
        )
        scales = torch.log(torch.sqrt(dist2))[..., None].repeat(1, 3)
        rots = torch.zeros((fused_point_cloud.shape[0], 4), device="cuda")
        rots[:, 0] = 1  # identity quaternion

        opacities = inverse_sigmoid(
            0.1
            * torch.ones(
                (fused_point_cloud.shape[0], 1), dtype=torch.float, device="cuda"
            )
        )

        self._xyz = nn.Parameter(fused_point_cloud.requires_grad_(True))
        self._features_dc = nn.Parameter(
            features[:, :, 0:1].transpose(1, 2).contiguous().requires_grad_(True)
        )
        self._features_rest = nn.Parameter(
            features[:, :, 1:].transpose(1, 2).contiguous().requires_grad_(True)
        )
        self._scaling = nn.Parameter(scales.requires_grad_(True))
        self._rotation = nn.Parameter(rots.requires_grad_(True))
        self._opacity = nn.Parameter(opacities.requires_grad_(True))
        self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")

    def training_setup(self, training_args):
        """Create the per-attribute Adam optimizer and the xyz LR schedule."""
        self.percent_dense = training_args.percent_dense
        self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
        self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")

        # One param group per attribute so each gets its own learning rate.
        l = [
            {
                "params": [self._xyz],
                "lr": training_args.position_lr_init * self.spatial_lr_scale,
                "name": "xyz",
            },
            {
                "params": [self._features_dc],
                "lr": training_args.feature_lr,
                "name": "f_dc",
            },
            {
                "params": [self._features_rest],
                "lr": training_args.feature_lr / 20.0,
                "name": "f_rest",
            },
            {
                "params": [self._opacity],
                "lr": training_args.opacity_lr,
                "name": "opacity",
            },
            {
                "params": [self._scaling],
                "lr": training_args.scaling_lr,
                "name": "scaling",
            },
            {
                "params": [self._rotation],
                "lr": training_args.rotation_lr,
                "name": "rotation",
            },
        ]

        self.optimizer = torch.optim.Adam(l, lr=0.0, eps=1e-15)
        self.xyz_scheduler_args = get_expon_lr_func(
            lr_init=training_args.position_lr_init * self.spatial_lr_scale,
            lr_final=training_args.position_lr_final * self.spatial_lr_scale,
            lr_delay_mult=training_args.position_lr_delay_mult,
            max_steps=training_args.position_lr_max_steps,
        )

    def update_learning_rate(self, iteration):
        """Learning rate scheduling per step (only the xyz group is scheduled)."""
        for param_group in self.optimizer.param_groups:
            if param_group["name"] == "xyz":
                lr = self.xyz_scheduler_args(iteration)
                param_group["lr"] = lr
                return lr

    def construct_list_of_attributes(self):
        """Ordered PLY property names matching save_ply()'s column layout."""
        l = ["x", "y", "z", "nx", "ny", "nz"]
        # All channels except the 3 DC
        for i in range(self._features_dc.shape[1] * self._features_dc.shape[2]):
            l.append("f_dc_{}".format(i))
        for i in range(self._features_rest.shape[1] * self._features_rest.shape[2]):
            l.append("f_rest_{}".format(i))
        l.append("opacity")
        for i in range(self._scaling.shape[1]):
            l.append("scale_{}".format(i))
        for i in range(self._rotation.shape[1]):
            l.append("rot_{}".format(i))
        return l

    def save_ply(self, path):
        """Serialise all raw (pre-activation) parameters to a PLY file."""
        mkdir_p(os.path.dirname(path))

        xyz = self._xyz.detach().cpu().numpy()
        normals = np.zeros_like(xyz)
        f_dc = (
            self._features_dc.detach()
            .transpose(1, 2)
            .flatten(start_dim=1)
            .contiguous()
            .cpu()
            .numpy()
        )
        f_rest = (
            self._features_rest.detach()
            .transpose(1, 2)
            .flatten(start_dim=1)
            .contiguous()
            .cpu()
            .numpy()
        )
        opacities = self._opacity.detach().cpu().numpy()
        scale = self._scaling.detach().cpu().numpy()
        rotation = self._rotation.detach().cpu().numpy()

        dtype_full = [
            (attribute, "f4") for attribute in self.construct_list_of_attributes()
        ]

        elements = np.empty(xyz.shape[0], dtype=dtype_full)
        attributes = np.concatenate(
            (xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1
        )
        elements[:] = list(map(tuple, attributes))
        el = PlyElement.describe(elements, "vertex")
        PlyData([el]).write(path)

    def reset_opacity(self):
        """Clamp all opacities to at most 0.01 (via sigmoid space) and reset the
        optimizer state of the opacity group."""
        opacities_new = inverse_sigmoid(
            torch.min(self.get_opacity, torch.ones_like(self.get_opacity) * 0.01)
        )
        optimizable_tensors = \
            self.replace_tensor_to_optimizer(opacities_new, "opacity")
        self._opacity = optimizable_tensors["opacity"]

    def load_ply(self, path):
        """Load raw Gaussian parameters from a PLY written by save_ply()."""
        plydata = PlyData.read(path)

        xyz = np.stack(
            (
                np.asarray(plydata.elements[0]["x"]),
                np.asarray(plydata.elements[0]["y"]),
                np.asarray(plydata.elements[0]["z"]),
            ),
            axis=1,
        )
        opacities = np.asarray(plydata.elements[0]["opacity"])[..., np.newaxis]

        features_dc = np.zeros((xyz.shape[0], 3, 1))
        features_dc[:, 0, 0] = np.asarray(plydata.elements[0]["f_dc_0"])
        features_dc[:, 1, 0] = np.asarray(plydata.elements[0]["f_dc_1"])
        features_dc[:, 2, 0] = np.asarray(plydata.elements[0]["f_dc_2"])

        extra_f_names = [
            p.name
            for p in plydata.elements[0].properties
            if p.name.startswith("f_rest_")
        ]
        # Sort numerically so f_rest_10 comes after f_rest_9, not after f_rest_1.
        extra_f_names = sorted(extra_f_names, key=lambda x: int(x.split("_")[-1]))
        assert len(extra_f_names) == 3 * (self.max_sh_degree + 1) ** 2 - 3
        features_extra = np.zeros((xyz.shape[0], len(extra_f_names)))
        for idx, attr_name in enumerate(extra_f_names):
            features_extra[:, idx] = np.asarray(plydata.elements[0][attr_name])
        # Reshape (P,F*SH_coeffs) to (P, F, SH_coeffs except DC)
        features_extra = features_extra.reshape(
            (features_extra.shape[0], 3, (self.max_sh_degree + 1) ** 2 - 1)
        )

        scale_names = [
            p.name
            for p in plydata.elements[0].properties
            if p.name.startswith("scale_")
        ]
        scale_names = sorted(scale_names, key=lambda x: int(x.split("_")[-1]))
        scales = np.zeros((xyz.shape[0], len(scale_names)))
        for idx, attr_name in enumerate(scale_names):
            scales[:, idx] = np.asarray(plydata.elements[0][attr_name])

        rot_names = [
            p.name for p in plydata.elements[0].properties if p.name.startswith("rot")
        ]
        rot_names = sorted(rot_names, key=lambda x: int(x.split("_")[-1]))
        rots = np.zeros((xyz.shape[0], len(rot_names)))
        for idx, attr_name in enumerate(rot_names):
            rots[:, idx] = np.asarray(plydata.elements[0][attr_name])

        self._xyz = nn.Parameter(
            torch.tensor(xyz, dtype=torch.float, device="cuda").requires_grad_(True)
        )
        self._features_dc = nn.Parameter(
            torch.tensor(features_dc, dtype=torch.float, device="cuda")
            .transpose(1, 2)
            .contiguous()
            .requires_grad_(True)
        )
        self._features_rest = nn.Parameter(
            torch.tensor(features_extra, dtype=torch.float, device="cuda")
            .transpose(1, 2)
            .contiguous()
            .requires_grad_(True)
        )
        self._opacity = nn.Parameter(
            torch.tensor(opacities, dtype=torch.float, device="cuda").requires_grad_(
                True
            )
        )
        self._scaling = nn.Parameter(
            torch.tensor(scales, dtype=torch.float, device="cuda").requires_grad_(True)
        )
        self._rotation = nn.Parameter(
            torch.tensor(rots, dtype=torch.float, device="cuda").requires_grad_(True)
        )

        self.active_sh_degree = self.max_sh_degree

    def replace_tensor_to_optimizer(self, tensor, name):
        """Swap the parameter of group ``name`` for ``tensor`` inside the
        optimizer, zeroing its Adam moments; returns {name: new Parameter}."""
        optimizable_tensors = {}
        for group in self.optimizer.param_groups:
            if group["name"] == name:
                # NOTE(review): assumes the group already has optimizer state;
                # stored_state would be None before the first optimizer step.
                stored_state = self.optimizer.state.get(group["params"][0], None)
                stored_state["exp_avg"] = torch.zeros_like(tensor)
                stored_state["exp_avg_sq"] = torch.zeros_like(tensor)

                del self.optimizer.state[group["params"][0]]
                group["params"][0] = nn.Parameter(tensor.requires_grad_(True))
                self.optimizer.state[group["params"][0]] = stored_state

                optimizable_tensors[group["name"]] = group["params"][0]
        return optimizable_tensors

    def _prune_optimizer(self, mask):
        """Keep only the rows selected by boolean ``mask`` in every param group
        (and in its Adam moments); returns {group name: new Parameter}."""
        optimizable_tensors = {}
        for group in self.optimizer.param_groups:
            stored_state = self.optimizer.state.get(group["params"][0], None)
            if stored_state is not None:
                stored_state["exp_avg"] = stored_state["exp_avg"][mask]
                stored_state["exp_avg_sq"] = stored_state["exp_avg_sq"][mask]

                del self.optimizer.state[group["params"][0]]
                group["params"][0] = nn.Parameter(
                    (group["params"][0][mask].requires_grad_(True))
                )
                self.optimizer.state[group["params"][0]] = stored_state

                optimizable_tensors[group["name"]] = group["params"][0]
            else:
                # No optimizer state yet (no step taken): just shrink the param.
                group["params"][0] = nn.Parameter(
                    group["params"][0][mask].requires_grad_(True)
                )
                optimizable_tensors[group["name"]] = group["params"][0]
        return optimizable_tensors

    def prune_points(self, mask):
        """Remove the Gaussians selected by boolean ``mask`` from all tensors."""
        valid_points_mask = ~mask
        optimizable_tensors = \
            self._prune_optimizer(valid_points_mask)
        self._xyz = optimizable_tensors["xyz"]
        self._features_dc = optimizable_tensors["f_dc"]
        self._features_rest = optimizable_tensors["f_rest"]
        self._opacity = optimizable_tensors["opacity"]
        self._scaling = optimizable_tensors["scaling"]
        self._rotation = optimizable_tensors["rotation"]

        self.xyz_gradient_accum = self.xyz_gradient_accum[valid_points_mask]
        self.denom = self.denom[valid_points_mask]
        self.max_radii2D = self.max_radii2D[valid_points_mask]

    def cat_tensors_to_optimizer(self, tensors_dict):
        """Append new rows to every param group (with zeroed Adam moments for
        the new rows); returns {group name: extended Parameter}."""
        optimizable_tensors = {}
        for group in self.optimizer.param_groups:
            assert len(group["params"]) == 1
            extension_tensor = tensors_dict[group["name"]]
            stored_state = self.optimizer.state.get(group["params"][0], None)
            if stored_state is not None:
                stored_state["exp_avg"] = torch.cat(
                    (stored_state["exp_avg"], torch.zeros_like(extension_tensor)), dim=0
                )
                stored_state["exp_avg_sq"] = torch.cat(
                    (stored_state["exp_avg_sq"], torch.zeros_like(extension_tensor)),
                    dim=0,
                )

                del self.optimizer.state[group["params"][0]]
                group["params"][0] = nn.Parameter(
                    torch.cat(
                        (group["params"][0], extension_tensor), dim=0
                    ).requires_grad_(True)
                )
                self.optimizer.state[group["params"][0]] = stored_state

                optimizable_tensors[group["name"]] = group["params"][0]
            else:
                group["params"][0] = nn.Parameter(
                    torch.cat(
                        (group["params"][0], extension_tensor), dim=0
                    ).requires_grad_(True)
                )
                optimizable_tensors[group["name"]] = group["params"][0]

        return optimizable_tensors

    def densification_postfix(
        self,
        new_xyz,
        new_features_dc,
        new_features_rest,
        new_opacities,
        new_scaling,
        new_rotation,
    ):
        """Append freshly created Gaussians and reset densification statistics."""
        d = {
            "xyz": new_xyz,
            "f_dc": new_features_dc,
            "f_rest": new_features_rest,
            "opacity": new_opacities,
            "scaling": new_scaling,
            "rotation": new_rotation,
        }

        optimizable_tensors = self.cat_tensors_to_optimizer(d)
        self._xyz = optimizable_tensors["xyz"]
        self._features_dc = optimizable_tensors["f_dc"]
        self._features_rest = optimizable_tensors["f_rest"]
        self._opacity = optimizable_tensors["opacity"]
        self._scaling = optimizable_tensors["scaling"]
        self._rotation = optimizable_tensors["rotation"]

        self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
        self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
        self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")

    def densify_and_split(self, grads, grad_threshold, scene_extent, N=2):
        """Split large high-gradient Gaussians into N smaller samples and prune
        the originals."""
        n_init_points = self.get_xyz.shape[0]
        # Extract points that satisfy the gradient condition
        padded_grad = torch.zeros((n_init_points), device="cuda")
        padded_grad[: grads.shape[0]] = grads.squeeze()
        selected_pts_mask = torch.where(padded_grad >= grad_threshold, True, False)
        # Only split Gaussians already larger than percent_dense * extent.
        selected_pts_mask = torch.logical_and(
            selected_pts_mask,
            torch.max(self.get_scaling, dim=1).values
            > self.percent_dense * scene_extent,
        )

        # Sample N new centres from each selected Gaussian's own distribution.
        stds = self.get_scaling[selected_pts_mask].repeat(N, 1)
        means = torch.zeros((stds.size(0), 3), device="cuda")
        samples = torch.normal(mean=means, std=stds)
        rots = build_rotation(self._rotation[selected_pts_mask]).repeat(N, 1, 1)
        new_xyz = torch.bmm(rots, samples.unsqueeze(-1)).squeeze(-1) + self.get_xyz[
            selected_pts_mask
        ].repeat(N, 1)
        # Children are shrunk by a factor of 0.8 * N.
        new_scaling = self.scaling_inverse_activation(
            self.get_scaling[selected_pts_mask].repeat(N, 1) / (0.8 * N)
        )
        new_rotation = self._rotation[selected_pts_mask].repeat(N, 1)
        new_features_dc = self._features_dc[selected_pts_mask].repeat(N, 1, 1)
        new_features_rest = self._features_rest[selected_pts_mask].repeat(N, 1, 1)
        new_opacity = self._opacity[selected_pts_mask].repeat(N, 1)

        self.densification_postfix(
            new_xyz,
            new_features_dc,
            new_features_rest,
            new_opacity,
            new_scaling,
            new_rotation,
        )

        # Prune the parents; the freshly appended children are kept (zeros).
        prune_filter = torch.cat(
            (
                selected_pts_mask,
                torch.zeros(N * selected_pts_mask.sum(), device="cuda", dtype=bool),
            )
        )
        self.prune_points(prune_filter)

    def densify_and_clone(self, grads, grad_threshold, scene_extent):
        """Clone small high-gradient Gaussians in place."""
        # Extract points that satisfy the gradient condition
        selected_pts_mask = torch.where(
            torch.norm(grads, dim=-1) >=
            grad_threshold, True, False
        )
        # Only clone Gaussians that are still small.
        selected_pts_mask = torch.logical_and(
            selected_pts_mask,
            torch.max(self.get_scaling, dim=1).values
            <= self.percent_dense * scene_extent,
        )

        new_xyz = self._xyz[selected_pts_mask]
        new_features_dc = self._features_dc[selected_pts_mask]
        new_features_rest = self._features_rest[selected_pts_mask]
        new_opacities = self._opacity[selected_pts_mask]
        new_scaling = self._scaling[selected_pts_mask]
        new_rotation = self._rotation[selected_pts_mask]

        self.densification_postfix(
            new_xyz,
            new_features_dc,
            new_features_rest,
            new_opacities,
            new_scaling,
            new_rotation,
        )

    def densify_and_prune(self, max_grad, min_opacity, extent, max_screen_size):
        """One densification round: clone + split high-gradient Gaussians, then
        prune transparent and oversized ones."""
        grads = self.xyz_gradient_accum / self.denom
        grads[grads.isnan()] = 0.0  # points never updated yet have denom == 0

        self.densify_and_clone(grads, max_grad, extent)
        self.densify_and_split(grads, max_grad, extent)

        prune_mask = (self.get_opacity < min_opacity).squeeze()
        if max_screen_size:
            # Also prune Gaussians that are huge in screen space or world space.
            big_points_vs = self.max_radii2D > max_screen_size
            big_points_ws = self.get_scaling.max(dim=1).values > 0.1 * extent
            prune_mask = torch.logical_or(
                torch.logical_or(prune_mask, big_points_vs), big_points_ws
            )
        self.prune_points(prune_mask)

        torch.cuda.empty_cache()

    def add_densification_stats(self, viewspace_point_tensor, update_filter):
        """Accumulate the 2D (view-space) position gradient norms consumed by
        densify_and_prune()."""
        self.xyz_gradient_accum[update_filter] += torch.norm(
            viewspace_point_tensor.grad[update_filter, :2], dim=-1, keepdim=True
        )
        self.denom[update_filter] += 1

    def apply_discrete_offset_filds(self, origin_points, offsets):
        """Return a new GaussianModel with each Gaussian translated by the
        offset of its nearest origin point.

        Args:
            origin_points: (N_r, 3)
            offsets: (N_r, 3)
        """
        # since origin points and self._xyz might not be matched, we need to first
        # compute the distance between origin points and self._xyz,
        # then find the nearest origin point for each Gaussian.

        # [N_r, num_points]
        dist = torch.cdist(origin_points, self._xyz)

        # idx: [num_points] — index of the nearest origin point per Gaussian.
        _, idx = torch.min(dist, dim=0)

        # apply offsets
        new_xyz = self._xyz + offsets[idx]

        if self.optimizer is None:
            optim_state = None
        else:
            optim_state = self.optimizer.state_dict()
        new_model_args = (
            self.active_sh_degree,
            new_xyz,
            self._features_dc,
            self._features_rest,
            self._scaling,
            self._rotation,
            self._opacity,
            self.max_radii2D,
            self.xyz_gradient_accum,
            self.denom,
            optim_state,
            self.spatial_lr_scale,
        )
        ret_gaussian = GaussianModel(self.max_sh_degree)
        ret_gaussian.restore(new_model_args, None)
        return ret_gaussian

    def apply_discrete_offset_filds_with_R(self, origin_points, offsets, topk=6):
        """Like apply_discrete_offset_filds, but also rotates each Gaussian by
        the rigid transform fitted to its ``topk`` matched offsets.

        Args:
            origin_points: (N_r, 3)
            offsets: (N_r, 3)
        """
        # since origin points and self._xyz might not be matched, we need to first
        # compute the distance between origin points and self._xyz,
        # then find the nearest origin points for each Gaussian.
        if self.matched_inds is None:
            # [N_r, num_points]; negated so topk selects the *nearest* points.
            dist = torch.cdist(origin_points, self._xyz) * -1.0

            # idxs: [topk, num_points]
            print(dist.shape, topk, dist[0])
            _, idxs = torch.topk(dist, topk, dim=0)
            self.matched_inds = idxs  # cache for subsequent calls
        else:
            idxs = self.matched_inds

        # [topk, num_points, 3] => [num_points, topk, 3]
        matched_topk_offsets = offsets[idxs].transpose(0, 1)
        source_points = origin_points[idxs].transpose(0, 1)

        # [num_points, 3, 3] rotation and [num_points, 3, 1] translation.
        R, t = get_rigid_transform(source_points, source_points + matched_topk_offsets)

        # new_xyz = R @ self._xyz.unsqueeze(dim=-1) + t
        # new_xyz = new_xyz.squeeze(dim=-1)
        avg_offsets = matched_topk_offsets.mean(dim=1)
        new_xyz = self._xyz + avg_offsets  # offset directly

        new_rotation = quaternion_multiply(matrix_to_quaternion(R), self._rotation)

        if self.optimizer is None:
            optim_state = None
        else:
            optim_state = self.optimizer.state_dict()
        new_model_args = (
            self.active_sh_degree,
            new_xyz,
            self._features_dc,
            self._features_rest,
            self._scaling,
            new_rotation,
            self._opacity,
            self.max_radii2D,
            self.xyz_gradient_accum,
            self.denom,
            optim_state,
            self.spatial_lr_scale,
        )
        ret_gaussian = \
            GaussianModel(self.max_sh_degree)
        ret_gaussian.restore(new_model_args, None)
        return ret_gaussian

    def apply_se3_fields(
        self,
        se3_model,
        timestamp: float,
        freeze_mask=None,
    ):
        """Return a new GaussianModel advected by an SE(3) field at ``timestamp``.

        Args:
            se3_model: SE3Model; maps (x, y, z, t) -> (R, t)
            timestamp: float. in range [0, 1]
            freeze_mask: [N] boolean; True entries are left untouched.
        """
        inp_time = torch.ones_like(self._xyz[:, 0:1]) * timestamp
        inp = torch.cat([self._xyz, inp_time], dim=-1)

        if freeze_mask is not None:
            moving_mask = torch.logical_not(freeze_mask)
            inp = inp[moving_mask, ...]

        # [bs, 3, 3] rotations, [bs, 3] translations.
        R, t = se3_model(inp)
        # print("abs t mean", torch.abs(t).mean(dim=0))

        # new_xyz = (R @ self._xyz.unsqueeze(dim=-1)).squeeze(dim=-1) + t
        if freeze_mask is None:
            # Translation applied to positions; rotation composed into quaternions.
            new_xyz = self._xyz + t
            new_rotation = quaternion_multiply(matrix_to_quaternion(R), self._rotation)
        else:
            new_xyz = self._xyz.clone()
            new_xyz[moving_mask, ...] += t
            new_rotation = self._rotation.clone()
            new_rotation[moving_mask, ...] = quaternion_multiply(
                matrix_to_quaternion(R), self._rotation[moving_mask, ...]
            )

        if self.optimizer is None:
            optim_state = None
        else:
            optim_state = self.optimizer.state_dict()

        new_model_args = (
            self.active_sh_degree,
            new_xyz,
            self._features_dc,
            self._features_rest,
            self._scaling,
            new_rotation,
            self._opacity,
            self.max_radii2D,
            self.xyz_gradient_accum,
            self.denom,
            optim_state,
            self.spatial_lr_scale,
        )
        ret_gaussian = GaussianModel(self.max_sh_degree)
        ret_gaussian.restore(new_model_args, None)
        return ret_gaussian

    def apply_offset_fields(self, offset_field, timestamp: float):
        """Return a new GaussianModel with positions displaced by an offset field.

        Args:
            offset_field: maps (x, y, z, t) -> (dx, dy, dz)
            timestamp: float. in range [0, 1]
        """
        inp_time = torch.ones_like(self._xyz[:, 0:1]) * timestamp
        inp = torch.cat([self._xyz, inp_time], dim=-1)

        # [bs, 3] offsets.
        offsets = offset_field(inp)

        new_xyz = self._xyz + offsets

        if self.optimizer is None:
            optim_state = None
        else:
            optim_state = self.optimizer.state_dict()
        new_model_args = (
            self.active_sh_degree,
            new_xyz,
            self._features_dc,
            self._features_rest,
            self._scaling,
            self._rotation,
            self._opacity,
            self.max_radii2D,
            self.xyz_gradient_accum,
            self.denom,
            optim_state,
            self.spatial_lr_scale,
        )
        ret_gaussian = GaussianModel(self.max_sh_degree)
        ret_gaussian.restore(new_model_args, None)
        return ret_gaussian

    def apply_offset_fields_with_R(self, offset_field, timestamp: float, eps=1e-2):
        """Displace positions by the mean sampled offset and rotate each
        Gaussian by the rigid transform fitted to probe points around it.

        Args:
            offset_field: maps (x, y, z, t) -> (dx, dy, dz)
            timestamp: float. in range [0, 1]
            eps: half-size of the probe simplex used to estimate rotation.
        """
        # [5, 3] probe offsets around each Gaussian centre (note: 5 rows —
        # the original "# [4, 3]" comment was stale).
        inp_perterb = (
            torch.tensor(
                [
                    [0.0, 0.0, 0.0],
                    # add this will coplanar?
                    [+eps, -eps, -eps],
                    [-eps, -eps, +eps],
                    [-eps, +eps, -eps],
                    [+eps, +eps, +eps],
                ],
            )
            .to(self._xyz.device)
            .float()
        )

        # => [N, 5, 3]
        source_points = self._xyz.unsqueeze(dim=1) + inp_perterb.unsqueeze(dim=0)
        num_points = source_points.shape[0]
        inpx = source_points.flatten(end_dim=1)
        inp_time = torch.ones_like(inpx[:, 0:1]) * timestamp
        inp = torch.cat([inpx, inp_time], dim=-1)

        sampled_offsets = offset_field(inp).reshape((num_points, -1, 3))
        R, t = get_rigid_transform(source_points, source_points + sampled_offsets)

        # new_xyz = R @ self._xyz.unsqueeze(dim=-1) + t
        # new_xyz = new_xyz.squeeze(dim=-1)
        avg_offsets = sampled_offsets.mean(dim=1)
        new_xyz = self._xyz + avg_offsets  # offset directly

        new_rotation = quaternion_multiply(matrix_to_quaternion(R), self._rotation)

        if self.optimizer is None:
            optim_state = None
        else:
            optim_state = self.optimizer.state_dict()
        new_model_args = (
            self.active_sh_degree,
            new_xyz,
            self._features_dc,
            self._features_rest,
            self._scaling,
            new_rotation,
            self._opacity,
            self.max_radii2D,
            self.xyz_gradient_accum,
            self.denom,
            optim_state,
            self.spatial_lr_scale,
        )
        ret_gaussian = GaussianModel(self.max_sh_degree)
        ret_gaussian.restore(new_model_args, None)
        return ret_gaussian

    def \
        init_from_mesh(
        self,
        mesh_path: str,
        num_gaussians: int = 10000,
    ):
        """Initialise Gaussian parameters by sampling ``num_gaussians`` points
        on a textured triangle mesh (requires point_cloud_utils)."""
        import point_cloud_utils as pcu

        mesh = pcu.load_triangle_mesh(mesh_path)
        v, f = mesh.v, mesh.f
        v_n = pcu.estimate_mesh_normals(v, f)
        vert_colors = mesh.vertex_data.colors

        # Random barycentric samples on the mesh surface.
        fid, bc = pcu.sample_mesh_random(v, f, num_gaussians)

        # Interpolate the vertex positions and normals using the returned barycentric coordinates
        # to get sample positions and normals
        rand_positions = pcu.interpolate_barycentric_coords(f, fid, bc, v)
        rand_normals = pcu.interpolate_barycentric_coords(f, fid, bc, v_n)
        rand_colors = pcu.interpolate_barycentric_coords(f, fid, bc, vert_colors)[:, :3]

        # copy original pointcloud init functions
        fused_point_cloud = torch.tensor(np.asarray(rand_positions)).float().cuda()
        fused_color = RGB2SH(torch.tensor(np.asarray(rand_colors)).float().cuda())
        features = (
            torch.zeros((fused_color.shape[0], 3, (self.max_sh_degree + 1) ** 2))
            .float()
            .cuda()
        )
        features[:, :3, 0] = fused_color  # typo here? (inherited upstream comment)
        features[:, 3:, 1:] = 0.0

        print("Number of points at initialisation : ", fused_point_cloud.shape[0])

        dist2 = torch.clamp_min(
            distCUDA2(torch.from_numpy(np.asarray(rand_positions)).float().cuda()),
            0.0000001,
        )
        scales = torch.log(torch.sqrt(dist2))[..., None].repeat(1, 3)
        rots = torch.zeros((fused_point_cloud.shape[0], 4), device="cuda")
        rots[:, 0] = 1  # identity quaternion

        opacities = inverse_sigmoid(
            0.1
            * torch.ones(
                (fused_point_cloud.shape[0], 1), dtype=torch.float, device="cuda"
            )
        )

        self._xyz = nn.Parameter(fused_point_cloud.requires_grad_(True))
        self._features_dc = nn.Parameter(
            features[:, :, 0:1].transpose(1, 2).contiguous().requires_grad_(True)
        )
        self._features_rest = nn.Parameter(
            features[:, :, 1:].transpose(1, 2).contiguous().requires_grad_(True)
        )
        self._scaling = nn.Parameter(scales.requires_grad_(True))
        self._rotation = nn.Parameter(rots.requires_grad_(True))
        self._opacity = nn.Parameter(opacities.requires_grad_(True))
        self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")

    def detach_grad(
        self,
    ):
        """Freeze all Gaussian parameters (disable gradients)."""
        self._xyz.requires_grad = False
        self._features_dc.requires_grad = False
        self._features_rest.requires_grad = False
        self._scaling.requires_grad = False
        self._rotation.requires_grad = False
        self._opacity.requires_grad = False

    def apply_mask(self, mask):
        """Return a new GaussianModel containing only the Gaussians selected by
        boolean ``mask`` (optimizer state is dropped)."""
        new_xyz = self._xyz[mask]
        # Densification stats are only sliced when they match _xyz's shape.
        if self.xyz_gradient_accum.shape == self._xyz.shape:
            new_xyz_gradient_accum = self.xyz_gradient_accum[mask]
            new_denom = self.denom[mask]
        else:
            new_xyz_gradient_accum = self.xyz_gradient_accum
            new_denom = self.denom

        new_model_args = (
            self.active_sh_degree,
            new_xyz,
            self._features_dc[mask],
            self._features_rest[mask],
            self._scaling[mask],
            self._rotation[mask],
            self._opacity[mask],
            self.max_radii2D,
            new_xyz_gradient_accum,
            new_denom,
            None,
            self.spatial_lr_scale,
        )
        ret_gaussian = GaussianModel(self.max_sh_degree)
        ret_gaussian.restore(new_model_args, None)
        return ret_gaussian

    @torch.no_grad()
    def extract_fields(self, resolution=128, num_blocks=16, relax_ratio=1.5):
        """Rasterise Gaussian opacities into a dense [resolution]^3 occupancy
        grid, processed block-by-block to bound memory."""
        # resolution: resolution of field
        block_size = 2 / num_blocks

        # NOTE(review): block_size is a float, so this is a float modulo —
        # it only passes for resolutions divisible by 2/num_blocks; confirm intended.
        assert resolution % block_size == 0
        split_size = resolution // num_blocks

        opacities = self.get_opacity

        # pre-filter low opacity gaussians to save computation
        mask = (opacities > 0.005).squeeze(1)

        opacities = opacities[mask]
        xyzs = self.get_xyz[mask]
        stds = self.get_scaling[mask]

        # normalize to ~ [-1, 1]
        mn, mx = xyzs.amin(0), xyzs.amax(0)
        self.center = (mn + mx) / 2
        self.scale = 1.0 / (mx - mn).amax().item()

        print("gaussian center, scale", self.center, self.scale)

        xyzs = (xyzs - self.center) * self.scale
        stds = stds * self.scale

        covs = self.covariance_activation(stds, 1, self._rotation[mask])

        # tile
        device = opacities.device
        occ = torch.zeros([resolution] * 3, dtype=torch.float32, device=device)

        X = torch.linspace(-1, 1, resolution).split(split_size)
        Y = torch.linspace(-1, 1, resolution).split(split_size)
        Z = torch.linspace(-1, 1, resolution).split(split_size)

        # loop blocks (assume max size of gaussian is small than relax_ratio * block_size !!!)
        for xi, xs in enumerate(X):
            for yi, ys in enumerate(Y):
                for zi, zs in enumerate(Z):
                    xx, yy, zz = torch.meshgrid(xs, ys, zs)
                    # sample points [M, 3]
                    pts = torch.cat(
                        [xx.reshape(-1, 1), yy.reshape(-1, 1), zz.reshape(-1, 1)],
                        dim=-1,
                    ).to(device)
                    # in-tile gaussians mask (note: shadows the opacity mask above)
                    vmin, vmax = pts.amin(0), pts.amax(0)
                    vmin -= block_size * relax_ratio
                    vmax += block_size * relax_ratio
                    mask = (xyzs < vmax).all(-1) & (xyzs > vmin).all(-1)
                    # if hit no gaussian, continue to next block
                    if not mask.any():
                        continue
                    mask_xyzs = xyzs[mask]  # [L, 3]
                    mask_covs = covs[mask]  # [L, 6]
                    mask_opas = opacities[mask].view(1, -1)  # [L, 1] --> [1, L]

                    # query per point-gaussian pair.
                    g_pts = pts.unsqueeze(1).repeat(
                        1, mask_covs.shape[0], 1
                    ) - mask_xyzs.unsqueeze(
                        0
                    )  # [M, L, 3]
                    g_covs = mask_covs.unsqueeze(0).repeat(
                        pts.shape[0], 1, 1
                    )  # [M, L, 6]

                    # batch on gaussian to avoid OOM
                    batch_g = 1024
                    val = 0
                    for start in range(0, g_covs.shape[1], batch_g):
                        end = min(start + batch_g, g_covs.shape[1])
                        # NOTE(review): gaussian_3d_coeff is not defined in this
                        # chunk — presumably a module-level helper; verify import.
                        w = gaussian_3d_coeff(
                            g_pts[:, start:end].reshape(-1, 3),
                            g_covs[:, start:end].reshape(-1, 6),
                        ).reshape(
                            pts.shape[0], -1
                        )  # [M, l]
                        val += (mask_opas[:, start:end] * w).sum(-1)

                    # kiui.lo(val, mask_opas, w)
                    occ[
                        xi * split_size : xi * split_size + len(xs),
                        yi * split_size : yi * split_size + len(ys),
                        zi * split_size : zi * split_size + len(zs),
                    ] = val.reshape(len(xs), len(ys), len(zs))

        return occ

    def extract_mesh(self, path, density_thresh=1, resolution=128, decimate_target=1e5):
        # Marching-cubes mesh extraction from the occupancy grid.
        # NOTE(review): this method is truncated in the visible source chunk.
        os.makedirs(os.path.dirname(path), exist_ok=True)

        from motionrep.gaussian_3d.scene.mesh import Mesh
        from motionrep.gaussian_3d.scene.mesh_utils import decimate_mesh, clean_mesh

        occ = self.extract_fields(resolution).detach().cpu().numpy()
        print(occ.shape, occ.min(), occ.max(), occ.mean(), "occ stats")
        print(np.percentile(occ, [0, 1, 5, 10, 50, 90, 95, 99, 100]), "occ percentiles")

        import mcubes

        vertices, triangles = mcubes.marching_cubes(occ, density_thresh)
        # Map voxel indices back to the normalised [-1, 1] cube.
        vertices = vertices / (resolution - 1.0) * 2 - 1

        # transform back to the
original space vertices = vertices / self.scale + self.center.detach().cpu().numpy() vertices, triangles = clean_mesh( vertices, triangles, remesh=True, remesh_size=0.015 ) if decimate_target > 0 and triangles.shape[0] > decimate_target: vertices, triangles = decimate_mesh(vertices, triangles, decimate_target) v = torch.from_numpy(vertices.astype(np.float32)).contiguous().cuda() f = torch.from_numpy(triangles.astype(np.int32)).contiguous().cuda() print( f"[INFO] marching cubes result: {v.shape} ({v.min().item()}-{v.max().item()}), {f.shape}" ) mesh = Mesh(v=v, f=f, device="cuda") return mesh def gaussian_3d_coeff(xyzs, covs): # xyzs: [N, 3] # covs: [N, 6] x, y, z = xyzs[:, 0], xyzs[:, 1], xyzs[:, 2] a, b, c, d, e, f = ( covs[:, 0], covs[:, 1], covs[:, 2], covs[:, 3], covs[:, 4], covs[:, 5], ) # eps must be small enough !!! inv_det = 1 / (a * d * f + 2 * e * c * b - e**2 * a - c**2 * d - b**2 * f + 1e-24) inv_a = (d * f - e**2) * inv_det inv_b = (e * c - b * f) * inv_det inv_c = (e * b - c * d) * inv_det inv_d = (a * f - c**2) * inv_det inv_e = (b * c - e * a) * inv_det inv_f = (a * d - b**2) * inv_det power = ( -0.5 * (x**2 * inv_a + y**2 * inv_d + z**2 * inv_f) - x * y * inv_b - x * z * inv_c - y * z * inv_e ) power[power > 0] = -1e10 # abnormal values... 
make weights 0 return torch.exp(power) ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/scene/mesh.py ================================================ import os import cv2 import torch import trimesh import numpy as np def dot(x, y): return torch.sum(x * y, -1, keepdim=True) def length(x, eps=1e-20): return torch.sqrt(torch.clamp(dot(x, x), min=eps)) def safe_normalize(x, eps=1e-20): return x / length(x, eps) class Mesh: def __init__( self, v=None, f=None, vn=None, fn=None, vt=None, ft=None, albedo=None, vc=None, # vertex color device=None, ): self.device = device self.v = v self.vn = vn self.vt = vt self.f = f self.fn = fn self.ft = ft # only support a single albedo self.albedo = albedo # support vertex color is no albedo self.vc = vc self.ori_center = 0 self.ori_scale = 1 @classmethod def load( cls, path=None, resize=True, renormal=True, retex=False, front_dir="+z", **kwargs, ): # assume init with kwargs if path is None: mesh = cls(**kwargs) # obj supports face uv elif path.endswith(".obj"): mesh = cls.load_obj(path, **kwargs) # trimesh only supports vertex uv, but can load more formats else: mesh = cls.load_trimesh(path, **kwargs) print(f"[Mesh loading] v: {mesh.v.shape}, f: {mesh.f.shape}") # auto-normalize if resize: mesh.auto_size() # auto-fix normal if renormal or mesh.vn is None: mesh.auto_normal() print(f"[Mesh loading] vn: {mesh.vn.shape}, fn: {mesh.fn.shape}") # auto-fix texcoords if retex or (mesh.albedo is not None and mesh.vt is None): mesh.auto_uv(cache_path=path) print(f"[Mesh loading] vt: {mesh.vt.shape}, ft: {mesh.ft.shape}") # rotate front dir to +z if front_dir != "+z": # axis switch if "-z" in front_dir: T = torch.tensor( [[1, 0, 0], [0, 1, 0], [0, 0, -1]], device=mesh.device, dtype=torch.float32, ) elif "+x" in front_dir: T = torch.tensor( [[0, 0, 1], [0, 1, 0], [1, 0, 0]], device=mesh.device, dtype=torch.float32, ) elif "-x" in front_dir: T = torch.tensor( [[0, 0, -1], [0, 1, 0], [1, 0, 
0]], device=mesh.device, dtype=torch.float32, ) elif "+y" in front_dir: T = torch.tensor( [[1, 0, 0], [0, 0, 1], [0, 1, 0]], device=mesh.device, dtype=torch.float32, ) elif "-y" in front_dir: T = torch.tensor( [[1, 0, 0], [0, 0, -1], [0, 1, 0]], device=mesh.device, dtype=torch.float32, ) else: T = torch.tensor( [[1, 0, 0], [0, 1, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32, ) # rotation (how many 90 degrees) if "1" in front_dir: T @= torch.tensor( [[0, -1, 0], [1, 0, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32, ) elif "2" in front_dir: T @= torch.tensor( [[1, 0, 0], [0, -1, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32, ) elif "3" in front_dir: T @= torch.tensor( [[0, 1, 0], [-1, 0, 0], [0, 0, 1]], device=mesh.device, dtype=torch.float32, ) mesh.v @= T mesh.vn @= T return mesh # load from obj file @classmethod def load_obj(cls, path, albedo_path=None, device=None): assert os.path.splitext(path)[-1] == ".obj" mesh = cls() # device if device is None: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") mesh.device = device # load obj with open(path, "r") as f: lines = f.readlines() def parse_f_v(fv): # pass in a vertex term of a face, return {v, vt, vn} (-1 if not provided) # supported forms: # f v1 v2 v3 # f v1/vt1 v2/vt2 v3/vt3 # f v1/vt1/vn1 v2/vt2/vn2 v3/vt3/vn3 # f v1//vn1 v2//vn2 v3//vn3 xs = [int(x) - 1 if x != "" else -1 for x in fv.split("/")] xs.extend([-1] * (3 - len(xs))) return xs[0], xs[1], xs[2] # NOTE: we ignore usemtl, and assume the mesh ONLY uses one material (first in mtl) vertices, texcoords, normals = [], [], [] faces, tfaces, nfaces = [], [], [] mtl_path = None for line in lines: split_line = line.split() # empty line if len(split_line) == 0: continue prefix = split_line[0].lower() # mtllib if prefix == "mtllib": mtl_path = split_line[1] # usemtl elif prefix == "usemtl": pass # ignored # v/vn/vt elif prefix == "v": vertices.append([float(v) for v in split_line[1:]]) elif prefix == "vn": 
normals.append([float(v) for v in split_line[1:]]) elif prefix == "vt": val = [float(v) for v in split_line[1:]] texcoords.append([val[0], 1.0 - val[1]]) elif prefix == "f": vs = split_line[1:] nv = len(vs) v0, t0, n0 = parse_f_v(vs[0]) for i in range(nv - 2): # triangulate (assume vertices are ordered) v1, t1, n1 = parse_f_v(vs[i + 1]) v2, t2, n2 = parse_f_v(vs[i + 2]) faces.append([v0, v1, v2]) tfaces.append([t0, t1, t2]) nfaces.append([n0, n1, n2]) mesh.v = torch.tensor(vertices, dtype=torch.float32, device=device) mesh.vt = ( torch.tensor(texcoords, dtype=torch.float32, device=device) if len(texcoords) > 0 else None ) mesh.vn = ( torch.tensor(normals, dtype=torch.float32, device=device) if len(normals) > 0 else None ) mesh.f = torch.tensor(faces, dtype=torch.int32, device=device) mesh.ft = ( torch.tensor(tfaces, dtype=torch.int32, device=device) if len(texcoords) > 0 else None ) mesh.fn = ( torch.tensor(nfaces, dtype=torch.int32, device=device) if len(normals) > 0 else None ) # see if there is vertex color use_vertex_color = False if mesh.v.shape[1] == 6: use_vertex_color = True mesh.vc = mesh.v[:, 3:] mesh.v = mesh.v[:, :3] print(f"[load_obj] use vertex color: {mesh.vc.shape}") # try to load texture image if not use_vertex_color: # try to retrieve mtl file mtl_path_candidates = [] if mtl_path is not None: mtl_path_candidates.append(mtl_path) mtl_path_candidates.append( os.path.join(os.path.dirname(path), mtl_path) ) mtl_path_candidates.append(path.replace(".obj", ".mtl")) mtl_path = None for candidate in mtl_path_candidates: if os.path.exists(candidate): mtl_path = candidate break # if albedo_path is not provided, try retrieve it from mtl if mtl_path is not None and albedo_path is None: with open(mtl_path, "r") as f: lines = f.readlines() for line in lines: split_line = line.split() # empty line if len(split_line) == 0: continue prefix = split_line[0] # NOTE: simply use the first map_Kd as albedo! 
if "map_Kd" in prefix: albedo_path = os.path.join(os.path.dirname(path), split_line[1]) print(f"[load_obj] use texture from: {albedo_path}") break # still not found albedo_path, or the path doesn't exist if albedo_path is None or not os.path.exists(albedo_path): # init an empty texture print(f"[load_obj] init empty albedo!") # albedo = np.random.rand(1024, 1024, 3).astype(np.float32) albedo = np.ones((1024, 1024, 3), dtype=np.float32) * np.array( [0.5, 0.5, 0.5] ) # default color else: albedo = cv2.imread(albedo_path, cv2.IMREAD_UNCHANGED) albedo = cv2.cvtColor(albedo, cv2.COLOR_BGR2RGB) albedo = albedo.astype(np.float32) / 255 print(f"[load_obj] load texture: {albedo.shape}") # import matplotlib.pyplot as plt # plt.imshow(albedo) # plt.show() mesh.albedo = torch.tensor(albedo, dtype=torch.float32, device=device) return mesh @classmethod def load_trimesh(cls, path, device=None): mesh = cls() # device if device is None: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") mesh.device = device # use trimesh to load ply/glb, assume only has one single RootMesh... 
_data = trimesh.load(path) if isinstance(_data, trimesh.Scene): if len(_data.geometry) == 1: _mesh = list(_data.geometry.values())[0] else: # manual concat, will lose texture _concat = [] for g in _data.geometry.values(): if isinstance(g, trimesh.Trimesh): _concat.append(g) _mesh = trimesh.util.concatenate(_concat) else: _mesh = _data if _mesh.visual.kind == "vertex": vertex_colors = _mesh.visual.vertex_colors vertex_colors = np.array(vertex_colors[..., :3]).astype(np.float32) / 255 mesh.vc = torch.tensor(vertex_colors, dtype=torch.float32, device=device) print(f"[load_trimesh] use vertex color: {mesh.vc.shape}") elif _mesh.visual.kind == "texture": _material = _mesh.visual.material if isinstance(_material, trimesh.visual.material.PBRMaterial): texture = np.array(_material.baseColorTexture).astype(np.float32) / 255 elif isinstance(_material, trimesh.visual.material.SimpleMaterial): texture = ( np.array(_material.to_pbr().baseColorTexture).astype(np.float32) / 255 ) else: raise NotImplementedError( f"material type {type(_material)} not supported!" ) mesh.albedo = torch.tensor(texture, dtype=torch.float32, device=device) print(f"[load_trimesh] load texture: {texture.shape}") else: texture = np.ones((1024, 1024, 3), dtype=np.float32) * np.array( [0.5, 0.5, 0.5] ) mesh.albedo = torch.tensor(texture, dtype=torch.float32, device=device) print(f"[load_trimesh] failed to load texture.") vertices = _mesh.vertices try: texcoords = _mesh.visual.uv texcoords[:, 1] = 1 - texcoords[:, 1] except Exception as e: texcoords = None try: normals = _mesh.vertex_normals except Exception as e: normals = None # trimesh only support vertex uv... 
faces = tfaces = nfaces = _mesh.faces mesh.v = torch.tensor(vertices, dtype=torch.float32, device=device) mesh.vt = ( torch.tensor(texcoords, dtype=torch.float32, device=device) if texcoords is not None else None ) mesh.vn = ( torch.tensor(normals, dtype=torch.float32, device=device) if normals is not None else None ) mesh.f = torch.tensor(faces, dtype=torch.int32, device=device) mesh.ft = ( torch.tensor(tfaces, dtype=torch.int32, device=device) if texcoords is not None else None ) mesh.fn = ( torch.tensor(nfaces, dtype=torch.int32, device=device) if normals is not None else None ) return mesh # aabb def aabb(self): return torch.min(self.v, dim=0).values, torch.max(self.v, dim=0).values # unit size @torch.no_grad() def auto_size(self): vmin, vmax = self.aabb() self.ori_center = (vmax + vmin) / 2 self.ori_scale = 1.2 / torch.max(vmax - vmin).item() self.v = (self.v - self.ori_center) * self.ori_scale def auto_normal(self): i0, i1, i2 = self.f[:, 0].long(), self.f[:, 1].long(), self.f[:, 2].long() v0, v1, v2 = self.v[i0, :], self.v[i1, :], self.v[i2, :] face_normals = torch.cross(v1 - v0, v2 - v0) # Splat face normals to vertices vn = torch.zeros_like(self.v) vn.scatter_add_(0, i0[:, None].repeat(1, 3), face_normals) vn.scatter_add_(0, i1[:, None].repeat(1, 3), face_normals) vn.scatter_add_(0, i2[:, None].repeat(1, 3), face_normals) # Normalize, replace zero (degenerated) normals with some default value vn = torch.where( dot(vn, vn) > 1e-20, vn, torch.tensor([0.0, 0.0, 1.0], dtype=torch.float32, device=vn.device), ) vn = safe_normalize(vn) self.vn = vn self.fn = self.f def auto_uv(self, cache_path=None, vmap=True): # try to load cache if cache_path is not None: cache_path = os.path.splitext(cache_path)[0] + "_uv.npz" if cache_path is not None and os.path.exists(cache_path): data = np.load(cache_path) vt_np, ft_np, vmapping = data["vt"], data["ft"], data["vmapping"] else: import xatlas v_np = self.v.detach().cpu().numpy() f_np = self.f.detach().int().cpu().numpy() 
atlas = xatlas.Atlas() atlas.add_mesh(v_np, f_np) chart_options = xatlas.ChartOptions() # chart_options.max_iterations = 4 atlas.generate(chart_options=chart_options) vmapping, ft_np, vt_np = atlas[0] # [N], [M, 3], [N, 2] # save to cache if cache_path is not None: np.savez(cache_path, vt=vt_np, ft=ft_np, vmapping=vmapping) vt = torch.from_numpy(vt_np.astype(np.float32)).to(self.device) ft = torch.from_numpy(ft_np.astype(np.int32)).to(self.device) self.vt = vt self.ft = ft if vmap: # remap v/f to vt/ft, so each v correspond to a unique vt. (necessary for gltf) vmapping = ( torch.from_numpy(vmapping.astype(np.int64)).long().to(self.device) ) self.align_v_to_vt(vmapping) def align_v_to_vt(self, vmapping=None): # remap v/f and vn/vn to vt/ft. if vmapping is None: ft = self.ft.view(-1).long() f = self.f.view(-1).long() vmapping = torch.zeros( self.vt.shape[0], dtype=torch.long, device=self.device ) vmapping[ft] = f # scatter, randomly choose one if index is not unique self.v = self.v[vmapping] self.f = self.ft # assume fn == f if self.vn is not None: self.vn = self.vn[vmapping] self.fn = self.ft def to(self, device): self.device = device for name in ["v", "f", "vn", "fn", "vt", "ft", "albedo"]: tensor = getattr(self, name) if tensor is not None: setattr(self, name, tensor.to(device)) return self def write(self, path): if path.endswith(".ply"): self.write_ply(path) elif path.endswith(".obj"): self.write_obj(path) elif path.endswith(".glb") or path.endswith(".gltf"): self.write_glb(path) else: raise NotImplementedError(f"format {path} not supported!") # write to ply file (only geom) def write_ply(self, path): v_np = self.v.detach().cpu().numpy() f_np = self.f.detach().cpu().numpy() _mesh = trimesh.Trimesh(vertices=v_np, faces=f_np) _mesh.export(path) # write to gltf/glb file (geom + texture) def write_glb(self, path): assert ( self.vn is not None and self.vt is not None ) # should be improved to support export without texture... 
# assert self.v.shape[0] == self.vn.shape[0] and self.v.shape[0] == self.vt.shape[0] if self.v.shape[0] != self.vt.shape[0]: self.align_v_to_vt() # assume f == fn == ft import pygltflib f_np = self.f.detach().cpu().numpy().astype(np.uint32) v_np = self.v.detach().cpu().numpy().astype(np.float32) # vn_np = self.vn.detach().cpu().numpy().astype(np.float32) vt_np = self.vt.detach().cpu().numpy().astype(np.float32) albedo = self.albedo.detach().cpu().numpy() albedo = (albedo * 255).astype(np.uint8) albedo = cv2.cvtColor(albedo, cv2.COLOR_RGB2BGR) f_np_blob = f_np.flatten().tobytes() v_np_blob = v_np.tobytes() # vn_np_blob = vn_np.tobytes() vt_np_blob = vt_np.tobytes() albedo_blob = cv2.imencode(".png", albedo)[1].tobytes() gltf = pygltflib.GLTF2( scene=0, scenes=[pygltflib.Scene(nodes=[0])], nodes=[pygltflib.Node(mesh=0)], meshes=[ pygltflib.Mesh( primitives=[ pygltflib.Primitive( # indices to accessors (0 is triangles) attributes=pygltflib.Attributes( POSITION=1, TEXCOORD_0=2, ), indices=0, material=0, ) ] ) ], materials=[ pygltflib.Material( pbrMetallicRoughness=pygltflib.PbrMetallicRoughness( baseColorTexture=pygltflib.TextureInfo(index=0, texCoord=0), metallicFactor=0.0, roughnessFactor=1.0, ), alphaCutoff=0, doubleSided=True, ) ], textures=[ pygltflib.Texture(sampler=0, source=0), ], samplers=[ pygltflib.Sampler( magFilter=pygltflib.LINEAR, minFilter=pygltflib.LINEAR_MIPMAP_LINEAR, wrapS=pygltflib.REPEAT, wrapT=pygltflib.REPEAT, ), ], images=[ # use embedded (buffer) image pygltflib.Image(bufferView=3, mimeType="image/png"), ], buffers=[ pygltflib.Buffer( byteLength=len(f_np_blob) + len(v_np_blob) + len(vt_np_blob) + len(albedo_blob) ) ], # buffer view (based on dtype) bufferViews=[ # triangles; as flatten (element) array pygltflib.BufferView( buffer=0, byteLength=len(f_np_blob), target=pygltflib.ELEMENT_ARRAY_BUFFER, # GL_ELEMENT_ARRAY_BUFFER (34963) ), # positions; as vec3 array pygltflib.BufferView( buffer=0, byteOffset=len(f_np_blob), 
byteLength=len(v_np_blob), byteStride=12, # vec3 target=pygltflib.ARRAY_BUFFER, # GL_ARRAY_BUFFER (34962) ), # texcoords; as vec2 array pygltflib.BufferView( buffer=0, byteOffset=len(f_np_blob) + len(v_np_blob), byteLength=len(vt_np_blob), byteStride=8, # vec2 target=pygltflib.ARRAY_BUFFER, ), # texture; as none target pygltflib.BufferView( buffer=0, byteOffset=len(f_np_blob) + len(v_np_blob) + len(vt_np_blob), byteLength=len(albedo_blob), ), ], accessors=[ # 0 = triangles pygltflib.Accessor( bufferView=0, componentType=pygltflib.UNSIGNED_INT, # GL_UNSIGNED_INT (5125) count=f_np.size, type=pygltflib.SCALAR, max=[int(f_np.max())], min=[int(f_np.min())], ), # 1 = positions pygltflib.Accessor( bufferView=1, componentType=pygltflib.FLOAT, # GL_FLOAT (5126) count=len(v_np), type=pygltflib.VEC3, max=v_np.max(axis=0).tolist(), min=v_np.min(axis=0).tolist(), ), # 2 = texcoords pygltflib.Accessor( bufferView=2, componentType=pygltflib.FLOAT, count=len(vt_np), type=pygltflib.VEC2, max=vt_np.max(axis=0).tolist(), min=vt_np.min(axis=0).tolist(), ), ], ) # set actual data gltf.set_binary_blob(f_np_blob + v_np_blob + vt_np_blob + albedo_blob) # glb = b"".join(gltf.save_to_bytes()) gltf.save(path) # write to obj file (geom + texture) def write_obj(self, path): mtl_path = path.replace(".obj", ".mtl") albedo_path = path.replace(".obj", "_albedo.png") v_np = self.v.detach().cpu().numpy() vt_np = self.vt.detach().cpu().numpy() if self.vt is not None else None vn_np = self.vn.detach().cpu().numpy() if self.vn is not None else None f_np = self.f.detach().cpu().numpy() ft_np = self.ft.detach().cpu().numpy() if self.ft is not None else None fn_np = self.fn.detach().cpu().numpy() if self.fn is not None else None with open(path, "w") as fp: fp.write(f"mtllib {os.path.basename(mtl_path)} \n") for v in v_np: fp.write(f"v {v[0]} {v[1]} {v[2]} \n") if vt_np is not None: for v in vt_np: fp.write(f"vt {v[0]} {1 - v[1]} \n") if vn_np is not None: for v in vn_np: fp.write(f"vn {v[0]} {v[1]} {v[2]} 
\n") fp.write(f"usemtl defaultMat \n") for i in range(len(f_np)): fp.write( f'f {f_np[i, 0] + 1}/{ft_np[i, 0] + 1 if ft_np is not None else ""}/{fn_np[i, 0] + 1 if fn_np is not None else ""} \ {f_np[i, 1] + 1}/{ft_np[i, 1] + 1 if ft_np is not None else ""}/{fn_np[i, 1] + 1 if fn_np is not None else ""} \ {f_np[i, 2] + 1}/{ft_np[i, 2] + 1 if ft_np is not None else ""}/{fn_np[i, 2] + 1 if fn_np is not None else ""} \n' ) with open(mtl_path, "w") as fp: fp.write(f"newmtl defaultMat \n") fp.write(f"Ka 1 1 1 \n") fp.write(f"Kd 1 1 1 \n") fp.write(f"Ks 0 0 0 \n") fp.write(f"Tr 1 \n") fp.write(f"illum 1 \n") fp.write(f"Ns 0 \n") fp.write(f"map_Kd {os.path.basename(albedo_path)} \n") if not (False or self.albedo is None): albedo = self.albedo.detach().cpu().numpy() albedo = (albedo * 255).astype(np.uint8) cv2.imwrite(albedo_path, cv2.cvtColor(albedo, cv2.COLOR_RGB2BGR)) ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/scene/mesh_utils.py ================================================ import numpy as np import pymeshlab as pml def poisson_mesh_reconstruction(points, normals=None): # points/normals: [N, 3] np.ndarray import open3d as o3d pcd = o3d.geometry.PointCloud() pcd.points = o3d.utility.Vector3dVector(points) # outlier removal pcd, ind = pcd.remove_statistical_outlier(nb_neighbors=20, std_ratio=10) # normals if normals is None: pcd.estimate_normals() else: pcd.normals = o3d.utility.Vector3dVector(normals[ind]) # visualize o3d.visualization.draw_geometries([pcd], point_show_normal=False) mesh, densities = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson( pcd, depth=9 ) vertices_to_remove = densities < np.quantile(densities, 0.1) mesh.remove_vertices_by_mask(vertices_to_remove) # visualize o3d.visualization.draw_geometries([mesh]) vertices = np.asarray(mesh.vertices) triangles = np.asarray(mesh.triangles) print( f"[INFO] poisson mesh reconstruction: {points.shape} --> {vertices.shape} / {triangles.shape}" ) 
return vertices, triangles def decimate_mesh( verts, faces, target, backend="pymeshlab", remesh=False, optimalplacement=True ): # optimalplacement: default is True, but for flat mesh must turn False to prevent spike artifect. _ori_vert_shape = verts.shape _ori_face_shape = faces.shape if backend == "pyfqmr": import pyfqmr solver = pyfqmr.Simplify() solver.setMesh(verts, faces) solver.simplify_mesh(target_count=target, preserve_border=False, verbose=False) verts, faces, normals = solver.getMesh() else: m = pml.Mesh(verts, faces) ms = pml.MeshSet() ms.add_mesh(m, "mesh") # will copy! # filters # ms.meshing_decimation_clustering(threshold=pml.PercentageValue(1)) ms.meshing_decimation_quadric_edge_collapse( targetfacenum=int(target), optimalplacement=optimalplacement ) if remesh: # ms.apply_coord_taubin_smoothing() ms.meshing_isotropic_explicit_remeshing( iterations=3, targetlen=pml.PercentageValue(1) ) # extract mesh m = ms.current_mesh() verts = m.vertex_matrix() faces = m.face_matrix() print( f"[INFO] mesh decimation: {_ori_vert_shape} --> {verts.shape}, {_ori_face_shape} --> {faces.shape}" ) return verts, faces def clean_mesh( verts, faces, v_pct=1, min_f=64, min_d=20, repair=True, remesh=True, remesh_size=0.01, ): # verts: [N, 3] # faces: [N, 3] _ori_vert_shape = verts.shape _ori_face_shape = faces.shape m = pml.Mesh(verts, faces) ms = pml.MeshSet() ms.add_mesh(m, "mesh") # will copy! 
# filters ms.meshing_remove_unreferenced_vertices() # verts not refed by any faces if v_pct > 0: ms.meshing_merge_close_vertices( threshold=pml.PercentageValue(v_pct) ) # 1/10000 of bounding box diagonal ms.meshing_remove_duplicate_faces() # faces defined by the same verts ms.meshing_remove_null_faces() # faces with area == 0 if min_d > 0: ms.meshing_remove_connected_component_by_diameter( mincomponentdiag=pml.PercentageValue(min_d) ) if min_f > 0: ms.meshing_remove_connected_component_by_face_number(mincomponentsize=min_f) if repair: # ms.meshing_remove_t_vertices(method=0, threshold=40, repeat=True) ms.meshing_repair_non_manifold_edges(method=0) ms.meshing_repair_non_manifold_vertices(vertdispratio=0) if remesh: # ms.apply_coord_taubin_smoothing() ms.meshing_isotropic_explicit_remeshing( iterations=3, targetlen=pml.PureValue(remesh_size) ) # extract mesh m = ms.current_mesh() verts = m.vertex_matrix() faces = m.face_matrix() print( f"[INFO] mesh cleaning: {_ori_vert_shape} --> {verts.shape}, {_ori_face_shape} --> {faces.shape}" ) return verts, faces ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/utils/camera_utils.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. 
#
# For inquiries contact  george.drettakis@inria.fr
#

from motionrep.gaussian_3d.scene.cameras import Camera
import numpy as np
from motionrep.gaussian_3d.utils.general_utils import PILtoTorch
from motionrep.gaussian_3d.utils.graphics_utils import fov2focal
import torch

WARNED = False


def loadCam(args, id, cam_info, resolution_scale):
    """Resize a CamInfo's image per the configured resolution and build a Camera.

    Args:
        args: options object providing ``resolution`` and ``data_device``.
        id: index used as the Camera uid.
        cam_info: record with PIL image, pose (R, T) and FoV.
        resolution_scale: extra downscale factor applied on top of args.resolution.
    """
    orig_w, orig_h = cam_info.image.size

    if args.resolution in [1, 2, 4, 8]:
        resolution = round(orig_w / (resolution_scale * args.resolution)), round(
            orig_h / (resolution_scale * args.resolution)
        )
    else:  # should be a type that converts to float
        if args.resolution == -1:
            if orig_w > 1600:
                global WARNED
                if not WARNED:
                    print(
                        "[ INFO ] Encountered quite large input images (>1.6K pixels width), rescaling to 1.6K.\n "
                        "If this is not desired, please explicitly specify '--resolution/-r' as 1"
                    )
                    WARNED = True
                global_down = orig_w / 1600
            else:
                global_down = 1
        else:
            global_down = orig_w / args.resolution

        scale = float(global_down) * float(resolution_scale)
        resolution = (int(orig_w / scale), int(orig_h / scale))

    resized_image_rgb = PILtoTorch(cam_info.image, resolution)

    gt_image = resized_image_rgb[:3, ...]
    loaded_mask = None

    # BUGFIX: PILtoTorch returns a CHW tensor, so the presence of an alpha
    # channel is shape[0] == 4. The original tested shape[1] (the image
    # height), which silently dropped alpha masks.
    if resized_image_rgb.shape[0] == 4:
        loaded_mask = resized_image_rgb[3:4, ...]

    return Camera(
        colmap_id=cam_info.uid,
        R=cam_info.R,
        T=cam_info.T,
        FoVx=cam_info.FovX,
        FoVy=cam_info.FovY,
        image=gt_image,
        gt_alpha_mask=loaded_mask,
        image_name=cam_info.image_name,
        uid=id,
        data_device=args.data_device,
    )


def cameraList_from_camInfos(cam_infos, resolution_scale, args):
    """Load every CamInfo into a Camera at the given resolution scale."""
    camera_list = []

    for id, c in enumerate(cam_infos):
        camera_list.append(loadCam(args, id, c, resolution_scale))

    return camera_list


def camera_to_JSON(id, camera: Camera):
    """Serialize a Camera's pose and intrinsics to a JSON-friendly dict."""
    Rt = np.zeros((4, 4))
    Rt[:3, :3] = camera.R.transpose()
    Rt[:3, 3] = camera.T
    Rt[3, 3] = 1.0

    W2C = np.linalg.inv(Rt)
    pos = W2C[:3, 3]
    rot = W2C[:3, :3]
    serializable_array_2d = [x.tolist() for x in rot]
    camera_entry = {
        "id": id,
        "img_name": camera.image_name,
        "width": camera.width,
        "height": camera.height,
        "position": pos.tolist(),
        "rotation": serializable_array_2d,
        "fy": fov2focal(camera.FovY, camera.height),
        "fx": fov2focal(camera.FovX, camera.width),
    }
    return camera_entry


def look_at(from_point, to_point, up_vector=(0, 1, 0)):
    """
    Compute the look-at matrix for a camera.

    :param from_point: The position of the camera.
    :param to_point: The point the camera is looking at.
    :param up_vector: The up direction of the camera.
    :return: The 4x4 look-at matrix.
    """
    # minus z for opengl.
z for colmap forward = np.array(to_point) - np.array(from_point) forward = forward / (np.linalg.norm(forward) + 1e-5) # x-axis # Right direction is the cross product of the forward vector and the up vector right = np.cross(up_vector, forward) right = right / (np.linalg.norm(right) + 1e-5) # y axis # True up direction is the cross product of the right vector and the forward vector true_up = np.cross(forward, right) true_up = true_up / (np.linalg.norm(true_up) + 1e-5) # camera to world rotation = np.array( [ [right[0], true_up[0], forward[0]], [right[1], true_up[1], forward[1]], [right[2], true_up[2], forward[2]], ] ) # Construct the translation matrix translation = np.array( [ [-from_point[0]], [-from_point[1]], [-from_point[2]], ] ) # Combine the rotation and translation to get the look-at matrix T = 1.0 * rotation.transpose() @ translation return rotation.transpose(), T def create_cameras_around_sphere( radius=6, elevation=0, fovx=35, resolutions=(720, 1080), num_cams=60, center=(0, 0, 0), ): """ Create cameras around a sphere. :param radius: The radius of the circle on which cameras are placed. :param elevation: The elevation angle in degrees. :param fovx: The horizontal field of view of the cameras. :param resolutions: The resolution of the cameras. :param num_cams: The number of cameras. :param center: The center of the sphere. :return: A list of camera extrinsics (world2camera transformations). 
""" extrinsics = [] # Convert elevation to radians elevation_rad = np.radians(elevation) # Compute the y-coordinate of the cameras based on the elevation z = radius * np.sin(elevation_rad) # Compute the radius of the circle at the given elevation circle_radius = radius * np.cos(elevation_rad) for i in range(num_cams): # Compute the angle for the current camera angle = 2 * np.pi * i / num_cams # Compute the x and z coordinates of the camera x = circle_radius * np.cos(angle) + center[0] y = circle_radius * np.sin(angle) + center[1] # Create the look-at matrix for the camera R, T = look_at((x, y, z + center[2]), center) extrinsics.append([R, T.squeeze(axis=-1)]) cam_list = [] dummy_image = torch.tensor( np.zeros((3, resolutions[0], resolutions[1]), dtype=np.uint8) ) for i in range(num_cams): R, T = extrinsics[i] # R is stored transposed due to 'glm' in CUDA code R = R.transpose() cam = Camera( colmap_id=i, R=R, T=T, FoVx=fovx, FoVy=fovx * resolutions[1] / resolutions[0], image_name="", uid=i, data_device="cuda", image=dummy_image, gt_alpha_mask=None, ) cam_list.append(cam) return cam_list ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/utils/general_utils.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. 
# # For inquiries contact george.drettakis@inria.fr # import torch import sys from datetime import datetime import numpy as np import random def inverse_sigmoid(x): return torch.log(x/(1-x)) def PILtoTorch(pil_image, resolution): resized_image_PIL = pil_image.resize(resolution) resized_image = torch.from_numpy(np.array(resized_image_PIL)) / 255.0 if len(resized_image.shape) == 3: return resized_image.permute(2, 0, 1) else: return resized_image.unsqueeze(dim=-1).permute(2, 0, 1) def get_expon_lr_func( lr_init, lr_final, lr_delay_steps=0, lr_delay_mult=1.0, max_steps=1000000 ): """ Copied from Plenoxels Continuous learning rate decay function. Adapted from JaxNeRF The returned rate is lr_init when step=0 and lr_final when step=max_steps, and is log-linearly interpolated elsewhere (equivalent to exponential decay). If lr_delay_steps>0 then the learning rate will be scaled by some smooth function of lr_delay_mult, such that the initial learning rate is lr_init*lr_delay_mult at the beginning of optimization but will be eased back to the normal learning rate when steps>lr_delay_steps. :param conf: config subtree 'lr' or similar :param max_steps: int, the number of steps during optimization. :return HoF which takes step as input """ def helper(step): if step < 0 or (lr_init == 0.0 and lr_final == 0.0): # Disable this parameter return 0.0 if lr_delay_steps > 0: # A kind of reverse cosine decay. 
            delay_rate = lr_delay_mult + (1 - lr_delay_mult) * np.sin(
                0.5 * np.pi * np.clip(step / lr_delay_steps, 0, 1)
            )
        else:
            delay_rate = 1.0
        t = np.clip(step / max_steps, 0, 1)
        # Log-linear interpolation between lr_init and lr_final.
        log_lerp = np.exp(np.log(lr_init) * (1 - t) + np.log(lr_final) * t)
        return delay_rate * log_lerp

    return helper


def strip_lowerdiag(L):
    # Pack the 6 unique entries (upper triangle) of each symmetric 3x3 matrix
    # in the batch into one row of 6 values.
    uncertainty = torch.zeros((L.shape[0], 6), dtype=torch.float, device="cuda")

    uncertainty[:, 0] = L[:, 0, 0]
    uncertainty[:, 1] = L[:, 0, 1]
    uncertainty[:, 2] = L[:, 0, 2]
    uncertainty[:, 3] = L[:, 1, 1]
    uncertainty[:, 4] = L[:, 1, 2]
    uncertainty[:, 5] = L[:, 2, 2]
    return uncertainty


def strip_symmetric(sym):
    # A symmetric matrix is fully described by its upper triangle.
    return strip_lowerdiag(sym)


def build_rotation(r):
    # Normalize quaternions (w, x, y, z) and convert them to a batch of
    # 3x3 rotation matrices.  Hard-codes device='cuda'.
    norm = torch.sqrt(r[:,0]*r[:,0] + r[:,1]*r[:,1] + r[:,2]*r[:,2] + r[:,3]*r[:,3])

    q = r / norm[:, None]

    R = torch.zeros((q.size(0), 3, 3), device='cuda')

    r = q[:, 0]
    x = q[:, 1]
    y = q[:, 2]
    z = q[:, 3]

    R[:, 0, 0] = 1 - 2 * (y*y + z*z)
    R[:, 0, 1] = 2 * (x*y - r*z)
    R[:, 0, 2] = 2 * (x*z + r*y)
    R[:, 1, 0] = 2 * (x*y + r*z)
    R[:, 1, 1] = 1 - 2 * (x*x + z*z)
    R[:, 1, 2] = 2 * (y*z - r*x)
    R[:, 2, 0] = 2 * (x*z - r*y)
    R[:, 2, 1] = 2 * (y*z + r*x)
    R[:, 2, 2] = 1 - 2 * (x*x + y*y)
    return R


def build_scaling_rotation(s, r):
    # L = R @ diag(s): per-Gaussian scaling followed by rotation, used when
    # assembling covariance matrices.  Hard-codes device="cuda".
    L = torch.zeros((s.shape[0], 3, 3), dtype=torch.float, device="cuda")
    R = build_rotation(r)

    L[:,0,0] = s[:,0]
    L[:,1,1] = s[:,1]
    L[:,2,2] = s[:,2]

    L = R @ L
    return L


def safe_state(silent):
    # Replace stdout with a wrapper that timestamps every line (or mutes all
    # output when silent=True), then seed every RNG for reproducibility.
    old_f = sys.stdout

    class F:
        def __init__(self, silent):
            self.silent = silent

        def write(self, x):
            if not self.silent:
                if x.endswith("\n"):
                    old_f.write(x.replace("\n", " [{}]\n".format(str(datetime.now().strftime("%d/%m %H:%M:%S")))))
                else:
                    old_f.write(x)

        def flush(self):
            old_f.flush()

    sys.stdout = F(silent)

    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)
    torch.cuda.set_device(torch.device("cuda:0"))


================================================
FILE: projects/uncleaned_train/motionrep/gaussian_3d/utils/graphics_utils.py
================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO
# research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact  george.drettakis@inria.fr
#

import torch
import math
import numpy as np
from typing import NamedTuple


class BasicPointCloud(NamedTuple):
    # points: (N, 3) positions; colors: (N, 3) RGB; normals: (N, 3) normals.
    points : np.array
    colors : np.array
    normals : np.array


def geom_transform_points(points, transf_matrix):
    # Apply a 4x4 transform to (P, 3) points (row-vector convention) and
    # dehomogenize; the epsilon guards against division by zero.
    P, _ = points.shape
    ones = torch.ones(P, 1, dtype=points.dtype, device=points.device)
    points_hom = torch.cat([points, ones], dim=1)
    points_out = torch.matmul(points_hom, transf_matrix.unsqueeze(0))

    denom = points_out[..., 3:] + 0.0000001
    return (points_out[..., :3] / denom).squeeze(dim=0)


def getWorld2View(R, t):
    # Assemble a 4x4 world-to-view matrix; R is stored transposed (glm
    # convention used by the CUDA rasterizer).
    Rt = np.zeros((4, 4))
    Rt[:3, :3] = R.transpose()
    Rt[:3, 3] = t
    Rt[3, 3] = 1.0
    return np.float32(Rt)


def getWorld2View2(R, t, translate=np.array([.0, .0, .0]), scale=1.0):
    # World-to-view matrix with an extra recentering/rescaling of the camera
    # center (applied in camera-to-world space, then inverted back).
    Rt = np.zeros((4, 4))
    Rt[:3, :3] = R.transpose()
    Rt[:3, 3] = t
    Rt[3, 3] = 1.0

    C2W = np.linalg.inv(Rt)
    cam_center = C2W[:3, 3]
    cam_center = (cam_center + translate) * scale
    C2W[:3, 3] = cam_center
    Rt = np.linalg.inv(C2W)
    return np.float32(Rt)


def getProjectionMatrix(znear, zfar, fovX, fovY):
    # Perspective projection matrix from field-of-view angles (radians) and
    # near/far clip planes.
    tanHalfFovY = math.tan((fovY / 2))
    tanHalfFovX = math.tan((fovX / 2))

    top = tanHalfFovY * znear
    bottom = -top
    right = tanHalfFovX * znear
    left = -right

    P = torch.zeros(4, 4)

    z_sign = 1.0

    P[0, 0] = 2.0 * znear / (right - left)
    P[1, 1] = 2.0 * znear / (top - bottom)
    P[0, 2] = (right + left) / (right - left)
    P[1, 2] = (top + bottom) / (top - bottom)
    P[3, 2] = z_sign
    P[2, 2] = z_sign * zfar / (zfar - znear)
    P[2, 3] = -(zfar * znear) / (zfar - znear)
    return P


def fov2focal(fov, pixels):
    # Focal length in pixels from a field-of-view angle (radians).
    return pixels / (2 * math.tan(fov / 2))


def focal2fov(focal, pixels):
    # Inverse of fov2focal.
    return 2*math.atan(pixels/(2*focal))


================================================
FILE: projects/uncleaned_train/motionrep/gaussian_3d/utils/image_utils.py
================================================
# # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. # # For inquiries contact george.drettakis@inria.fr # import torch def mse(img1, img2): return (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True) def psnr(img1, img2): mse = (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True) return 20 * torch.log10(1.0 / torch.sqrt(mse)) ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/utils/loss_utils.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. # # For inquiries contact george.drettakis@inria.fr # import torch import torch.nn.functional as F from torch.autograd import Variable from math import exp def l1_loss(network_output, gt): return torch.abs((network_output - gt)).mean() def l2_loss(network_output, gt): return ((network_output - gt) ** 2).mean() def gaussian(window_size, sigma): gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)]) return gauss / gauss.sum() def create_window(window_size, channel): _1D_window = gaussian(window_size, 1.5).unsqueeze(1) _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous()) return window def ssim(img1, img2, window_size=11, size_average=True): channel = img1.size(-3) window = create_window(window_size, channel) if img1.is_cuda: window = window.cuda(img1.get_device()) window = window.type_as(img1) return _ssim(img1, img2, window, window_size, channel, size_average) def _ssim(img1, img2, 
window, window_size, channel, size_average=True):
    # Local means via depthwise Gaussian filtering.
    mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
    mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    # Local variances / covariance (E[x^2] - E[x]^2 form).
    sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
    sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
    sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2

    # Standard SSIM stabilization constants (for dynamic range 1.0).
    C1 = 0.01 ** 2
    C2 = 0.03 ** 2

    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

    if size_average:
        return ssim_map.mean()
    else:
        # Per-batch-element mean.
        return ssim_map.mean(1).mean(1).mean(1)


================================================
FILE: projects/uncleaned_train/motionrep/gaussian_3d/utils/rigid_body_utils.py
================================================
import torch
import torch.nn.functional as F


def get_rigid_transform(A, B):
    """
    Estimate the rigid body transformation between two sets of 3D points.
    A and B are Nx3 matrices where each row is a 3D point.
    Returns a rotation matrix R and translation vector t.

    Args:
        A, B: [batch, N, 3] matrix of 3D points
    Outputs:
        R, t: [batch, 3, 3/1]
            target = R @ source (source shape [3, 1]) + t
    """
    assert A.shape == B.shape, "Input matrices must have the same shape"
    assert A.shape[-1] == 3, "Input matrices must have 3 columns (x, y, z coordinates)"

    # Compute centroids. [..., 1, 3]
    centroid_A = torch.mean(A, dim=-2, keepdim=True)
    centroid_B = torch.mean(B, dim=-2, keepdim=True)

    # Center the point sets
    A_centered = A - centroid_A
    B_centered = B - centroid_B

    # Compute the cross-covariance matrix. [..., 3, 3]
    H = A_centered.transpose(-2, -1) @ B_centered

    # Compute the Singular Value Decomposition.
    # Along last two dimensions
    U, S, Vt = torch.linalg.svd(H)

    # Compute the rotation matrix
    R = Vt.transpose(-2, -1) @ U.transpose(-2, -1)

    # Ensure a right-handed coordinate system
    flip_mask = (torch.det(R) < 0) * -2.0 + 1.0
    # Vt[:, 2, :] *= flip_mask[..., None]

    # [N] => [N, 3]
    pad_flip_mask = torch.stack(
        [torch.ones_like(flip_mask), torch.ones_like(flip_mask), flip_mask], dim=-1
    )
    # Flip the sign of Vt's last row where det(R) < 0 (reflection case).
    Vt = Vt * pad_flip_mask[..., None]
    # Compute the rotation matrix
    R = Vt.transpose(-2, -1) @ U.transpose(-2, -1)
    # print(R.shape, centroid_A.shape, centroid_B.shape, flip_mask.shape)

    # Compute the translation
    t = centroid_B - (R @ centroid_A.transpose(-2, -1)).transpose(-2, -1)
    t = t.transpose(-2, -1)

    return R, t


def _test_rigid_transform():
    # Example usage:
    A = torch.tensor([[1, 2, 3], [4, 5, 6], [9, 8, 10], [10, -5, 1]]) * 1.0
    R_synthesized = torch.tensor([[1, 0, 0], [0, -1, 0], [0, 0, -1]]) * 1.0

    # init a random rotation matrix:
    B = (R_synthesized @ A.T).T + 2.0  # Just an example offset

    R, t = get_rigid_transform(A[None, ...], B[None, ...])

    print("Rotation matrix R:")
    print(R)
    print("\nTranslation vector t:")
    print(t)


def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
    """
    Returns torch.sqrt(torch.max(0, x))
    but with a zero subgradient where x is 0.
    """
    ret = torch.zeros_like(x)
    positive_mask = x > 0
    ret[positive_mask] = torch.sqrt(x[positive_mask])
    return ret


def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
    """
    from pytorch3d.
    Based on trace_method like: https://github.com/KieranWynn/pyquaternion/blob/master/pyquaternion/quaternion.py#L205
    Convert rotations given as rotation matrices to quaternions.

    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).

    Returns:
        quaternions with real part first, as tensor of shape (..., 4).
""" if matrix.size(-1) != 3 or matrix.size(-2) != 3: raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.") batch_dim = matrix.shape[:-2] m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind( matrix.reshape(batch_dim + (9,)), dim=-1 ) q_abs = _sqrt_positive_part( torch.stack( [ 1.0 + m00 + m11 + m22, 1.0 + m00 - m11 - m22, 1.0 - m00 + m11 - m22, 1.0 - m00 - m11 + m22, ], dim=-1, ) ) # we produce the desired quaternion multiplied by each of r, i, j, k quat_by_rijk = torch.stack( [ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and # `int`. torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1), # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and # `int`. torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1), # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and # `int`. torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1), # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and # `int`. torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1), ], dim=-2, ) # We floor here at 0.1 but the exact level is not important; if q_abs is small, # the candidate won't be picked. 
    flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
    quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))

    # if not for numerical problems, quat_candidates[i] should be same (up to a sign),
    # forall i; we pick the best-conditioned one (with the largest denominator)

    return quat_candidates[
        F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :
    ].reshape(batch_dim + (4,))


def quternion_to_matrix(r):
    # NOTE(review): name is a typo of "quaternion_to_matrix"; kept as-is for
    # existing callers.  Normalizes quaternions (w, x, y, z) and returns a
    # batch of 3x3 rotation matrices.  Hard-codes device="cuda".
    norm = torch.sqrt(
        r[:, 0] * r[:, 0] + r[:, 1] * r[:, 1] + r[:, 2] * r[:, 2] + r[:, 3] * r[:, 3]
    )

    q = r / norm[:, None]

    R = torch.zeros((q.size(0), 3, 3), device="cuda")

    r = q[:, 0]
    x = q[:, 1]
    y = q[:, 2]
    z = q[:, 3]

    R[:, 0, 0] = 1 - 2 * (y * y + z * z)
    R[:, 0, 1] = 2 * (x * y - r * z)
    R[:, 0, 2] = 2 * (x * z + r * y)
    R[:, 1, 0] = 2 * (x * y + r * z)
    R[:, 1, 1] = 1 - 2 * (x * x + z * z)
    R[:, 1, 2] = 2 * (y * z - r * x)
    R[:, 2, 0] = 2 * (x * z - r * y)
    R[:, 2, 1] = 2 * (y * z + r * x)
    R[:, 2, 2] = 1 - 2 * (x * x + y * y)
    return R


def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
    """
    from Pytorch3d
    Convert a unit quaternion to a standard form: one in which the real
    part is non negative.

    Args:
        quaternions: Quaternions with real part first,
            as tensor of shape (..., 4).

    Returns:
        Standardized quaternions as tensor of shape (..., 4).
    """
    return torch.where(quaternions[..., 0:1] < 0, -quaternions, quaternions)


def quaternion_multiply(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """
    From pytorch3d
    Multiply two quaternions.
    Usual torch rules for broadcasting apply.

    Args:
        a: Quaternions as tensor of shape (..., 4), real part first.
        b: Quaternions as tensor of shape (..., 4), real part first.

    Returns:
        The product of a and b, a tensor of quaternions shape (..., 4).
""" aw, ax, ay, az = torch.unbind(a, -1) bw, bx, by, bz = torch.unbind(b, -1) ow = aw * bw - ax * bx - ay * by - az * bz ox = aw * bx + ax * bw + ay * bz - az * by oy = aw * by - ax * bz + ay * bw + az * bx oz = aw * bz + ax * by - ay * bx + az * bw ret = torch.stack((ow, ox, oy, oz), -1) ret = standardize_quaternion(ret) return ret def _test_matrix_to_quaternion(): # init a random batch of quaternion r = torch.randn((10, 4)).cuda() norm = torch.sqrt( r[:, 0] * r[:, 0] + r[:, 1] * r[:, 1] + r[:, 2] * r[:, 2] + r[:, 3] * r[:, 3] ) q = r / norm[:, None] q = standardize_quaternion(q) R = quternion_to_matrix(q) I_rec = R @ R.transpose(-2, -1) I_rec_error = torch.abs(I_rec - torch.eye(3, device="cuda")[None, ...]).max() q_recovered = matrix_to_quaternion(R) norm_ = torch.linalg.norm(q_recovered, dim=-1) q_recovered = q_recovered / norm_[..., None] q_recovered = standardize_quaternion(q_recovered) print(q_recovered.shape, q.shape, R.shape) rec = (q - q_recovered).abs().max() print("rotation to I error:", I_rec_error, "quant rec error: ", rec) def _test_matrix_to_quaternion_2(): R = ( torch.tensor( [[[1, 0, 0], [0, -1, 0], [0, 0, -1]], [[1, 0, 0], [0, 0, 1], [0, -1, 0]]] ) * 1.0 ) q_rec = matrix_to_quaternion(R.transpose(-2, -1)) R_rec = quternion_to_matrix(q_rec) print(R_rec) if __name__ == "__main__": # _test_rigid_transform() _test_matrix_to_quaternion() _test_matrix_to_quaternion_2() ================================================ FILE: projects/uncleaned_train/motionrep/gaussian_3d/utils/sh_utils.py ================================================ # Copyright 2021 The PlenOctree Authors. # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. 
#  Redistributions in binary form must reproduce the above copyright notice,
#  this list of conditions and the following disclaimer in the documentation
#  and/or other materials provided with the distribution.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
#  POSSIBILITY OF SUCH DAMAGE.

import torch

# Hardcoded real spherical-harmonic basis coefficients, degrees 0 through 4.
C0 = 0.28209479177387814
C1 = 0.4886025119029199
C2 = [
    1.0925484305920792,
    -1.0925484305920792,
    0.31539156525252005,
    -1.0925484305920792,
    0.5462742152960396
]
C3 = [
    -0.5900435899266435,
    2.890611442640554,
    -0.4570457994644658,
    0.3731763325901154,
    -0.4570457994644658,
    1.445305721320277,
    -0.5900435899266435
]
C4 = [
    2.5033429417967046,
    -1.7701307697799304,
    0.9461746957575601,
    -0.6690465435572892,
    0.10578554691520431,
    -0.6690465435572892,
    0.47308734787878004,
    -1.7701307697799304,
    0.6258357354491761,
]


def eval_sh(deg, sh, dirs):
    """
    Evaluate spherical harmonics at unit directions
    using hardcoded SH polynomials.
    Works with torch/np/jnp.
    ... Can be 0 or more batch dimensions.
    Args:
        deg: int SH deg.
            Currently, 0-3 supported
        sh: jnp.ndarray SH coeffs [..., C, (deg + 1) ** 2]
        dirs: jnp.ndarray unit directions [..., 3]
    Returns:
        [..., C]
    """
    assert deg <= 4 and deg >= 0
    coeff = (deg + 1) ** 2
    assert sh.shape[-1] >= coeff

    # Degree 0 contribution (constant band).
    result = C0 * sh[..., 0]
    if deg > 0:
        x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
        result = (result -
                  C1 * y * sh[..., 1] +
                  C1 * z * sh[..., 2] -
                  C1 * x * sh[..., 3])

        if deg > 1:
            xx, yy, zz = x * x, y * y, z * z
            xy, yz, xz = x * y, y * z, x * z
            result = (result +
                      C2[0] * xy * sh[..., 4] +
                      C2[1] * yz * sh[..., 5] +
                      C2[2] * (2.0 * zz - xx - yy) * sh[..., 6] +
                      C2[3] * xz * sh[..., 7] +
                      C2[4] * (xx - yy) * sh[..., 8])

            if deg > 2:
                result = (result +
                          C3[0] * y * (3 * xx - yy) * sh[..., 9] +
                          C3[1] * xy * z * sh[..., 10] +
                          C3[2] * y * (4 * zz - xx - yy)* sh[..., 11] +
                          C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12] +
                          C3[4] * x * (4 * zz - xx - yy) * sh[..., 13] +
                          C3[5] * z * (xx - yy) * sh[..., 14] +
                          C3[6] * x * (xx - 3 * yy) * sh[..., 15])

                if deg > 3:
                    result = (result +
                              C4[0] * xy * (xx - yy) * sh[..., 16] +
                              C4[1] * yz * (3 * xx - yy) * sh[..., 17] +
                              C4[2] * xy * (7 * zz - 1) * sh[..., 18] +
                              C4[3] * yz * (7 * zz - 3) * sh[..., 19] +
                              C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20] +
                              C4[5] * xz * (7 * zz - 3) * sh[..., 21] +
                              C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22] +
                              C4[7] * xz * (xx - 3 * yy) * sh[..., 23] +
                              C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24])
    return result


def RGB2SH(rgb):
    # Map RGB in [0, 1] to degree-0 SH coefficients.
    return (rgb - 0.5) / C0


def SH2RGB(sh):
    # Inverse of RGB2SH.
    return sh * C0 + 0.5


================================================
FILE: projects/uncleaned_train/motionrep/gaussian_3d/utils/system_utils.py
================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
# # For inquiries contact george.drettakis@inria.fr # from errno import EEXIST from os import makedirs, path import os def mkdir_p(folder_path): # Creates a directory. equivalent to using mkdir -p on the command line try: makedirs(folder_path) except OSError as exc: # Python >2.5 if exc.errno == EEXIST and path.isdir(folder_path): pass else: raise def searchForMaxIteration(folder): saved_iters = [int(fname.split("_")[-1]) for fname in os.listdir(folder)] return max(saved_iters) ================================================ FILE: projects/uncleaned_train/motionrep/losses/se3_loss.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F ================================================ FILE: projects/uncleaned_train/motionrep/losses/smoothness_loss.py ================================================ import torch from typing import Tuple def compute_plane_tv(t: torch.Tensor, only_w: bool = False) -> float: """Computes total variance across a plane. 
    From nerf-studio

    Args:
        t: Plane tensor
        only_w: Whether to only compute total variance across w dimension

    Returns:
        Total variance
    """
    _, h, w = t.shape
    # Squared differences between horizontally adjacent entries.
    w_tv = torch.square(t[..., :, 1:] - t[..., :, : w - 1]).mean()

    if only_w:
        return w_tv
    # Squared differences between vertically adjacent entries.
    h_tv = torch.square(t[..., 1:, :] - t[..., : h - 1, :]).mean()
    return h_tv + w_tv


def compute_plane_smoothness(t: torch.Tensor) -> float:
    """Computes smoothness across the temporal axis of a plane
    From nerf-studio

    Args:
        t: Plane tensor

    Returns:
        Time smoothness
    """
    _, h, _ = t.shape
    # Convolve with a second derivative filter, in the time dimension which is dimension 2
    first_difference = t[..., 1:, :] - t[..., : h - 1, :]  # [c, h-1, w]
    second_difference = (
        first_difference[..., 1:, :] - first_difference[..., : h - 2, :]
    )  # [c, h-2, w]
    # Take the L2 norm of the result
    return torch.square(second_difference).mean()


================================================
FILE: projects/uncleaned_train/motionrep/operators/dct.py
================================================
"""
Code from https://github.com/zh217/torch-dct/blob/master/torch_dct/_dct.py
"""
import numpy as np
import torch
import torch.nn as nn
import torch.fft


def dct1_rfft_impl(x):
    # Real FFT along dim 1, returned as a real-valued [..., 2] tensor.
    return torch.view_as_real(torch.fft.rfft(x, dim=1))


def dct_fft_impl(v):
    # Complex FFT along dim 1, returned as a real-valued [..., 2] tensor.
    return torch.view_as_real(torch.fft.fft(v, dim=1))


def idct_irfft_impl(V):
    # Inverse real FFT along dim 1 from a real-valued [..., 2] tensor.
    return torch.fft.irfft(torch.view_as_complex(V), n=V.shape[1], dim=1)


def dct(x, norm=None):
    """
    Discrete Cosine Transform, Type II (a.k.a. the DCT)

    For the meaning of the parameter `norm`, see:
    https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html

    if norm is None:
               N-1
    y[k] = 2* sum x[n]*cos(pi*k*(2n+1)/(2*N)), 0 <= k < N.
               n=0

    :param x: the input signal
    :param norm: the normalization, None or 'ortho'
    :return: the DCT-II of the signal over the last dimension
    """
    x_shape = x.shape
    N = x_shape[-1]
    x = x.contiguous().view(-1, N)

    # Interleave even-index and reversed odd-index samples (Makhoul's trick),
    # so the DCT can be computed from a single length-N FFT.
    v = torch.cat([x[:, ::2], x[:, 1::2].flip([1])], dim=1)

    Vc = dct_fft_impl(v)

    # Twiddle factors e^{-i*pi*k/(2N)}.
    k = -torch.arange(N, dtype=x.dtype, device=x.device)[None, :] * np.pi / (2 * N)
    W_r = torch.cos(k)
    W_i = torch.sin(k)

    V = Vc[:, :, 0] * W_r - Vc[:, :, 1] * W_i

    if norm == "ortho":
        V[:, 0] /= np.sqrt(N) * 2
        V[:, 1:] /= np.sqrt(N / 2) * 2

    V = 2 * V.view(*x_shape)

    return V


def idct(X, norm=None):
    """
    The inverse to DCT-II, which is a scaled Discrete Cosine Transform, Type III

    Our definition of idct is that idct(dct(x)) == x

    For the meaning of the parameter `norm`, see:
    https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html

    :param X: the input signal
    :param norm: the normalization, None or 'ortho'
    :return: the inverse DCT-II of the signal over the last dimension
    """

    x_shape = X.shape
    N = x_shape[-1]

    X_v = X.contiguous().view(-1, x_shape[-1]) / 2

    if norm == "ortho":
        X_v[:, 0] *= np.sqrt(N) * 2
        X_v[:, 1:] *= np.sqrt(N / 2) * 2

    # Twiddle factors e^{i*pi*k/(2N)}.
    k = (
        torch.arange(x_shape[-1], dtype=X.dtype, device=X.device)[None, :]
        * np.pi
        / (2 * N)
    )
    W_r = torch.cos(k)
    W_i = torch.sin(k)

    V_t_r = X_v
    V_t_i = torch.cat([X_v[:, :1] * 0, -X_v.flip([1])[:, :-1]], dim=1)

    V_r = V_t_r * W_r - V_t_i * W_i
    V_i = V_t_r * W_i + V_t_i * W_r

    V = torch.cat([V_r.unsqueeze(2), V_i.unsqueeze(2)], dim=2)

    v = idct_irfft_impl(V)
    # Undo the even/odd interleaving performed in dct().
    x = v.new_zeros(v.shape)
    x[:, ::2] += v[:, : N - (N // 2)]
    x[:, 1::2] += v.flip([1])[:, : N // 2]

    return x.view(*x_shape)


def dct_3d(x, norm=None):
    """
    3-dimentional Discrete Cosine Transform, Type II (a.k.a.
    the DCT)

    For the meaning of the parameter `norm`, see:
    https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html

    :param x: the input signal
    :param norm: the normalization, None or 'ortho'
    :return: the DCT-II of the signal over the last 3 dimensions
    """
    # Separable transform: apply the 1D DCT along each of the last 3 axes.
    X1 = dct(x, norm=norm)
    X2 = dct(X1.transpose(-1, -2), norm=norm)
    X3 = dct(X2.transpose(-1, -3), norm=norm)
    return X3.transpose(-1, -3).transpose(-1, -2)


def idct_3d(X, norm=None):
    """
    The inverse to 3D DCT-II, which is a scaled Discrete Cosine Transform, Type III

    Our definition of idct is that idct_3d(dct_3d(x)) == x

    For the meaning of the parameter `norm`, see:
    https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html

    :param X: the input signal
    :param norm: the normalization, None or 'ortho'
    :return: the DCT-II of the signal over the last 3 dimensions
    """
    # Separable inverse: apply the 1D inverse DCT along each of the last 3 axes.
    x1 = idct(X, norm=norm)
    x2 = idct(x1.transpose(-1, -2), norm=norm)
    x3 = idct(x2.transpose(-1, -3), norm=norm)
    return x3.transpose(-1, -3).transpose(-1, -2)


def code_test_dct3d():
    # init a tensor of shape [100, 20, 3]
    x = torch.rand(100, 20, 3)

    dct_coef = dct_3d(x, norm="ortho")
    print("inp signal shape: ", x.shape, " dct coef shape: ", dct_coef.shape)

    x_recon = idct_3d(dct_coef, norm="ortho")
    print("inp signal shape: ", x.shape, " recon signal shape: ", x_recon.shape)

    print("max error: ", torch.max(torch.abs(x - x_recon)))

    dct_coef[:, 0, :] = 0
    x_recon = idct_3d(dct_coef, norm="ortho")
    print("max error after removing first order: ", torch.max(torch.abs(x - x_recon)))


if __name__ == "__main__":
    code_test_dct3d()


================================================
FILE: projects/uncleaned_train/motionrep/operators/np_operators.py
================================================
import torch
import numpy as np

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt


def feature_map_to_rgb_pca(feature_map):
    """
    Args:
        feature_map: (C, H, W) feature map.
    Outputs:
        rgb_image: (H, W, 3) image.
""" # Move feature map to CPU and convert to numpy if isinstance(feature_map, torch.Tensor): feature_map = feature_map.detach().cpu().numpy() H, W = feature_map.shape[1:] # Flatten spatial dimensions # [N, C] flattened_map = feature_map.reshape(feature_map.shape[0], -1).T # Apply PCA and reduce channel dimension to 3 pca = PCA(n_components=3) pca_result = pca.fit_transform(flattened_map) # Reshape back to (H, W, 3) rgb_image = pca_result.reshape(H, W, 3) # Normalize to [0, 1] rgb_image = (rgb_image - rgb_image.min()) / ( rgb_image.max() - rgb_image.min() + 1e-3 ) return rgb_image ================================================ FILE: projects/uncleaned_train/motionrep/operators/rotation.py ================================================ from typing import Optional import torch import torch.nn.functional as F def rotation_6d_to_matrix(d6: torch.Tensor) -> torch.Tensor: """ Converts 6D rotation representation by Zhou et al. [1] to rotation matrix using Gram--Schmidt orthogonalization per Section B of [1]. Args: d6: 6D rotation representation, of size (*, 6) Returns: batch of rotation matrices of size (*, 3, 3) [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H. On the Continuity of Rotation Representations in Neural Networks. IEEE Conference on Computer Vision and Pattern Recognition, 2019. Retrieved from http://arxiv.org/abs/1812.07035 """ a1, a2 = d6[..., :3], d6[..., 3:] b1 = F.normalize(a1, dim=-1) b2 = a2 - (b1 * a2).sum(-1, keepdim=True) * b1 b2 = F.normalize(b2, dim=-1) b3 = torch.cross(b1, b2, dim=-1) return torch.stack((b1, b2, b3), dim=-2) def matrix_to_rotation_6d(matrix: torch.Tensor) -> torch.Tensor: """ Converts rotation matrices to 6D rotation representation by Zhou et al. [1] by dropping the last row. Note that 6D representation is not unique. Args: matrix: batch of rotation matrices of size (*, 3, 3) Returns: 6D rotation representation, of size (*, 6) [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H. 
    On the Continuity of Rotation Representations in Neural Networks.
    IEEE Conference on Computer Vision and Pattern Recognition, 2019.
    Retrieved from http://arxiv.org/abs/1812.07035
    """
    batch_dim = matrix.size()[:-2]
    # Keep only the first two rows of each 3x3 matrix.
    return matrix[..., :2, :].clone().reshape(batch_dim + (6,))


def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
    """
    Convert rotations given as quaternions to rotation matrices.

    Args:
        quaternions: quaternions with real part first,
            as tensor of shape (..., 4).

    Returns:
        Rotation matrices as tensor of shape (..., 3, 3).
    """
    r, i, j, k = torch.unbind(quaternions, -1)
    # pyre-fixme[58]: `/` is not supported for operand types `float` and `Tensor`.
    # Normalization factor: 2 / |q|^2, so non-unit quaternions are handled.
    two_s = 2.0 / (quaternions * quaternions).sum(-1)

    o = torch.stack(
        (
            1 - two_s * (j * j + k * k),
            two_s * (i * j - k * r),
            two_s * (i * k + j * r),
            two_s * (i * j + k * r),
            1 - two_s * (i * i + k * k),
            two_s * (j * k - i * r),
            two_s * (i * k - j * r),
            two_s * (j * k + i * r),
            1 - two_s * (i * i + j * j),
        ),
        -1,
    )
    return o.reshape(quaternions.shape[:-1] + (3, 3))


def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
    """
    Returns torch.sqrt(torch.max(0, x))
    but with a zero subgradient where x is 0.
    """
    ret = torch.zeros_like(x)
    positive_mask = x > 0
    ret[positive_mask] = torch.sqrt(x[positive_mask])
    return ret


def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
    """
    Convert rotations given as rotation matrices to quaternions.

    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).

    Returns:
        quaternions with real part first, as tensor of shape (..., 4).
""" if matrix.size(-1) != 3 or matrix.size(-2) != 3: raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.") batch_dim = matrix.shape[:-2] m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind( matrix.reshape(batch_dim + (9,)), dim=-1 ) q_abs = _sqrt_positive_part( torch.stack( [ 1.0 + m00 + m11 + m22, 1.0 + m00 - m11 - m22, 1.0 - m00 + m11 - m22, 1.0 - m00 - m11 + m22, ], dim=-1, ) ) # we produce the desired quaternion multiplied by each of r, i, j, k quat_by_rijk = torch.stack( [ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and # `int`. torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1), # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and # `int`. torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1), # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and # `int`. torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1), # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and # `int`. torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1), ], dim=-2, ) # We floor here at 0.1 but the exact level is not important; if q_abs is small, # the candidate won't be picked. 
    flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
    quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))

    # if not for numerical problems, quat_candidates[i] should be same (up to a sign),
    # forall i; we pick the best-conditioned one (with the largest denominator)

    return quat_candidates[
        F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :
    ].reshape(batch_dim + (4,))


================================================
FILE: projects/uncleaned_train/motionrep/utils/camera_utils.py
================================================
import numpy as np


def normalize(x: np.ndarray) -> np.ndarray:
    """Normalization helper function."""
    return x / np.linalg.norm(x)


def viewmatrix(lookdir: np.ndarray, up: np.ndarray, position: np.ndarray) -> np.ndarray:
    """Construct lookat view matrix."""
    # Orthonormal camera frame: vec2 = look direction, vec0 = right, vec1 = up.
    vec2 = normalize(lookdir)
    vec0 = normalize(np.cross(up, vec2))
    vec1 = normalize(np.cross(vec2, vec0))
    # Columns: [right, up, look, position] — a 3x4 camera-to-world matrix.
    m = np.stack([vec0, vec1, vec2, position], axis=1)
    return m


# NOTE(review): the np.array default arguments below are shared between calls;
# they are only read here, never mutated, so this is benign — but keep it that way.
def generate_spiral_path(
    pose: np.ndarray,
    radius: float,
    lookat_pt: np.ndarray = np.array([0, 0, 0]),
    up: np.ndarray = np.array([0, 0, 1]),
    n_frames: int = 60,
    n_rots: int = 1,
    y_scale: float = 1.0,
) -> np.ndarray:
    """Calculates a forward facing spiral path for rendering."""
    x_axis = pose[:3, 0]
    y_axis = pose[:3, 1]
    campos = pose[:3, 3]

    render_poses = []
    for theta in np.linspace(0.0, 2 * np.pi * n_rots, n_frames, endpoint=False):
        # Offset the camera in the pose's x/y plane, tracing a circle/spiral.
        t = (np.cos(theta) * x_axis + y_scale * np.sin(theta) * y_axis) * radius
        position = campos + t
        z_axis = position - lookat_pt
        new_pose = np.eye(4)
        new_pose[:3] = viewmatrix(z_axis, up, position)
        render_poses.append(new_pose)
    render_poses = np.stack(render_poses, axis=0)
    return render_poses


================================================
FILE: projects/uncleaned_train/motionrep/utils/colmap_utils.py
================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
# # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. # # For inquiries contact george.drettakis@inria.fr # import numpy as np import collections import struct CameraModel = collections.namedtuple( "CameraModel", ["model_id", "model_name", "num_params"]) Camera = collections.namedtuple( "Camera", ["id", "model", "width", "height", "params"]) BaseImage = collections.namedtuple( "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"]) Point3D = collections.namedtuple( "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"]) CAMERA_MODELS = { CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3), CameraModel(model_id=1, model_name="PINHOLE", num_params=4), CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4), CameraModel(model_id=3, model_name="RADIAL", num_params=5), CameraModel(model_id=4, model_name="OPENCV", num_params=8), CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8), CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12), CameraModel(model_id=7, model_name="FOV", num_params=5), CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4), CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5), CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12) } CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model) for camera_model in CAMERA_MODELS]) CAMERA_MODEL_NAMES = dict([(camera_model.model_name, camera_model) for camera_model in CAMERA_MODELS]) def qvec2rotmat(qvec): return np.array([ [1 - 2 * qvec[2]**2 - 2 * qvec[3]**2, 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3], 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]], [2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3], 1 - 2 * qvec[1]**2 - 2 * qvec[3]**2, 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]], [2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2], 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1], 1 - 2 * qvec[1]**2 - 2 * 
qvec[2]**2]]) def rotmat2qvec(R): Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat K = np.array([ [Rxx - Ryy - Rzz, 0, 0, 0], [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0], [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0], [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0 eigvals, eigvecs = np.linalg.eigh(K) qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)] if qvec[0] < 0: qvec *= -1 return qvec class Image(BaseImage): def qvec2rotmat(self): return qvec2rotmat(self.qvec) def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"): """Read and unpack the next bytes from a binary file. :param fid: :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc. :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}. :param endian_character: Any of {@, =, <, >, !} :return: Tuple of read and unpacked values. """ data = fid.read(num_bytes) return struct.unpack(endian_character + format_char_sequence, data) def read_points3D_text(path): """ see: src/base/reconstruction.cc void Reconstruction::ReadPoints3DText(const std::string& path) void Reconstruction::WritePoints3DText(const std::string& path) """ xyzs = None rgbs = None errors = None num_points = 0 with open(path, "r") as fid: while True: line = fid.readline() if not line: break line = line.strip() if len(line) > 0 and line[0] != "#": num_points += 1 xyzs = np.empty((num_points, 3)) rgbs = np.empty((num_points, 3)) errors = np.empty((num_points, 1)) count = 0 with open(path, "r") as fid: while True: line = fid.readline() if not line: break line = line.strip() if len(line) > 0 and line[0] != "#": elems = line.split() xyz = np.array(tuple(map(float, elems[1:4]))) rgb = np.array(tuple(map(int, elems[4:7]))) error = np.array(float(elems[7])) xyzs[count] = xyz rgbs[count] = rgb errors[count] = error count += 1 return xyzs, rgbs, errors def read_points3D_binary(path_to_model_file): """ see: src/base/reconstruction.cc void Reconstruction::ReadPoints3DBinary(const std::string& 
path) void Reconstruction::WritePoints3DBinary(const std::string& path) """ with open(path_to_model_file, "rb") as fid: num_points = read_next_bytes(fid, 8, "Q")[0] xyzs = np.empty((num_points, 3)) rgbs = np.empty((num_points, 3)) errors = np.empty((num_points, 1)) for p_id in range(num_points): binary_point_line_properties = read_next_bytes( fid, num_bytes=43, format_char_sequence="QdddBBBd") xyz = np.array(binary_point_line_properties[1:4]) rgb = np.array(binary_point_line_properties[4:7]) error = np.array(binary_point_line_properties[7]) track_length = read_next_bytes( fid, num_bytes=8, format_char_sequence="Q")[0] track_elems = read_next_bytes( fid, num_bytes=8*track_length, format_char_sequence="ii"*track_length) xyzs[p_id] = xyz rgbs[p_id] = rgb errors[p_id] = error return xyzs, rgbs, errors def read_intrinsics_text(path): """ Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py """ cameras = {} with open(path, "r") as fid: while True: line = fid.readline() if not line: break line = line.strip() if len(line) > 0 and line[0] != "#": elems = line.split() camera_id = int(elems[0]) model = elems[1] assert model == "PINHOLE", "While the loader support other types, the rest of the code assumes PINHOLE" width = int(elems[2]) height = int(elems[3]) params = np.array(tuple(map(float, elems[4:]))) cameras[camera_id] = Camera(id=camera_id, model=model, width=width, height=height, params=params) return cameras def read_extrinsics_binary(path_to_model_file): """ see: src/base/reconstruction.cc void Reconstruction::ReadImagesBinary(const std::string& path) void Reconstruction::WriteImagesBinary(const std::string& path) """ images = {} with open(path_to_model_file, "rb") as fid: num_reg_images = read_next_bytes(fid, 8, "Q")[0] for _ in range(num_reg_images): binary_image_properties = read_next_bytes( fid, num_bytes=64, format_char_sequence="idddddddi") image_id = binary_image_properties[0] qvec = np.array(binary_image_properties[1:5]) tvec 
= np.array(binary_image_properties[5:8]) camera_id = binary_image_properties[8] image_name = "" current_char = read_next_bytes(fid, 1, "c")[0] while current_char != b"\x00": # look for the ASCII 0 entry image_name += current_char.decode("utf-8") current_char = read_next_bytes(fid, 1, "c")[0] num_points2D = read_next_bytes(fid, num_bytes=8, format_char_sequence="Q")[0] x_y_id_s = read_next_bytes(fid, num_bytes=24*num_points2D, format_char_sequence="ddq"*num_points2D) xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])), tuple(map(float, x_y_id_s[1::3]))]) point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3]))) images[image_id] = Image( id=image_id, qvec=qvec, tvec=tvec, camera_id=camera_id, name=image_name, xys=xys, point3D_ids=point3D_ids) return images def read_intrinsics_binary(path_to_model_file): """ see: src/base/reconstruction.cc void Reconstruction::WriteCamerasBinary(const std::string& path) void Reconstruction::ReadCamerasBinary(const std::string& path) """ cameras = {} with open(path_to_model_file, "rb") as fid: num_cameras = read_next_bytes(fid, 8, "Q")[0] for _ in range(num_cameras): camera_properties = read_next_bytes( fid, num_bytes=24, format_char_sequence="iiQQ") camera_id = camera_properties[0] model_id = camera_properties[1] model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name width = camera_properties[2] height = camera_properties[3] num_params = CAMERA_MODEL_IDS[model_id].num_params params = read_next_bytes(fid, num_bytes=8*num_params, format_char_sequence="d"*num_params) cameras[camera_id] = Camera(id=camera_id, model=model_name, width=width, height=height, params=np.array(params)) assert len(cameras) == num_cameras return cameras def read_extrinsics_text(path): """ Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py """ images = {} with open(path, "r") as fid: while True: line = fid.readline() if not line: break line = line.strip() if len(line) > 0 and line[0] != "#": elems = line.split() 
image_id = int(elems[0]) qvec = np.array(tuple(map(float, elems[1:5]))) tvec = np.array(tuple(map(float, elems[5:8]))) camera_id = int(elems[8]) image_name = elems[9] elems = fid.readline().split() xys = np.column_stack([tuple(map(float, elems[0::3])), tuple(map(float, elems[1::3]))]) point3D_ids = np.array(tuple(map(int, elems[2::3]))) images[image_id] = Image( id=image_id, qvec=qvec, tvec=tvec, camera_id=camera_id, name=image_name, xys=xys, point3D_ids=point3D_ids) return images def read_colmap_bin_array(path): """ Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_dense.py :param path: path to the colmap binary file. :return: nd array with the floating point values in the value """ with open(path, "rb") as fid: width, height, channels = np.genfromtxt(fid, delimiter="&", max_rows=1, usecols=(0, 1, 2), dtype=int) fid.seek(0) num_delimiter = 0 byte = fid.read(1) while True: if byte == b"&": num_delimiter += 1 if num_delimiter >= 3: break byte = fid.read(1) array = np.fromfile(fid, np.float32) array = array.reshape((width, height, channels), order="F") return np.transpose(array, (1, 0, 2)).squeeze() ================================================ FILE: projects/uncleaned_train/motionrep/utils/config.py ================================================ from omegaconf import OmegaConf def load_config_with_merge(config_path: str): cfg = OmegaConf.load(config_path) path_ = cfg.get("_base", None) if path_ is not None: print(f"Merging base config from {path_}") cfg = OmegaConf.merge(load_config_with_merge(path_), cfg) else: return cfg return cfg def merge_without_none(base_cfg, override_cfg): for key, value in override_cfg.items(): if value is not None: base_cfg[key] = value elif not (key in base_cfg): base_cfg[key] = None return base_cfg def create_config(config_path, args, cli_args: list = []): """ Args: config_path: path to config file args: argparse object with known variables cli_args: list of cli args in the format of ["lr=0.1", 
"model.name=alexnet"] """ # recursively merge base config cfg = load_config_with_merge(config_path) # parse cli args, and merge them into cfg cli_conf = OmegaConf.from_cli(cli_args) arg_cfg = OmegaConf.create(vars(args)) # drop None in arg_cfg arg_cfg = OmegaConf.merge(arg_cfg, cli_conf) # cfg = OmegaConf.merge(cfg, arg_cfg, cli_conf) cfg = merge_without_none(cfg, arg_cfg) return cfg ================================================ FILE: projects/uncleaned_train/motionrep/utils/dct.py ================================================ """ Code from https://github.com/zh217/torch-dct/blob/master/torch_dct/_dct.py """ import numpy as np import torch import torch.nn as nn import torch.fft def dct1_rfft_impl(x): return torch.view_as_real(torch.fft.rfft(x, dim=1)) def dct_fft_impl(v): return torch.view_as_real(torch.fft.fft(v, dim=1)) def idct_irfft_impl(V): return torch.fft.irfft(torch.view_as_complex(V), n=V.shape[1], dim=1) def dct(x, norm=None): """ Discrete Cosine Transform, Type II (a.k.a. the DCT) For the meaning of the parameter `norm`, see: https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html if norm is None: N-1 y[k] = 2* sum x[n]*cos(pi*k*(2n+1)/(2*N)), 0 <= k < N. 
n=0 :param x: the input signal :param norm: the normalization, None or 'ortho' :return: the DCT-II of the signal over the last dimension """ x_shape = x.shape N = x_shape[-1] x = x.contiguous().view(-1, N) v = torch.cat([x[:, ::2], x[:, 1::2].flip([1])], dim=1) Vc = dct_fft_impl(v) k = -torch.arange(N, dtype=x.dtype, device=x.device)[None, :] * np.pi / (2 * N) W_r = torch.cos(k) W_i = torch.sin(k) V = Vc[:, :, 0] * W_r - Vc[:, :, 1] * W_i if norm == "ortho": V[:, 0] /= np.sqrt(N) * 2 V[:, 1:] /= np.sqrt(N / 2) * 2 V = 2 * V.view(*x_shape) return V def idct(X, norm=None): """ The inverse to DCT-II, which is a scaled Discrete Cosine Transform, Type III Our definition of idct is that idct(dct(x)) == x For the meaning of the parameter `norm`, see: https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html :param X: the input signal :param norm: the normalization, None or 'ortho' :return: the inverse DCT-II of the signal over the last dimension """ x_shape = X.shape N = x_shape[-1] X_v = X.contiguous().view(-1, x_shape[-1]) / 2 if norm == "ortho": X_v[:, 0] *= np.sqrt(N) * 2 X_v[:, 1:] *= np.sqrt(N / 2) * 2 k = ( torch.arange(x_shape[-1], dtype=X.dtype, device=X.device)[None, :] * np.pi / (2 * N) ) W_r = torch.cos(k) W_i = torch.sin(k) V_t_r = X_v V_t_i = torch.cat([X_v[:, :1] * 0, -X_v.flip([1])[:, :-1]], dim=1) V_r = V_t_r * W_r - V_t_i * W_i V_i = V_t_r * W_i + V_t_i * W_r V = torch.cat([V_r.unsqueeze(2), V_i.unsqueeze(2)], dim=2) v = idct_irfft_impl(V) x = v.new_zeros(v.shape) x[:, ::2] += v[:, : N - (N // 2)] x[:, 1::2] += v.flip([1])[:, : N // 2] return x.view(*x_shape) def dct_3d(x, norm=None): """ 3-dimentional Discrete Cosine Transform, Type II (a.k.a. 
the DCT) For the meaning of the parameter `norm`, see: https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html :param x: the input signal :param norm: the normalization, None or 'ortho' :return: the DCT-II of the signal over the last 3 dimensions """ X1 = dct(x, norm=norm) X2 = dct(X1.transpose(-1, -2), norm=norm) X3 = dct(X2.transpose(-1, -3), norm=norm) return X3.transpose(-1, -3).transpose(-1, -2) def idct_3d(X, norm=None): """ The inverse to 3D DCT-II, which is a scaled Discrete Cosine Transform, Type III Our definition of idct is that idct_3d(dct_3d(x)) == x For the meaning of the parameter `norm`, see: https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html :param X: the input signal :param norm: the normalization, None or 'ortho' :return: the DCT-II of the signal over the last 3 dimensions """ x1 = idct(X, norm=norm) x2 = idct(x1.transpose(-1, -2), norm=norm) x3 = idct(x2.transpose(-1, -3), norm=norm) return x3.transpose(-1, -3).transpose(-1, -2) def code_test_dct3d(): # init a tensor of shape [100, 20, 3] x = torch.rand(100, 20, 3) dct_coef = dct_3d(x, norm="ortho") print("inp signal shape: ", x.shape, " dct coef shape: ", dct_coef.shape) x_recon = idct_3d(dct_coef, norm="ortho") print("inp signal shape: ", x.shape, " recon signal shape: ", x_recon.shape) print("max error: ", torch.max(torch.abs(x - x_recon))) dct_coef[:, 0, :] = 0 x_recon = idct_3d(dct_coef, norm="ortho") print("max error after removing first order: ", torch.max(torch.abs(x - x_recon))) def unwarp_phase(phase, frequency_array): phase_lambda = torch.pi / frequency_array phase = phase + phase_lambda num_unwarp = phase // (2.0 * phase_lambda) phase = phase - num_unwarp * phase_lambda * 2.0 phase = phase - phase_lambda return phase def get_mag_phase(fft_weights, s=3.0 / 16.0): """ Args: fft_weights: [*bs, numK * 2, 3/2] # [B**, numK, 2] Returns: mag_phase: [*bs, numK * 2, 3/2] """ num_K = fft_weights.shape[-2] // 2 # [num_k, 1] k_list = 
torch.arange(1, num_K + 1, device=fft_weights.device).unsqueeze(-1) # k_list = torch.ones_like(k_list) # need to fix this k_list = torch.pi * 2 * k_list * s _t_shape = fft_weights.shape[:-2] + (num_K, 1) k_list.expand(_t_shape) # [B**, numK, 1] # [*bs, numK, 3/2] a, b = torch.split(fft_weights, num_K, dim=-2) # [B**, numK, 3/2] mag = torch.sqrt(a**2 + b**2 + 1e-10) sin_k_theta = -1.0 * b / (mag.detach()) # Do I need to detach? cos_k_theta = a / (mag.detach()) # Do I need to detach here? # [-pi, pi] k_theta = torch.atan2(sin_k_theta, cos_k_theta) theta = k_theta / k_list # [B**, numK * 2, 3/2] mag_phase = torch.cat([mag, theta], dim=-2) return mag_phase def get_fft_from_mag_phase(mag_phase, s=3.0 / 16.0): """ Args: mag_phase: [*bs, numK * 2, 3/2] # [B**, numK, 2] Returns: fft_weights: [*bs, numK * 2, 3/2] """ num_K = mag_phase.shape[-2] // 2 k_list = torch.arange(1, num_K + 1, device=mag_phase.device).unsqueeze(-1) # k_list = torch.ones_like(k_list) # need to fix this k_list = torch.pi * 2 * k_list * s # scale to get frequency _t_shape = mag_phase.shape[:-2] + (num_K, 1) k_list.expand(_t_shape) # [B**, numK, 1] # [*bs, numK, 3/2] mag, phase = torch.split(mag_phase, num_K, dim=-2) theta = phase * k_list a = mag * torch.cos(theta) b = -1.0 * mag * torch.sin(theta) # [B**, numK * 2, 3/2] fft_weights = torch.cat([a, b], dim=-2) return fft_weights def get_displacements_from_fft_coeffs(fft_coe, t, s=3.0 / 16.0): """ Args: fft_coe: [*bs, numK * 2, 3/2] t: [*bs, 1] Returns: disp = a * cos(freq * t) - b * sin(freq * t). 
Note that some formulation use disp = a * cos(freq * t) + b * sin(freq * t) shape of disp: [*bs, 3/2] """ num_K = fft_coe.shape[-2] // 2 k_list = torch.arange(1, num_K + 1, device=fft_coe.device) # [num_K, 1] freq_array = (torch.pi * 2 * k_list * s).unsqueeze(-1) # expand front dims to match t _tmp_shape = t.shape[:-1] + freq_array.shape freq_array.expand(_tmp_shape) # [*bs, num_K, 1] cos_ = torch.cos(freq_array * t.unsqueeze(-2)) sin_ = -1.0 * torch.sin(freq_array * t.unsqueeze(-2)) # [*bs, num_K * 2] => [*bs, num_K] basis = torch.cat([cos_, sin_], dim=-2).squeeze(dim=-1) # # [*bs, num_K * 2, 3/2] => [*bs, 3/2] disp = (basis.unsqueeze(-1) * fft_coe).sum(dim=-2) return disp def bandpass_filter(signal: torch.Tensor, low_cutoff, high_cutoff, fs: int): """ Args: signal: [T, ...] low_cutoff: float high_cutoff: float fs: int """ # Apply FFT fft_signal = torch.fft.fft(signal, dim=0) freq = torch.fft.fftfreq(signal.size(0), d=1 / fs) # Bandpass filter mask = (freq <= low_cutoff) | (freq >= high_cutoff) fft_signal[mask] = 0 # Apply inverse FFT filtered_signal = torch.fft.ifft(fft_signal, dim=0) return filtered_signal.real def bandpass_filter_numpy(signal: np.ndarray, low_cutoff, high_cutoff, fs): # Apply FFT fft_signal = np.fft.fft(signal, axis=0) freq = np.fft.fftfreq(signal.shape[0], d=1 / fs) # Bandpass filter fft_signal[(freq <= low_cutoff) | (freq >= high_cutoff)] = 0 # Apply inverse FFT filtered_signal = np.fft.ifft(fft_signal, axis=0) return filtered_signal.real if __name__ == "__main__": code_test_dct3d() ================================================ FILE: projects/uncleaned_train/motionrep/utils/flow_utils.py ================================================ import numpy as np def flow_to_image(flow, display=False): """ Convert flow into middlebury color code image :param flow: optical flow map :return: optical flow image in middlebury color """ UNKNOWN_FLOW_THRESH = 100 u = flow[:, :, 0] v = flow[:, :, 1] maxu = -999. maxv = -999. minu = 999. minv = 999. 
idxUnknow = (abs(u) > UNKNOWN_FLOW_THRESH) | (abs(v) > UNKNOWN_FLOW_THRESH) u[idxUnknow] = 0 v[idxUnknow] = 0 maxu = max(maxu, np.max(u)) minu = min(minu, np.min(u)) maxv = max(maxv, np.max(v)) minv = min(minv, np.min(v)) # sqrt_rad = u**2 + v**2 rad = np.sqrt(u**2 + v**2) maxrad = max(-1, np.max(rad)) if display: print("max flow: %.4f\nflow range:\nu = %.3f .. %.3f\nv = %.3f .. %.3f" % (maxrad, minu,maxu, minv, maxv)) u = u/(maxrad + np.finfo(float).eps) v = v/(maxrad + np.finfo(float).eps) img = compute_color(u, v) idx = np.repeat(idxUnknow[:, :, np.newaxis], 3, axis=2) img[idx] = 0 return np.uint8(img) def make_color_wheel(): """ Generate color wheel according Middlebury color code :return: Color wheel """ RY = 15 YG = 6 GC = 4 CB = 11 BM = 13 MR = 6 ncols = RY + YG + GC + CB + BM + MR colorwheel = np.zeros([ncols, 3]) col = 0 # RY colorwheel[0:RY, 0] = 255 colorwheel[0:RY, 1] = np.transpose(np.floor(255*np.arange(0, RY) / RY)) col += RY # YG colorwheel[col:col+YG, 0] = 255 - np.transpose(np.floor(255*np.arange(0, YG) / YG)) colorwheel[col:col+YG, 1] = 255 col += YG # GC colorwheel[col:col+GC, 1] = 255 colorwheel[col:col+GC, 2] = np.transpose(np.floor(255*np.arange(0, GC) / GC)) col += GC # CB colorwheel[col:col+CB, 1] = 255 - np.transpose(np.floor(255*np.arange(0, CB) / CB)) colorwheel[col:col+CB, 2] = 255 col += CB # BM colorwheel[col:col+BM, 2] = 255 colorwheel[col:col+BM, 0] = np.transpose(np.floor(255*np.arange(0, BM) / BM)) col += + BM # MR colorwheel[col:col+MR, 2] = 255 - np.transpose(np.floor(255 * np.arange(0, MR) / MR)) colorwheel[col:col+MR, 0] = 255 return colorwheel def compute_color(u, v): """ compute optical flow color map :param u: optical flow horizontal map :param v: optical flow vertical map :return: optical flow in color code """ [h, w] = u.shape img = np.zeros([h, w, 3]) nanIdx = np.isnan(u) | np.isnan(v) u[nanIdx] = 0 v[nanIdx] = 0 colorwheel = make_color_wheel() ncols = np.size(colorwheel, 0) rad = np.sqrt(u**2+v**2) a = np.arctan2(-v, 
-u) / np.pi fk = (a+1) / 2 * (ncols - 1) + 1 k0 = np.floor(fk).astype(int) k1 = k0 + 1 k1[k1 == ncols+1] = 1 f = fk - k0 for i in range(0, np.size(colorwheel,1)): tmp = colorwheel[:, i] col0 = tmp[k0-1] / 255 col1 = tmp[k1-1] / 255 col = (1-f) * col0 + f * col1 idx = rad <= 1 col[idx] = 1-rad[idx]*(1-col[idx]) notidx = np.logical_not(idx) col[notidx] *= 0.75 img[:, :, i] = np.uint8(np.floor(255 * col*(1-nanIdx))) return img ================================================ FILE: projects/uncleaned_train/motionrep/utils/img_utils.py ================================================ import torch import torchvision import cv2 import numpy as np import torch.nn.functional as F from torch.autograd import Variable from math import exp def make_grid(imgs: torch.Tensor, scale=0.5): """ Args: imgs: [B, C, H, W] in [0, 1] Output: x row of images, and 3 x column of images which means 3 x ^ 2 <= B img_grid: np.ndarray, [H', W', C] """ B, C, H, W = imgs.shape num_row = int(np.sqrt(B / 3)) if num_row < 1: num_row = 1 num_col = int(np.ceil(B / num_row)) img_grid = torchvision.utils.make_grid(imgs, nrow=num_col, padding=0) img_grid = img_grid.permute(1, 2, 0).cpu().numpy() # resize by scale img_grid = cv2.resize(img_grid, None, fx=scale, fy=scale) return img_grid def compute_psnr(img1, img2, mask=None): """ Args: img1: [B, C, H, W] img2: [B, C, H, W] mask: [B, 1, H, W] or [1, 1, H, W] or None Outs: psnr: [B] """ # batch dim is preserved if mask is None: mse = (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True) else: if mask.shape[0] != img1.shape[0]: mask = mask.repeat(img1.shape[0], 1, 1, 1) if mask.shape[1] != img1.shape[1]: mask = mask.repeat(1, img1.shape[1], 1, 1) diff = ((img1 - img2)) ** 2 diff = diff * mask mse = diff.view(img1.shape[0], -1).sum(1, keepdim=True) / ( mask.view(img1.shape[0], -1).sum(1, keepdim=True) + 1e-8 ) return 20 * torch.log10(1.0 / torch.sqrt(mse)) def torch_rgb_to_gray(image): # image is [B, C, H, W] gray_image = ( 0.299 * image[:, 0, 
:, :] + 0.587 * image[:, 1, :, :] + 0.114 * image[:, 2, :, :] ) gray_image = gray_image.unsqueeze(1) return gray_image def compute_gradient_loss(pred, gt, mask=None): """ Args: pred: [B, C, H, W] gt: [B, C, H, W] mask: [B, 1, H, W] or None """ assert pred.shape == gt.shape, "a and b must have the same shape" pred = torch_rgb_to_gray(pred) gt = torch_rgb_to_gray(gt) sobel_kernel_x = torch.tensor( [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=pred.dtype, device=pred.device ) sobel_kernel_y = torch.tensor( [[-1, -2, -1], [0, 0, 0], [1, 2, 1]], dtype=pred.dtype, device=pred.device ) gradient_a_x = ( torch.nn.functional.conv2d( pred.repeat(1, 3, 1, 1), sobel_kernel_x.unsqueeze(0).unsqueeze(0).repeat(1, 3, 1, 1), padding=1, ) / 3 ) gradient_a_y = ( torch.nn.functional.conv2d( pred.repeat(1, 3, 1, 1), sobel_kernel_y.unsqueeze(0).unsqueeze(0).repeat(1, 3, 1, 1), padding=1, ) / 3 ) # gradient_a_magnitude = torch.sqrt(gradient_a_x ** 2 + gradient_a_y ** 2) gradient_b_x = ( torch.nn.functional.conv2d( gt.repeat(1, 3, 1, 1), sobel_kernel_x.unsqueeze(0).unsqueeze(0).repeat(1, 3, 1, 1), padding=1, ) / 3 ) gradient_b_y = ( torch.nn.functional.conv2d( gt.repeat(1, 3, 1, 1), sobel_kernel_y.unsqueeze(0).unsqueeze(0).repeat(1, 3, 1, 1), padding=1, ) / 3 ) # gradient_b_magnitude = torch.sqrt(gradient_b_x ** 2 + gradient_b_y ** 2) pred_grad = torch.cat([gradient_a_x, gradient_a_y], dim=1) gt_grad = torch.cat([gradient_b_x, gradient_b_y], dim=1) if mask is None: gradient_difference = torch.abs(pred_grad - gt_grad).mean() else: gradient_difference = torch.abs(pred_grad - gt_grad).mean(dim=1, keepdim=True)[ mask ].sum() / (mask.sum() + 1e-8) return gradient_difference def mark_image_with_red_squares(img): # img, torch.Tensor of shape [B, H, W, C] mark_color = torch.tensor([1.0, 0, 0], dtype=torch.float32) for x_offset in range(4): for y_offset in range(4): img[:, x_offset::16, y_offset::16, :] = mark_color return img # below for compute batched SSIM def gaussian(window_size, sigma): gauss = 
torch.Tensor( [ exp(-((x - window_size // 2) ** 2) / float(2 * sigma**2)) for x in range(window_size) ] ) return gauss / gauss.sum() def create_window(window_size, channel): _1D_window = gaussian(window_size, 1.5).unsqueeze(1) _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) window = Variable( _2D_window.expand(channel, 1, window_size, window_size).contiguous() ) return window def compute_ssim(img1, img2, window_size=11, size_average=True): channel = img1.size(-3) window = create_window(window_size, channel) if img1.is_cuda: window = window.cuda(img1.get_device()) window = window.type_as(img1) return _ssim(img1, img2, window, window_size, channel, size_average) def _ssim(img1, img2, window, window_size, channel, size_average=True): mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) mu1_sq = mu1.pow(2) mu2_sq = mu2.pow(2) mu1_mu2 = mu1 * mu2 sigma1_sq = ( F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq ) sigma2_sq = ( F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq ) sigma12 = ( F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 ) C1 = 0.01**2 C2 = 0.03**2 ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ( (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2) ) if size_average: return ssim_map.mean() else: return ssim_map.mean(1).mean(1).mean(1) # above for compute batched SSIM def compute_low_res_psnr(img1, img2, scale_factor): """ Args: img1: [B, C, H, W] img2: [B, C, H, W] scale_factor: int """ img1 = F.interpolate( img1, scale_factor=1 / scale_factor, mode="bilinear", align_corners=False ) img2 = F.interpolate( img2, scale_factor=1 / scale_factor, mode="bilinear", align_corners=False ) return compute_psnr(img1, img2) def compute_low_res_mse(img1, img2, scale_factor): """ Args: img1: [B, C, H, W] img2: [B, C, H, W] scale_factor: int """ img1 = 
F.interpolate( img1, scale_factor=1 / scale_factor, mode="bilinear", align_corners=False ) img2 = F.interpolate( img2, scale_factor=1 / scale_factor, mode="bilinear", align_corners=False ) loss_mse = F.mse_loss(img1, img2, reduction="mean") return loss_mse ================================================ FILE: projects/uncleaned_train/motionrep/utils/io_utils.py ================================================ import cv2 import imageio import numpy as np import mediapy import os import PIL def read_video_cv2(video_path, rgb=True): """Read video using cv2, return [T, 3, H, W] array, fps""" # BGR cap = cv2.VideoCapture(video_path) fps = cap.get(cv2.CAP_PROP_FPS) num_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) ret_list = [] for i in range(num_frame): ret, frame = cap.read() if ret: if rgb: frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) frame = np.transpose(frame, [2, 0, 1]) # [3, H, W] ret_list.append(frame[np.newaxis, ...]) else: break cap.release() ret_array = np.concatenate(ret_list, axis=0) # [T, 3, H, W] return ret_array, fps def save_video_cv2(video_path, img_list, fps): # BGR if len(img_list) == 0: return h, w = img_list[0].shape[:2] fourcc = cv2.VideoWriter_fourcc( *"mp4v" ) # cv2.VideoWriter_fourcc('m', 'p', '4', 'v') writer = cv2.VideoWriter(video_path, fourcc, fps, (w, h)) for frame in img_list: writer.write(frame) writer.release() def save_video_imageio(video_path, img_list, fps): """ Img_list: [[H, W, 3]] """ if len(img_list) == 0: return writer = imageio.get_writer(video_path, fps=fps) for frame in img_list: writer.append_data(frame) writer.close() def save_gif_imageio(video_path, img_list, fps): """ Img_list: [[H, W, 3]] """ if len(img_list) == 0: return assert video_path.endswith(".gif") imageio.mimsave(video_path, img_list, format="GIF", fps=fps) def save_video_mediapy(video_frames, output_video_path: str = None, fps: int = 14): # video_frames: [N, H, W, 3] if isinstance(video_frames[0], PIL.Image.Image): video_frames = [np.array(frame) for frame 
def get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps, num_training_steps, last_epoch=-1
):
    """
    From diffusers.optimization

    Create a schedule with a learning rate that decreases linearly from the
    initial lr set in the optimizer to 0, after a warmup period during which
    it increases linearly from 0 to the initial lr set in the optimizer.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step: int):
        # Linear ramp 0 -> 1 over the warmup phase.
        if current_step < num_warmup_steps:
            return current_step / max(1, num_warmup_steps)
        # Linear decay 1 -> 0 over the remaining steps, clamped at 0.
        remaining = num_training_steps - current_step
        decay_span = max(1, num_training_steps - num_warmup_steps)
        return max(0.0, remaining / decay_span)

    return LambdaLR(optimizer, lr_lambda, last_epoch)
loaded_merged_model.named_parameters(): print(name, param.shape) if __name__ == "__main__": _code_test_peft_load_save() ================================================ FILE: projects/uncleaned_train/motionrep/utils/print_utils.py ================================================ import torch.distributed as dist def print_if_zero_rank(s): if (not dist.is_initialized()) and (dist.is_initialized() and dist.get_rank() == 0): print("### " + s) ================================================ FILE: projects/uncleaned_train/motionrep/utils/pytorch_mssim.py ================================================ import torch import torch.nn.functional as F from math import exp import numpy as np device = torch.device("cuda" if torch.cuda.is_available() else "cpu") def gaussian(window_size, sigma): gauss = torch.Tensor([exp(-(x - window_size//2)**2/float(2*sigma**2)) for x in range(window_size)]) return gauss/gauss.sum() def create_window(window_size, channel=1): _1D_window = gaussian(window_size, 1.5).unsqueeze(1) _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0).to(device) window = _2D_window.expand(channel, 1, window_size, window_size).contiguous() return window def create_window_3d(window_size, channel=1): _1D_window = gaussian(window_size, 1.5).unsqueeze(1) _2D_window = _1D_window.mm(_1D_window.t()) _3D_window = _2D_window.unsqueeze(2) @ (_1D_window.t()) window = _3D_window.expand(1, channel, window_size, window_size, window_size).contiguous().to(device) return window def ssim(img1, img2, window_size=11, window=None, size_average=True, full=False, val_range=None): # Value range can be different from 255. Other common ranges are 1 (sigmoid) and 2 (tanh). 
def ssim(img1, img2, window_size=11, window=None, size_average=True, full=False, val_range=None):
    """Compute SSIM between two image batches with a 2D Gaussian window.

    Args:
        img1, img2: (N, C, H, W) tensors of matching shape.
        window_size: Gaussian window side length (default 11).
        window: optional precomputed (C, 1, k, k) window; built from
            ``window_size`` (clamped to the image size) when omitted.
        size_average: return the scalar mean over everything when True,
            otherwise a per-sample mean.
        full: additionally return the contrast-sensitivity term.
        val_range: dynamic range L; inferred from the data when None
            (255 for >128 maxima, 2 for tanh-style [-1, 1], else 1).

    Returns:
        SSIM value(s); ``(ssim, cs)`` when ``full`` is True.
    """
    # Value range can be different from 255. Other common ranges are
    # 1 (sigmoid) and 2 (tanh).
    if val_range is None:
        max_val = 255 if torch.max(img1) > 128 else 1
        min_val = -1 if torch.min(img1) < -0.5 else 0
        L = max_val - min_val
    else:
        L = val_range

    (_, channel, height, width) = img1.size()
    if window is None:
        real_size = min(window_size, height, width)
        window = create_window(real_size, channel=channel).to(img1.device)

    # Bug fix: pad by half the *actual* window size instead of the
    # hard-coded 5, which silently assumed window_size == 11 even after
    # the window was shrunk for small images.
    half = window.shape[-1] // 2
    pad4 = (half, half, half, half)

    mu1 = F.conv2d(F.pad(img1, pad4, mode="replicate"), window, padding=0, groups=channel)
    mu2 = F.conv2d(F.pad(img2, pad4, mode="replicate"), window, padding=0, groups=channel)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    sigma1_sq = F.conv2d(F.pad(img1 * img1, pad4, "replicate"), window, padding=0, groups=channel) - mu1_sq
    sigma2_sq = F.conv2d(F.pad(img2 * img2, pad4, "replicate"), window, padding=0, groups=channel) - mu2_sq
    sigma12 = F.conv2d(F.pad(img1 * img2, pad4, "replicate"), window, padding=0, groups=channel) - mu1_mu2

    C1 = (0.01 * L) ** 2
    C2 = (0.03 * L) ** 2

    v1 = 2.0 * sigma12 + C2
    v2 = sigma1_sq + sigma2_sq + C2
    cs = torch.mean(v1 / v2)  # contrast sensitivity

    ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2)

    ret = ssim_map.mean() if size_average else ssim_map.mean(1).mean(1).mean(1)
    if full:
        return ret, cs
    return ret
def ssim_matlab(img1, img2, window_size=11, window=None, size_average=True, full=False, val_range=None):
    """Matlab-style SSIM: the color axis is treated as a third spatial axis
    and filtered with a 3D Gaussian window.

    Args:
        img1, img2: (N, C, H, W) tensors of matching shape.
        window_size: Gaussian window side length (default 11).
        window: optional precomputed (1, 1, k, k, k) window.
        size_average: scalar mean when True, per-sample means otherwise.
        full: additionally return the contrast-sensitivity term.
        val_range: dynamic range L; inferred from the data when None.
    """
    # Value range can be different from 255. Other common ranges are
    # 1 (sigmoid) and 2 (tanh).
    if val_range is None:
        max_val = 255 if torch.max(img1) > 128 else 1
        min_val = -1 if torch.min(img1) < -0.5 else 0
        L = max_val - min_val
    else:
        L = val_range

    (_, _, height, width) = img1.size()
    if window is None:
        real_size = min(window_size, height, width)
        # Channel is set to 1 since we consider color images as
        # volumetric images.
        window = create_window_3d(real_size, channel=1).to(img1.device)

    # Bug fix: pad by half the actual window size instead of the
    # hard-coded 5 (which assumed window_size == 11).
    half = window.shape[-1] // 2
    pad6 = (half,) * 6

    img1 = img1.unsqueeze(1)
    img2 = img2.unsqueeze(1)

    mu1 = F.conv3d(F.pad(img1, pad6, mode="replicate"), window, padding=0, groups=1)
    mu2 = F.conv3d(F.pad(img2, pad6, mode="replicate"), window, padding=0, groups=1)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    sigma1_sq = F.conv3d(F.pad(img1 * img1, pad6, "replicate"), window, padding=0, groups=1) - mu1_sq
    sigma2_sq = F.conv3d(F.pad(img2 * img2, pad6, "replicate"), window, padding=0, groups=1) - mu2_sq
    sigma12 = F.conv3d(F.pad(img1 * img2, pad6, "replicate"), window, padding=0, groups=1) - mu1_mu2

    C1 = (0.01 * L) ** 2
    C2 = (0.03 * L) ** 2

    v1 = 2.0 * sigma12 + C2
    v2 = sigma1_sq + sigma2_sq + C2
    cs = torch.mean(v1 / v2)  # contrast sensitivity

    ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2)

    ret = ssim_map.mean() if size_average else ssim_map.mean(1).mean(1).mean(1)
    if full:
        return ret, cs
    return ret
def msssim(img1, img2, window_size=11, size_average=True, val_range=None, normalize=False):
    """Multi-scale SSIM over 5 dyadic scales.

    Bug fix: the combination step used ``torch.prod(pow1[:-1] * pow2[-1])``,
    which broadcasts the top-scale SSIM term into every coarse-scale factor
    (raising it to the 4th power). The MS-SSIM definition multiplies the
    contrast-sensitivity terms of the first levels-1 scales by the SSIM of
    the last scale only.
    """
    device = img1.device
    weights = torch.FloatTensor([0.0448, 0.2856, 0.3001, 0.2363, 0.1333]).to(device)
    levels = weights.size()[0]
    mssim = []
    mcs = []
    for _ in range(levels):
        sim, cs = ssim(
            img1, img2, window_size=window_size, size_average=size_average,
            full=True, val_range=val_range,
        )
        mssim.append(sim)
        mcs.append(cs)
        # Move to the next (coarser) scale.
        img1 = F.avg_pool2d(img1, (2, 2))
        img2 = F.avg_pool2d(img2, (2, 2))

    mssim = torch.stack(mssim)
    mcs = torch.stack(mcs)

    # Normalize (to avoid NaNs during training unstable models, not
    # compliant with original definition).
    if normalize:
        mssim = (mssim + 1) / 2
        mcs = (mcs + 1) / 2

    pow1 = mcs ** weights
    pow2 = mssim ** weights

    # From Matlab implementation https://ece.uwaterloo.ca/~z70wang/research/iwssim/
    output = torch.prod(pow1[:-1]) * pow2[-1]
    return output


# Classes to re-use window
class SSIM(torch.nn.Module):
    """SSIM-based loss: forward returns DSSIM = (1 - SSIM) / 2."""

    def __init__(self, window_size=11, size_average=True, val_range=None):
        super(SSIM, self).__init__()
        self.window_size = window_size
        self.size_average = size_average
        self.val_range = val_range

        # Assume 3 channel for SSIM
        self.channel = 3
        self.window = create_window(window_size, channel=self.channel)

    def forward(self, img1, img2):
        (_, channel, _, _) = img1.size()

        # Reuse the cached window when channel count and dtype match;
        # otherwise rebuild on the input's device/dtype and cache it.
        if channel == self.channel and self.window.dtype == img1.dtype:
            window = self.window
        else:
            window = create_window(self.window_size, channel).to(img1.device).type(img1.dtype)
            self.window = window
            self.channel = channel

        _ssim = ssim(img1, img2, window=window, window_size=self.window_size,
                     size_average=self.size_average)
        dssim = (1 - _ssim) / 2
        return dssim


class MSSSIM(torch.nn.Module):
    """Module wrapper around :func:`msssim`."""

    def __init__(self, window_size=11, size_average=True, channel=3):
        super(MSSSIM, self).__init__()
        self.window_size = window_size
        self.size_average = size_average
        self.channel = channel

    def forward(self, img1, img2):
        return msssim(img1, img2, window_size=self.window_size,
                      size_average=self.size_average)
def init_st(version_dict, load_ckpt=True, load_filter=True):
    """Build the inference state dict for an SVD model version.

    Args:
        version_dict: mapping with "config" (OmegaConf yaml path) and
            "ckpt" (checkpoint path).
        load_ckpt: load checkpoint weights when True.
        load_filter: kept for interface compatibility; the watermark/NSFW
            filter is disabled (its import is commented out upstream).

    Returns:
        dict with keys "model", "msg", "ckpt", "config".

    Bug fix: the original returned ``state`` only when ``load_filter`` was
    True; the False path fell through to a NameError on the (unimported)
    ``DeepFloydDataFiltering`` and would otherwise have returned None.
    """
    state = dict()
    config = OmegaConf.load(version_dict["config"])
    ckpt = version_dict["ckpt"]

    model, msg = load_model_from_config(config, ckpt if load_ckpt else None)

    state["msg"] = msg
    state["model"] = model
    state["ckpt"] = ckpt if load_ckpt else None
    state["config"] = config
    # NOTE(review): filter loading intentionally disabled; see
    # scripts.util.detection.nsfw_and_watermark_dectection upstream.
    return state


def load_model_from_config(config, ckpt=None, verbose=True):
    """Instantiate the model from an OmegaConf config and optionally load
    weights from a .ckpt (Lightning) or .safetensors checkpoint.

    Returns:
        (model, msg) — msg is currently always None.
    """
    model = instantiate_from_config(config.model)
    msg = None

    if ckpt is not None:
        print(f"Loading model from {ckpt}")
        if ckpt.endswith("ckpt"):
            pl_sd = torch.load(ckpt, map_location="cpu")
            if "global_step" in pl_sd:
                print(f"Global Step: {pl_sd['global_step']}")
            sd = pl_sd["state_dict"]
        elif ckpt.endswith("safetensors"):
            sd = load_safetensors(ckpt)
        else:
            raise NotImplementedError

        # strict=False: report, but tolerate, missing/unexpected keys.
        m, u = model.load_state_dict(sd, strict=False)
        if len(m) > 0 and verbose:
            print("missing keys:")
            print(m)
        if len(u) > 0 and verbose:
            print("unexpected keys:")
            print(u)

    model = initial_model_load(model)
    # model.eval()  # ?
    return model, msg
lowvram_mode = False


def load_model(model):
    """Move the model to the default CUDA device."""
    model.cuda()


def set_lowvram_mode(mode):
    """Toggle the module-global low-VRAM flag consulted by
    initial_model_load / unload_model."""
    global lowvram_mode
    lowvram_mode = mode


def initial_model_load(model):
    """Prepare a freshly loaded model: half precision in low-VRAM mode,
    otherwise move it to the GPU. Returns the model."""
    global lowvram_mode
    if lowvram_mode:
        model.model.half()
    else:
        model.cuda()
    return model


def unload_model(model):
    """In low-VRAM mode, move the model back to CPU and drop cached CUDA
    allocations; no-op otherwise."""
    global lowvram_mode
    if lowvram_mode:
        model.cpu()
        torch.cuda.empty_cache()


def get_unique_embedder_keys_from_conditioner(conditioner):
    """De-duplicated list of input keys across the conditioner's embedders."""
    return list({embedder.input_key for embedder in conditioner.embedders})


def get_batch(keys, value_dict, N, T, device):
    """Assemble the conditioning batch (and its unconditional clone) for SVD.

    Args:
        keys: conditioning keys to materialize.
        value_dict: raw values, keyed the same way.
        N: batch-shape sequence; math.prod(N) is the flattened batch size.
        T: number of video frames, or None.
        device: target device for the tensors.

    Returns:
        (batch, batch_uc) — batch_uc holds clones of every tensor entry.
    """
    batch = {}
    flat = int(math.prod(N))

    for key in keys:
        if key in ("fps_id", "motion_bucket_id"):
            batch[key] = torch.tensor([value_dict[key]]).to(device).repeat(flat)
        elif key == "cond_aug":
            batch[key] = repeat(
                torch.tensor([value_dict["cond_aug"]]).to(device),
                "1 -> b",
                b=math.prod(N),
            )
        elif key in ("cond_frames", "cond_frames_without_noise"):
            batch[key] = repeat(value_dict[key], "1 ... -> b ...", b=N[0])
        else:
            batch[key] = value_dict[key]

    if T is not None:
        batch["num_video_frames"] = T

    batch_uc = {
        key: torch.clone(val)
        for key, val in batch.items()
        if isinstance(val, torch.Tensor)
    }
    return batch, batch_uc
-> b ...", b=N[0] ) else: batch[key] = value_dict[key] if T is not None: batch["num_video_frames"] = T for key in batch.keys(): if key not in batch_uc and isinstance(batch[key], torch.Tensor): batch_uc[key] = torch.clone(batch[key]) return batch, batch_uc if __name__ == "__main__": pass ================================================ FILE: projects/uncleaned_train/motionrep/utils/torch_utils.py ================================================ import torch import time def get_sync_time(): if torch.cuda.is_available(): torch.cuda.synchronize() return time.time() ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/convert_gaussian_to_mesh.py ================================================ import os from random import gauss from fire import Fire from motionrep.gaussian_3d.scene import GaussianModel import numpy as np import torch def convert_gaussian_to_mesh(gaussian_path, save_path=None): if save_path is None: dir_path = os.path.dirname(gaussian_path) save_path = os.path.join(dir_path, "gaussian_to_mesh.obj") gaussian_path = os.path.join(gaussian_path) gaussians = GaussianModel(3) gaussians.load_ply(gaussian_path) gaussians.detach_grad() print( "load gaussians from: {}".format(gaussian_path), "... num gaussians: ", gaussians._xyz.shape[0], ) mesh = gaussians.extract_mesh( save_path, density_thresh=1, resolution=128, decimate_target=1e5 ) mesh.write(save_path) def internal_filling(gaussian_path, save_path=None, resolution=64): if save_path is None: dir_path = os.path.dirname(gaussian_path) save_path = os.path.join(dir_path, "gaussian_internal_fill.ply") gaussians = GaussianModel(3) gaussians.load_ply(gaussian_path) gaussians.detach_grad() print( "load gaussians from: {}".format(gaussian_path), "... 
num gaussians: ", gaussians._xyz.shape[0], ) # [res, res, res] occ = ( gaussians.extract_fields(resolution=resolution, num_blocks=16, relax_ratio=1.5) .detach() .cpu() .numpy() ) xyzs = gaussians._xyz.detach().cpu().numpy() center = gaussians.center.detach().cpu().numpy() scale = gaussians.scale # float xyzs = (xyzs - center) * scale # [-1.5, 1.5]? percentile = [82, 84, 86][1] # from IPython import embed # embed() thres = np.percentile(occ, percentile) print("density threshold: {:.2f} -- in percentile: {:.1f} ".format(thres, percentile)) occ_large_thres = occ > thres # get the xyz of the occupied voxels # xyz = np.argwhere(occ) # normalize to [-1, 1] # xyz = xyz / (resolution - 1) * 2 - 1 voxel_counts = np.zeros((resolution, resolution, resolution)) points_xyzindex = ((xyzs + 1) / 2 * (resolution - 1)).astype(np.uint32) for x, y, z in points_xyzindex: voxel_counts[x, y, z] += 1 add_points = np.logical_and(occ_large_thres, voxel_counts <= 1) add_xyz = np.argwhere(add_points).astype(np.float32) add_xyz = add_xyz / (resolution - 1) * 2 - 1 all_xyz = np.concatenate([xyzs, add_xyz], axis=0) print("added points: ", add_xyz.shape[0]) # save to ply import point_cloud_utils as pcu pcu.save_mesh_vf(save_path, all_xyz, np.zeros((0, 3), dtype=np.int32)) add_path = os.path.join(os.path.dirname(save_path), "extra_filled_points.ply") pcu.save_mesh_vf(add_path, add_xyz, np.zeros((0, 3), dtype=np.int32)) if __name__ == "__main__": # Fire(convert_gaussian_to_mesh) Fire(internal_filling) ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/diff_warp_utils.py ================================================ import warp as wp import warp.torch import torch from typing import Optional, Union, Sequence, Any from torch import Tensor from warp_rewrite import from_torch_safe @wp.struct class MPMStateStruct(object): ###### essential ##### # particle particle_x: wp.array(dtype=wp.vec3) # current position particle_v: 
@wp.struct
class MPMStateStruct(object):
    """Per-particle and per-grid state of the differentiable MPM solver.

    Fields are Warp arrays; ``init`` allocates them, ``from_torch`` /
    ``reset_state`` fill them from PyTorch tensors.
    """

    ###### essential #####
    # particle
    particle_x: wp.array(dtype=wp.vec3)  # current position
    particle_v: wp.array(dtype=wp.vec3)  # particle velocity
    particle_F: wp.array(dtype=wp.mat33)  # particle elastic deformation gradient
    particle_init_cov: wp.array(dtype=float)  # initial covariance matrix (6 floats per particle)
    particle_cov: wp.array(dtype=float)  # current covariance matrix (6 floats per particle)
    particle_F_trial: wp.array(
        dtype=wp.mat33
    )  # apply return mapping on this to obtain elastic def grad
    particle_R: wp.array(dtype=wp.mat33)  # rotation matrix
    particle_stress: wp.array(dtype=wp.mat33)  # Kirchoff stress, elastic stress
    particle_C: wp.array(dtype=wp.mat33)  # APIC affine velocity field
    particle_vol: wp.array(dtype=float)  # current volume
    particle_mass: wp.array(dtype=float)  # mass
    particle_density: wp.array(dtype=float)  # density
    particle_Jp: wp.array(dtype=float)  # plastic volume ratio
    particle_selection: wp.array(
        dtype=int
    )  # only particle_selection[p] = 0 will be simulated

    # grid
    grid_m: wp.array(dtype=float, ndim=3)  # grid node mass
    grid_v_in: wp.array(dtype=wp.vec3, ndim=3)  # grid node momentum/velocity
    grid_v_out: wp.array(
        dtype=wp.vec3, ndim=3
    )  # grid node momentum/velocity, after grid update

    def init(
        self,
        shape: Union[Sequence[int], int],
        device: wp.context.Devicelike = None,
        requires_grad=False,
    ) -> None:
        """Allocate all particle arrays for ``shape`` particles.

        Covariance / volume / mass / density arrays never carry gradients;
        the remaining kinematic arrays follow ``requires_grad``. The grid
        arrays are placeholders re-allocated later by ``init_grid``.
        """
        # shape default is int. number of particles
        self.particle_x = wp.empty(
            shape, dtype=wp.vec3, device=device, requires_grad=requires_grad
        )
        self.particle_v = wp.zeros(
            shape, dtype=wp.vec3, device=device, requires_grad=requires_grad
        )
        self.particle_F = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        # covariances are stored flattened: 6 floats per particle
        self.particle_init_cov = wp.zeros(
            shape * 6, dtype=float, device=device, requires_grad=False
        )
        self.particle_cov = wp.zeros(
            shape * 6, dtype=float, device=device, requires_grad=False
        )
        self.particle_F_trial = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_R = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_stress = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_C = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_vol = wp.zeros(
            shape, dtype=float, device=device, requires_grad=False
        )
        self.particle_mass = wp.zeros(
            shape, dtype=float, device=device, requires_grad=False
        )
        self.particle_density = wp.zeros(
            shape, dtype=float, device=device, requires_grad=False
        )
        self.particle_Jp = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )
        self.particle_selection = wp.zeros(
            shape, dtype=int, device=device, requires_grad=requires_grad
        )

        # grid: will init later
        self.grid_m = wp.empty(
            (10, 10, 10), dtype=float, device=device, requires_grad=requires_grad
        )
        self.grid_v_in = wp.zeros(
            (10, 10, 10), dtype=wp.vec3, device=device, requires_grad=requires_grad
        )
        self.grid_v_out = wp.zeros(
            (10, 10, 10), dtype=wp.vec3, device=device, requires_grad=requires_grad
        )

    def init_grid(
        self, grid_res: int, device: wp.context.Devicelike = None, requires_grad=False
    ):
        """(Re-)allocate the cubic background grid at ``grid_res``^3 nodes."""
        self.grid_m = wp.zeros(
            (grid_res, grid_res, grid_res),
            dtype=float,
            device=device,
            requires_grad=False,
        )
        self.grid_v_in = wp.zeros(
            (grid_res, grid_res, grid_res),
            dtype=wp.vec3,
            device=device,
            requires_grad=requires_grad,
        )
        self.grid_v_out = wp.zeros(
            (grid_res, grid_res, grid_res),
            dtype=wp.vec3,
            device=device,
            requires_grad=requires_grad,
        )

    def from_torch(
        self,
        tensor_x: Tensor,
        tensor_volume: Tensor,
        tensor_cov: Optional[Tensor] = None,
        tensor_velocity: Optional[Tensor] = None,
        n_grid: int = 100,
        grid_lim=1.0,
        device="cuda:0",
        requires_grad=True,
    ):
        """Populate particle state from PyTorch tensors and set up the grid.

        tensor_x: (n, 3) positions; tensor_volume: (n,) volumes;
        tensor_cov: covariances reshaped to (n*6,); tensor_velocity: (n, 3).
        Deformation-gradient trials are initialized to identity.
        """
        num_dim, n_particles = tensor_x.shape[1], tensor_x.shape[0]
        assert tensor_x.shape[0] == tensor_volume.shape[0]
        # assert tensor_x.shape[0] == tensor_cov.reshape(-1, 6).shape[0]
        self.init_grid(grid_res=n_grid, device=device, requires_grad=requires_grad)
        if tensor_x is not None:
            self.particle_x = from_torch_safe(
                tensor_x.contiguous().detach().clone(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )
        if tensor_volume is not None:
            print(self.particle_vol.shape, tensor_volume.shape)
            # round-trip through numpy: volumes never need gradients
            volume_numpy = tensor_volume.detach().cpu().numpy()
            self.particle_vol = wp.from_numpy(
                volume_numpy, dtype=float, device=device, requires_grad=False
            )
        if tensor_cov is not None:
            cov_numpy = tensor_cov.reshape(-1).detach().clone().cpu().numpy()
            self.particle_cov = wp.from_numpy(
                cov_numpy, dtype=float, device=device, requires_grad=False
            )
            # remember the loaded covariance so reset_state can restore it
            self.particle_init_cov = self.particle_cov

        if tensor_velocity is not None:
            self.particle_v = from_torch_safe(
                tensor_velocity.contiguous().detach().clone(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )

        # initial deformation gradient is set to identity
        wp.launch(
            kernel=set_mat33_to_identity,
            dim=n_particles,
            inputs=[self.particle_F_trial],
            device=device,
        )
        # initial trial deformation gradient is set to identity
        print("Particles initialized from torch data.")
        print("Total particles: ", n_particles)

    def reset_state(
        self,
        tensor_x: Tensor,
        tensor_cov: Optional[Tensor] = None,
        tensor_velocity: Optional[Tensor] = None,
        device="cuda:0",
        requires_grad=True,
    ):
        """Reset positions/velocities and zero the per-step particle buffers
        (C, stress, R); deformation gradients go back to identity."""
        # reset p_c, p_v, p_C, p_F_trial
        num_dim, n_particles = tensor_x.shape[1], tensor_x.shape[0]
        # assert tensor_x.shape[0] == tensor_cov.reshape(-1, 6).shape[0]
        if tensor_x is not None:
            self.particle_x = from_torch_safe(
                tensor_x.contiguous().detach(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )
        if tensor_cov is not None:
            cov_numpy = tensor_cov.reshape(-1).detach().clone().cpu().numpy()
            self.particle_cov = wp.from_numpy(
                cov_numpy, dtype=float, device=device, requires_grad=False
            )
            # NOTE(review): this immediately overwrites the covariance loaded
            # just above with the saved initial covariance, so tensor_cov is
            # effectively discarded here — confirm whether that is intended.
            self.particle_cov = self.particle_init_cov

        if tensor_velocity is not None:
            self.particle_v = from_torch_safe(
                tensor_velocity.contiguous().detach().clone(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )

        # initial deformation gradient is set to identity
        wp.launch(
            kernel=set_mat33_to_identity,
            dim=n_particles,
            inputs=[self.particle_F_trial],
            device=device,
        )
        wp.launch(
            kernel=set_mat33_to_identity,
            dim=n_particles,
            inputs=[self.particle_F],
            device=device,
        )
        wp.launch(
            kernel=set_mat33_to_zero,
            dim=n_particles,
            inputs=[self.particle_C],
            device=device,
        )
        wp.launch(
            kernel=set_mat33_to_zero,
            dim=n_particles,
            inputs=[self.particle_stress],
            device=device,
        )
        wp.launch(
            kernel=set_mat33_to_zero,
            dim=n_particles,
            inputs=[self.particle_R],
            device=device,
        )

    def set_require_grad(self, requires_grad=True):
        """Toggle gradient tracking on the differentiated arrays."""
        self.particle_x.requires_grad = requires_grad
        self.particle_v.requires_grad = requires_grad
        self.particle_F.requires_grad = requires_grad
        self.particle_F_trial.requires_grad = requires_grad
        self.particle_stress.requires_grad = requires_grad
        self.grid_v_out.requires_grad = requires_grad
        self.grid_v_in.requires_grad = requires_grad
@wp.struct
class ParticleStateStruct(object):
    """Particle-only MPM state (no background grid).

    Fixes relative to the original:
      * ``particle_stress`` is now declared — ``init`` and
        ``set_require_grad`` assigned it without a field declaration,
        which Warp structs do not allow.
      * ``from_torch`` saved the loaded covariance into
        ``particle_init_cov`` (matching ``MPMStateStruct.from_torch``);
        the original assignment ran the other way and clobbered the
        freshly loaded covariance with the never-filled init array.
      * the velocity grad check raises with the correct message.
    """

    ###### essential #####
    # particle
    particle_x: wp.array(dtype=wp.vec3)  # current position
    particle_v: wp.array(dtype=wp.vec3)  # particle velocity
    particle_F: wp.array(dtype=wp.mat33)  # particle elastic deformation gradient
    particle_init_cov: wp.array(dtype=float)  # initial covariance matrix
    particle_cov: wp.array(dtype=float)  # current covariance matrix
    particle_F_trial: wp.array(
        dtype=wp.mat33
    )  # apply return mapping on this to obtain elastic def grad
    particle_stress: wp.array(dtype=wp.mat33)  # Kirchoff stress, elastic stress
    particle_C: wp.array(dtype=wp.mat33)
    particle_vol: wp.array(dtype=float)  # current volume
    particle_selection: wp.array(
        dtype=int
    )  # only particle_selection[p] = 0 will be simulated

    def init(
        self,
        shape: Union[Sequence[int], int],
        device: wp.context.Devicelike = None,
        requires_grad=False,
    ) -> None:
        """Allocate all particle arrays for ``shape`` particles."""
        # shape default is int. number of particles
        self.particle_x = wp.empty(
            shape, dtype=wp.vec3, device=device, requires_grad=requires_grad
        )
        self.particle_v = wp.zeros(
            shape, dtype=wp.vec3, device=device, requires_grad=requires_grad
        )
        self.particle_F = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        # covariances are stored flattened: 6 floats per particle
        self.particle_init_cov = wp.zeros(
            shape * 6, dtype=float, device=device, requires_grad=requires_grad
        )
        self.particle_cov = wp.zeros(
            shape * 6, dtype=float, device=device, requires_grad=requires_grad
        )
        self.particle_F_trial = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_stress = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_C = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_vol = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )
        self.particle_selection = wp.zeros(
            shape, dtype=int, device=device, requires_grad=requires_grad
        )

    def from_torch(
        self,
        tensor_x: Tensor,
        tensor_volume: Tensor,
        tensor_cov: Optional[Tensor] = None,
        tensor_velocity: Optional[Tensor] = None,
        n_grid: int = 100,
        grid_lim=1.0,
        device="cuda:0",
        requires_grad=True,
    ):
        """Populate particle state from PyTorch tensors.

        Inputs must not carry autograd history — detached copies are
        aliased into Warp arrays.
        """
        num_dim, n_particles = tensor_x.shape[1], tensor_x.shape[0]
        assert tensor_x.shape[0] == tensor_volume.shape[0]
        # assert tensor_x.shape[0] == tensor_cov.reshape(-1, 6).shape[0]

        if tensor_x is not None:
            if tensor_x.requires_grad:
                raise RuntimeError("tensor_x requires grad")
            self.particle_x = from_torch_safe(
                tensor_x.contiguous().detach(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )
        if tensor_volume is not None:
            print(self.particle_vol.shape, tensor_volume.shape)
            # round-trip through numpy: volumes never need gradients
            volume_numpy = tensor_volume.detach().cpu().numpy()
            self.particle_vol = wp.from_numpy(
                volume_numpy, dtype=float, device=device, requires_grad=False
            )
        if tensor_cov is not None:
            cov_numpy = tensor_cov.reshape(-1).detach().clone().cpu().numpy()
            self.particle_cov = wp.from_numpy(
                cov_numpy, dtype=float, device=device, requires_grad=False
            )
            # remember the loaded covariance as the initial covariance
            # (mirrors MPMStateStruct.from_torch)
            self.particle_init_cov = self.particle_cov

        if tensor_velocity is not None:
            if tensor_velocity.requires_grad:
                raise RuntimeError("tensor_velocity requires grad")
            self.particle_v = from_torch_safe(
                tensor_velocity.contiguous().detach(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )

        # initial (trial) deformation gradient is set to identity
        wp.launch(
            kernel=set_mat33_to_identity,
            dim=n_particles,
            inputs=[self.particle_F_trial],
            device=device,
        )
        print("Particles initialized from torch data.")
        print("Total particles: ", n_particles)

    def set_require_grad(self, requires_grad=True):
        """Toggle gradient tracking on the differentiated arrays."""
        self.particle_x.requires_grad = requires_grad
        self.particle_v.requires_grad = requires_grad
        self.particle_F.requires_grad = requires_grad
        self.particle_F_trial.requires_grad = requires_grad
        self.particle_stress.requires_grad = requires_grad
@wp.struct
class MPMModelStruct(object):
    """Material and discretization parameters of the MPM model.

    Fix relative to the original: the ``Impulse_modifier`` struct below
    declared ``point: wp.vec3`` twice; the duplicate is removed.
    """

    ####### essential #######
    grid_lim: float
    n_particles: int
    n_grid: int
    dx: float
    inv_dx: float
    grid_dim_x: int
    grid_dim_y: int
    grid_dim_z: int
    mu: wp.array(dtype=float)
    lam: wp.array(dtype=float)
    E: wp.array(dtype=float)  # per-particle Young's modulus
    nu: wp.array(dtype=float)  # per-particle Poisson's ratio
    material: int

    ######## for plasticity ####
    yield_stress: wp.array(dtype=float)
    friction_angle: float
    alpha: float
    gravitational_accelaration: wp.vec3
    hardening: float
    xi: float
    plastic_viscosity: float
    softening: float

    ####### for damping
    rpic_damping: float
    grid_v_damping_scale: float

    ####### for PhysGaussian: covariance
    update_cov_with_F: int

    def init(
        self,
        shape: Union[Sequence[int], int],
        device: wp.context.Devicelike = None,
        requires_grad=False,
    ) -> None:
        """Allocate the per-particle material parameter arrays."""
        self.E = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )  # young's modulus
        self.nu = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )  # poisson's ratio
        self.mu = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )
        self.lam = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )
        self.yield_stress = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )

    def finalize_mu_lam(self, n_particles, device="cuda:0"):
        """Derive the Lamé parameters (mu, lam) from E and nu per particle."""
        wp.launch(
            kernel=compute_mu_lam_from_E_nu_clean,
            dim=n_particles,
            inputs=[self.mu, self.lam, self.E, self.nu],
            device=device,
        )

    def init_other_params(self, n_grid=100, grid_lim=1.0, device="cuda:0"):
        """Set grid geometry and the default scalar material parameters."""
        self.grid_lim = grid_lim
        self.n_grid = n_grid
        self.grid_dim_x = n_grid
        self.grid_dim_y = n_grid
        self.grid_dim_z = n_grid
        self.dx, self.inv_dx = self.grid_lim / self.n_grid, float(
            n_grid / grid_lim
        )  # [0-1]?

        self.update_cov_with_F = False

        # material is used to switch between different elastoplastic models.
        # 0 is jelly
        self.material = 0

        self.plastic_viscosity = 0.0
        self.softening = 0.1
        self.friction_angle = 25.0
        sin_phi = wp.sin(self.friction_angle / 180.0 * 3.14159265)
        self.alpha = wp.sqrt(2.0 / 3.0) * 2.0 * sin_phi / (3.0 - sin_phi)

        self.gravitational_accelaration = wp.vec3(0.0, 0.0, 0.0)

        self.rpic_damping = 0.0  # 0.0 if no damping (apic). -1 if pic
        self.grid_v_damping_scale = 1.1  # globally applied; >1 means no damping

    def from_torch(
        self, tensor_E: Tensor, tensor_nu: Tensor, device="cuda:0", requires_grad=False
    ):
        """Alias per-particle E/nu from torch tensors and derive mu/lam."""
        self.E = wp.from_torch(tensor_E.contiguous(), requires_grad=requires_grad)
        self.nu = wp.from_torch(tensor_nu.contiguous(), requires_grad=requires_grad)
        n_particles = tensor_E.shape[0]
        self.finalize_mu_lam(n_particles=n_particles, device=device)

    def set_require_grad(self, requires_grad=True):
        """Toggle gradient tracking on the material parameter arrays."""
        self.E.requires_grad = requires_grad
        self.nu.requires_grad = requires_grad
        self.mu.requires_grad = requires_grad
        self.lam.requires_grad = requires_grad


# for various boundary conditions
@wp.struct
class Dirichlet_collider:
    """Parameters of a Dirichlet (velocity-prescribing) collider; which
    fields are meaningful depends on ``surface_type``."""

    point: wp.vec3
    normal: wp.vec3
    direction: wp.vec3

    start_time: float
    end_time: float

    friction: float
    surface_type: int
    velocity: wp.vec3
    threshold: float
    reset: int
    index: int

    x_unit: wp.vec3
    y_unit: wp.vec3
    radius: float
    v_scale: float
    width: float
    height: float
    length: float
    R: float

    size: wp.vec3

    horizontal_axis_1: wp.vec3
    horizontal_axis_2: wp.vec3
    half_height_and_radius: wp.vec2


@wp.struct
class Impulse_modifier:
    """Applies an external impulse inside a region for a time window."""

    # this needs to be changed for each different BC!
    point: wp.vec3
    normal: wp.vec3
    start_time: float
    end_time: float
    force: wp.vec3
    forceTimesDt: wp.vec3
    numsteps: int
    size: wp.vec3
    mask: wp.array(dtype=int)
@wp.struct
class MPMtailoredStruct:
    """Scene-specific boundary-condition parameters."""

    # this needs to be changed for each different BC!
    point: wp.vec3
    normal: wp.vec3
    start_time: float
    end_time: float
    friction: float
    surface_type: int
    velocity: wp.vec3
    threshold: float
    reset: int

    point_rotate: wp.vec3
    normal_rotate: wp.vec3
    x_unit: wp.vec3
    y_unit: wp.vec3
    radius: float
    v_scale: float
    width: float
    point_plane: wp.vec3
    normal_plane: wp.vec3
    velocity_plane: wp.vec3
    threshold_plane: float


@wp.struct
class MaterialParamsModifier:
    """Overrides material parameters inside an axis-aligned box region."""

    point: wp.vec3
    size: wp.vec3
    E: float
    nu: float
    density: float


@wp.struct
class ParticleVelocityModifier:
    """Prescribes particle velocities inside a region for a time window."""

    point: wp.vec3
    normal: wp.vec3
    half_height_and_radius: wp.vec2
    rotation_scale: float
    translation_scale: float

    size: wp.vec3

    horizontal_axis_1: wp.vec3
    horizontal_axis_2: wp.vec3

    start_time: float

    end_time: float

    velocity: wp.vec3

    mask: wp.array(dtype=int)


@wp.kernel
def compute_mu_lam_from_E_nu_clean(
    mu: wp.array(dtype=float),
    lam: wp.array(dtype=float),
    E: wp.array(dtype=float),
    nu: wp.array(dtype=float),
):
    # standard Lamé parameter conversion from Young's modulus / Poisson ratio
    p = wp.tid()
    mu[p] = E[p] / (2.0 * (1.0 + nu[p]))
    lam[p] = E[p] * nu[p] / ((1.0 + nu[p]) * (1.0 - 2.0 * nu[p]))


@wp.kernel
def set_vec3_to_zero(target_array: wp.array(dtype=wp.vec3)):
    # zero every vec3 element
    tid = wp.tid()
    target_array[tid] = wp.vec3(0.0, 0.0, 0.0)


@wp.kernel
def set_mat33_to_identity(target_array: wp.array(dtype=wp.mat33)):
    # write the 3x3 identity into every element
    tid = wp.tid()
    target_array[tid] = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)


@wp.kernel
def set_mat33_to_zero(target_array: wp.array(dtype=wp.mat33)):
    # zero every 3x3 matrix element
    tid = wp.tid()
    target_array[tid] = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)


@wp.kernel
def add_identity_to_mat33(target_array: wp.array(dtype=wp.mat33)):
    # in-place: M += I
    tid = wp.tid()
    target_array[tid] = wp.add(
        target_array[tid], wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
    )


@wp.kernel
def subtract_identity_to_mat33(target_array: wp.array(dtype=wp.mat33)):
    # in-place: M -= I
    tid = wp.tid()
    target_array[tid] = wp.sub(
        target_array[tid], wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
    )


@wp.kernel
def add_vec3_to_vec3(
    first_array: wp.array(dtype=wp.vec3), second_array: wp.array(dtype=wp.vec3)
):
    # in-place elementwise: first += second
    tid = wp.tid()
    first_array[tid] = wp.add(first_array[tid], second_array[tid])


@wp.kernel
def set_value_to_float_array(target_array: wp.array(dtype=float), value: float):
    # broadcast a scalar into the array
    tid = wp.tid()
    target_array[tid] = value


@wp.kernel
def set_warpvalue_to_float_array(
    target_array: wp.array(dtype=float), value: warp.types.float32
):
    # same as set_value_to_float_array but takes a warp float32 scalar
    tid = wp.tid()
    target_array[tid] = value


@wp.kernel
def get_float_array_product(
    arrayA: wp.array(dtype=float),
    arrayB: wp.array(dtype=float),
    arrayC: wp.array(dtype=float),
):
    # elementwise product: C = A * B
    tid = wp.tid()
    arrayC[tid] = arrayA[tid] * arrayB[tid]


def torch2warp_quat(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"):
    """Alias a contiguous (n, 4) torch tensor as a warp quat array.

    Zero-copy: the warp array points at the tensor's storage and keeps a
    reference to it via ``a.tensor``. NOTE(review): ``copy``/``dtype`` args
    are accepted but unused.
    """
    assert t.is_contiguous()
    if t.dtype != torch.float32 and t.dtype != torch.int32:
        raise RuntimeError(
            "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type"
        )
    assert t.shape[1] == 4
    a = warp.types.array(
        ptr=t.data_ptr(),
        dtype=wp.quat,
        shape=t.shape[0],
        copy=False,
        owner=False,
        requires_grad=t.requires_grad,
        # device=t.device.type)
        device=dvc,
    )
    # keep the torch tensor alive for as long as the warp array exists
    a.tensor = t
    return a


def torch2warp_float(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"):
    """Alias a contiguous (n,) torch float tensor as a warp float array.

    Zero-copy; see torch2warp_quat for caveats.
    """
    assert t.is_contiguous()
    if t.dtype != torch.float32 and t.dtype != torch.int32:
        raise RuntimeError(
            "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type"
        )
    a = warp.types.array(
        ptr=t.data_ptr(),
        dtype=warp.types.float32,
        shape=t.shape[0],
        copy=False,
        owner=False,
        requires_grad=t.requires_grad,
        # device=t.device.type)
        device=dvc,
    )
    # keep the torch tensor alive for as long as the warp array exists
    a.tensor = t
    return a
def torch2warp_vec3(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"):
    """Alias a contiguous (n, 3) torch tensor as a warp vec3 array.

    Zero-copy: the warp array points at the tensor's storage and keeps a
    reference to it via ``a.tensor``. NOTE(review): ``copy``/``dtype`` args
    are accepted but unused.
    """
    assert t.is_contiguous()
    if t.dtype != torch.float32 and t.dtype != torch.int32:
        raise RuntimeError(
            "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type"
        )
    assert t.shape[1] == 3
    a = warp.types.array(
        ptr=t.data_ptr(),
        dtype=wp.vec3,
        shape=t.shape[0],
        copy=False,
        owner=False,
        requires_grad=t.requires_grad,
        # device=t.device.type)
        device=dvc,
    )
    # keep the torch tensor alive for as long as the warp array exists
    a.tensor = t
    return a


def torch2warp_mat33(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"):
    """Alias a contiguous (n, 3, 3) torch tensor as a warp mat33 array.

    Zero-copy; see torch2warp_vec3 for caveats.
    """
    assert t.is_contiguous()
    if t.dtype != torch.float32 and t.dtype != torch.int32:
        raise RuntimeError(
            "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type"
        )
    assert t.shape[1] == 3
    a = warp.types.array(
        ptr=t.data_ptr(),
        dtype=wp.mat33,
        shape=t.shape[0],
        copy=False,
        owner=False,
        requires_grad=t.requires_grad,
        # device=t.device.type)
        device=dvc,
    )
    # keep the torch tensor alive for as long as the warp array exists
    a.tensor = t
    return a


def save_data_at_frame(mpm_solver, dir_name, frame, save_to_ply=True, save_to_h5=False):
    """Dump the solver's particle state at ``frame`` as .ply and/or .h5.

    The .h5 datasets are transposed to (dim, n_particles): positions "x",
    time "time", deformation gradients "f_tensor", velocities "v" and the
    affine field "C".
    """
    os.umask(0)
    os.makedirs(dir_name, 0o777, exist_ok=True)
    fullfilename = dir_name + '/sim_' + str(frame).zfill(10) + '.h5'
    if save_to_ply:
        # swap the ".h5" suffix for ".ply"
        particle_position_to_ply(mpm_solver, fullfilename[:-2]+'ply')

    if save_to_h5:
        if os.path.exists(fullfilename):
            os.remove(fullfilename)
        newFile = h5py.File(fullfilename, "w")

        x_np = mpm_solver.mpm_state.particle_x.numpy().transpose()  # x_np has shape (3, n_particles)
        newFile.create_dataset("x", data=x_np)  # position

        currentTime = np.array([mpm_solver.time]).reshape(1, 1)
        newFile.create_dataset("time", data=currentTime)  # current time

        f_tensor_np = mpm_solver.mpm_state.particle_F.numpy().reshape(-1, 9).transpose()  # shape = (9, n_particles)
        newFile.create_dataset("f_tensor", data=f_tensor_np)  # deformation grad

        v_np = mpm_solver.mpm_state.particle_v.numpy().transpose()  # v_np has shape (3, n_particles)
        newFile.create_dataset("v", data=v_np)  # particle velocity

        C_np = mpm_solver.mpm_state.particle_C.numpy().reshape(-1, 9).transpose()  # shape = (9, n_particles)
        newFile.create_dataset("C", data=C_np)  # particle C

        print("save siumlation data at frame ", frame, " to ", fullfilename)
= mpm_solver.mpm_state.particle_C.numpy().reshape(-1,9).transpose() # shape = (9, n_particles) newFile.create_dataset("C", data=C_np) # particle C print("save siumlation data at frame ", frame, " to ", fullfilename) def particle_position_to_ply(mpm_solver, filename): # position is (n,3) if os.path.exists(filename): os.remove(filename) position = mpm_solver.mpm_state.particle_x.numpy() num_particles = (position).shape[0] position = position.astype(np.float32) with open(filename, 'wb') as f: # write binary header = f"""ply format binary_little_endian 1.0 element vertex {num_particles} property float x property float y property float z end_header """ f.write(str.encode(header)) f.write(position.tobytes()) print("write", filename) def particle_position_tensor_to_ply(position_tensor, filename): # position is (n,3) if os.path.exists(filename): os.remove(filename) position = position_tensor.clone().detach().cpu().numpy() num_particles = (position).shape[0] position = position.astype(np.float32) with open(filename, 'wb') as f: # write binary header = f"""ply format binary_little_endian 1.0 element vertex {num_particles} property float x property float y property float z end_header """ f.write(str.encode(header)) f.write(position.tobytes()) print("write", filename) ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/grad_test.py ================================================ import warp as wp import numpy as np import torch import os from mpm_solver_warp_diff import MPM_Simulator_WARPDiff from run_gaussian_static import load_gaussians, get_volume from tqdm import tqdm from fire import Fire from diff_warp_utils import MPMStateStruct, MPMModelStruct from warp_rewrite import MyTape from mpm_utils import * def test(input_dir, output_dir=None, fps=6, device=0): wp.init() wp.config.verify_cuda = True device = "cuda:{}".format(device) gaussian_dict, scale, shift = load_gaussians(input_dir) velocity_scaling = 0.5 
init_velocity = velocity_scaling * gaussian_dict["velocity"] init_position = gaussian_dict["position"] init_cov = gaussian_dict["cov"] volume_array_path = os.path.join(input_dir, "volume_array.npy") if os.path.exists(volume_array_path): volume_array = np.load(volume_array_path) volume_tensor = torch.from_numpy(volume_array).float().to(device) else: volume_array = get_volume(init_position) np.save(volume_array_path, volume_array) volume_tensor = torch.from_numpy(volume_array).float().to(device) tensor_init_pos = torch.from_numpy(init_position).float().to(device) tensor_init_cov = torch.from_numpy(init_cov).float().to(device) tensor_init_velocity = torch.from_numpy(init_velocity).float().to(device) # set boundary conditions static_center_point = ( torch.from_numpy(gaussian_dict["satic_center_point"]).float().to(device) ) max_static_offset = ( torch.from_numpy(gaussian_dict["max_static_offset"]).float().to(device) ) velocity = torch.zeros_like(static_center_point) # mpm_solver.enforce_particle_velocity_translation(static_center_point, max_static_offset, velocity, # start_time=0, end_time=1000, device=device) material_params = { "E": 2.0, # 0.1-200 MPa "nu": 0.1, # > 0.35 "material": "jelly", # "material": "metal", # "friction_angle": 25, "g": [0.0, 0.0, 0], "density": 0.02, # kg / m^3 } n_particles = tensor_init_pos.shape[0] mpm_state = MPMStateStruct() mpm_state.init(init_position.shape[0], device=device, requires_grad=True) mpm_state.from_torch( tensor_init_pos, volume_tensor, tensor_init_cov, tensor_init_velocity, device=device, requires_grad=True, n_grid=100, grid_lim=1.0, ) mpm_state.set_require_grad(True) next_mpm_state = MPMStateStruct() next_mpm_state.init(init_position.shape[0], device=device, requires_grad=True) next_mpm_state.from_torch( tensor_init_pos.clone(), volume_tensor.clone(), tensor_init_cov.clone(), tensor_init_velocity.clone(), device=device, requires_grad=True, n_grid=100, grid_lim=1.0, ) next_mpm_state.set_require_grad(True) # 
mpm_state.grid_v_out = wp.from_numpy( # np.ones((100, 100, 100, 3)), dtype=wp.vec3, requires_grad=True, device=device # ) # tensor_init_pos.requires_grad = True # tensor_init_cov.requires_grad = False # tensor_init_velocity.requires_grad = True # mpm_state.particle_x = wp.from_torch(tensor_init_pos, requires_grad=True) # mpm_state.particle_x = wp.from_numpy(init_position, dtype=wp.vec3, requires_grad=True, device=device) # mpm_state.particle_v = wp.from_numpy(init_velocity, dtype=wp.vec3, requires_grad=True, device=device) # mpm_state.particle_vol = wp.from_numpy(volume_array, dtype=float, requires_grad=False, device=device) mpm_model = MPMModelStruct() mpm_model.init(n_particles, device=device, requires_grad=True) mpm_model.init_other_params(n_grid=100, grid_lim=1.0, device=device) E_tensor = (torch.ones(velocity.shape[0]) * 2.0).contiguous().to(device) nu_tensor = (torch.ones(velocity.shape[0]) * 0.1).contiguous().to(device) # E_warp = wp.from_torch(E_tensor, requires_grad=True) # nu_warp = wp.from_torch(nu_tensor, requires_grad=True) mpm_model.from_torch(E_tensor, nu_tensor, device=device, requires_grad=True) total_time = 0.1 time_step = 0.01 total_iters = int(total_time / time_step) total_iters = 3 loss = torch.zeros(1, device=device) loss = wp.from_torch(loss, requires_grad=True) dt = time_step tape = MyTape() # wp.Tape() with tape: # for k in tqdm(range(1, total_iters)): k = 1 # mpm_solver.p2g2p(k, time_step, device=device) for i in range(3): wp.launch( kernel=compute_stress_from_F_trial, dim=n_particles, inputs=[mpm_state, mpm_model, dt], device=device, ) wp.launch( kernel=p2g_apic_with_stress, dim=n_particles, inputs=[mpm_state, mpm_model, dt], device=device, ) # apply p2g' wp.launch( kernel=grid_normalization_and_gravity, dim=(100), inputs=[mpm_state, mpm_model, dt], device=device, ) wp.launch( kernel=g2p_test, dim=n_particles, inputs=[mpm_state, mpm_model, dt], device=device, ) # x, v, C, F_trial are updated wp.launch( position_loss_kernel, 
dim=n_particles, inputs=[mpm_state, loss], device=device, ) print(loss, "pre backward") tape.backward(loss) # 75120.86 print(loss) v_grad = mpm_state.particle_v.grad x_grad = mpm_state.particle_x.grad grid_v_grad = mpm_state.grid_v_out.grad grid_v_in_grad = mpm_state.grid_v_in.grad print(x_grad) from IPython import embed embed() @wp.kernel def position_loss_kernel(mpm_state: MPMStateStruct, loss: wp.array(dtype=float)): tid = wp.tid() pos = mpm_state.particle_x[tid] wp.atomic_add(loss, 0, pos[0] + pos[1] + pos[2]) # wp.atomic_add(loss, 0, mpm_state.particle_x[tid][0]) @wp.kernel def g2p_test(state: MPMStateStruct, model: MPMModelStruct, dt: float): p = wp.tid() if state.particle_selection[p] == 0: grid_pos = state.particle_x[p] * model.inv_dx base_pos_x = wp.int(grid_pos[0] - 0.5) base_pos_y = wp.int(grid_pos[1] - 0.5) base_pos_z = wp.int(grid_pos[2] - 0.5) fx = grid_pos - wp.vec3( wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z) ) wa = wp.vec3(1.5) - fx wb = fx - wp.vec3(1.0) wc = fx - wp.vec3(0.5) w = wp.mat33( wp.cw_mul(wa, wa) * 0.5, wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75), wp.cw_mul(wc, wc) * 0.5, ) dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5)) # new_v = wp.vec3(0.0, 0.0, 0.0) # new_C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) new_v = wp.vec3(0.0) new_C = wp.mat33(new_v, new_v, new_v) new_F = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) for i in range(0, 3): for j in range(0, 3): for k in range(0, 3): ix = base_pos_x + i iy = base_pos_y + j iz = base_pos_z + k dpos = wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx weight = w[0, i] * w[1, j] * w[2, k] # tricubic interpolation grid_v = state.grid_v_out[ix, iy, iz] new_v = new_v + grid_v * weight new_C = new_C + wp.outer(grid_v, dpos) * ( weight * model.inv_dx * 4.0 ) dweight = compute_dweight(model, w, dw, i, j, k) new_F = new_F + wp.outer(grid_v, dweight) state.particle_v[p] = new_v # wp.atomic_add(state.particle_x, 
p, dt * state.particle_v[p]) wp.atomic_add(state.particle_x, p, dt * new_v) # might add clip here https://github.com/PingchuanMa/NCLaw/blob/main/nclaw/sim/mpm.py state.particle_C[p] = new_C I33 = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) F_tmp = (I33 + new_F * dt) * state.particle_F[p] state.particle_F_trial[p] = F_tmp # next_state.particle_v[p] = new_v # next_state.particle_C[p] = new_C # next_state.particle_F_trial[p] = F_tmp # wp.atomic_add(next_state.particle_x, p, dt * new_v) if model.update_cov_with_F: pass # update_cov(next_state, p, new_F, dt) if __name__ == "__main__": Fire(test) ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/mpm_solver_warp.py ================================================ import sys import os import warp as wp sys.path.append(os.path.dirname(os.path.realpath(__file__))) from engine_utils import * from warp_utils import * from mpm_utils import * class MPM_Simulator_WARP: def __init__(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"): self.initialize(n_particles, n_grid, grid_lim, device=device) self.time_profile = {} def initialize(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"): self.n_particles = n_particles self.mpm_model = MPMModelStruct() # domain will be [0,grid_lim]*[0,grid_lim]*[0,grid_lim] !!! # domain will be [0,grid_lim]*[0,grid_lim]*[0,grid_lim] !!! # domain will be [0,grid_lim]*[0,grid_lim]*[0,grid_lim] !!! 
self.mpm_model.grid_lim = grid_lim self.mpm_model.n_grid = n_grid self.mpm_model.grid_dim_x = self.mpm_model.n_grid self.mpm_model.grid_dim_y = self.mpm_model.n_grid self.mpm_model.grid_dim_z = self.mpm_model.n_grid ( self.mpm_model.dx, self.mpm_model.inv_dx, ) = self.mpm_model.grid_lim / self.mpm_model.n_grid, float( self.mpm_model.n_grid / self.mpm_model.grid_lim ) self.mpm_model.E = wp.zeros(shape=n_particles, dtype=float, device=device) self.mpm_model.nu = wp.zeros(shape=n_particles, dtype=float, device=device) self.mpm_model.mu = wp.zeros(shape=n_particles, dtype=float, device=device) self.mpm_model.lam = wp.zeros(shape=n_particles, dtype=float, device=device) self.mpm_model.update_cov_with_F = False # material is used to switch between different elastoplastic models. 0 is jelly self.mpm_model.material = 0 self.mpm_model.plastic_viscosity = 0.0 self.mpm_model.softening = 0.1 self.mpm_model.yield_stress = wp.zeros( shape=n_particles, dtype=float, device=device ) self.mpm_model.friction_angle = 25.0 sin_phi = wp.sin(self.mpm_model.friction_angle / 180.0 * 3.14159265) self.mpm_model.alpha = wp.sqrt(2.0 / 3.0) * 2.0 * sin_phi / (3.0 - sin_phi) self.mpm_model.gravitational_accelaration = wp.vec3(0.0, 0.0, 0.0) self.mpm_model.rpic_damping = 0.0 # 0.0 if no damping (apic). 
-1 if pic self.mpm_model.grid_v_damping_scale = 1.1 # globally applied self.mpm_state = MPMStateStruct() self.mpm_state.particle_x = wp.empty( shape=n_particles, dtype=wp.vec3, device=device ) # current position self.mpm_state.particle_v = wp.zeros( shape=n_particles, dtype=wp.vec3, device=device ) # particle velocity self.mpm_state.particle_F = wp.zeros( shape=n_particles, dtype=wp.mat33, device=device ) # particle F elastic self.mpm_state.particle_R = wp.zeros( shape=n_particles, dtype=wp.mat33, device=device ) # particle R rotation self.mpm_state.particle_init_cov = wp.zeros( shape=n_particles * 6, dtype=float, device=device ) # initial covariance matrix self.mpm_state.particle_cov = wp.zeros( shape=n_particles * 6, dtype=float, device=device ) # current covariance matrix self.mpm_state.particle_F_trial = wp.zeros( shape=n_particles, dtype=wp.mat33, device=device ) # apply return mapping will yield self.mpm_state.particle_stress = wp.zeros( shape=n_particles, dtype=wp.mat33, device=device ) self.mpm_state.particle_vol = wp.zeros( shape=n_particles, dtype=float, device=device ) # particle volume self.mpm_state.particle_mass = wp.zeros( shape=n_particles, dtype=float, device=device ) # particle mass self.mpm_state.particle_density = wp.zeros( shape=n_particles, dtype=float, device=device ) self.mpm_state.particle_C = wp.zeros( shape=n_particles, dtype=wp.mat33, device=device ) self.mpm_state.particle_Jp = wp.zeros( shape=n_particles, dtype=float, device=device ) self.mpm_state.particle_selection = wp.zeros( shape=n_particles, dtype=int, device=device ) self.mpm_state.grid_m = wp.zeros( shape=(self.mpm_model.n_grid, self.mpm_model.n_grid, self.mpm_model.n_grid), dtype=float, device=device, ) self.mpm_state.grid_v_in = wp.zeros( shape=(self.mpm_model.n_grid, self.mpm_model.n_grid, self.mpm_model.n_grid), dtype=wp.vec3, device=device, ) self.mpm_state.grid_v_out = wp.zeros( shape=(self.mpm_model.n_grid, self.mpm_model.n_grid, self.mpm_model.n_grid), dtype=wp.vec3, 
device=device, ) self.time = 0.0 self.grid_postprocess = [] self.collider_params = [] self.modify_bc = [] self.tailored_struct_for_bc = MPMtailoredStruct() self.pre_p2g_operations = [] self.impulse_params = [] self.particle_velocity_modifiers = [] self.particle_velocity_modifier_params = [] # the h5 file should store particle initial position and volume. def load_from_sampling( self, sampling_h5, n_grid=100, grid_lim=1.0, device="cuda:0" ): if not os.path.exists(sampling_h5): print("h5 file cannot be found at ", os.getcwd() + sampling_h5) exit() h5file = h5py.File(sampling_h5, "r") x, particle_volume = h5file["x"], h5file["particle_volume"] x = x[()].transpose() # np vector of x # shape now is (n_particles, dim) self.dim, self.n_particles = x.shape[1], x.shape[0] self.initialize(self.n_particles, n_grid, grid_lim, device=device) print( "Sampling particles are loaded from h5 file. Simulator is re-initialized for the correct n_particles" ) particle_volume = np.squeeze(particle_volume, 0) self.mpm_state.particle_x = wp.from_numpy( x, dtype=wp.vec3, device=device ) # initialize warp array from np # initial velocity is default to zero wp.launch( kernel=set_vec3_to_zero, dim=self.n_particles, inputs=[self.mpm_state.particle_v], device=device, ) # initial velocity is default to zero # initial deformation gradient is set to identity wp.launch( kernel=set_mat33_to_identity, dim=self.n_particles, inputs=[self.mpm_state.particle_F_trial], device=device, ) # initial deformation gradient is set to identity self.mpm_state.particle_vol = wp.from_numpy( particle_volume, dtype=float, device=device ) print("Particles initialized from sampling file.") print("Total particles: ", self.n_particles) # shape of tensor_x is (n, 3); shape of tensor_volume is (n,) def load_initial_data_from_torch( self, tensor_x, tensor_volume, tensor_cov=None, tensor_velocity=None, n_grid=100, grid_lim=1.0, device="cuda:0", ): self.dim, self.n_particles = tensor_x.shape[1], tensor_x.shape[0] assert 
tensor_x.shape[0] == tensor_volume.shape[0] # assert tensor_x.shape[0] == tensor_cov.reshape(-1, 6).shape[0] self.initialize(self.n_particles, n_grid, grid_lim, device=device) self.import_particle_x_from_torch(tensor_x, device) self.mpm_state.particle_vol = wp.from_numpy( tensor_volume.detach().clone().cpu().numpy(), dtype=float, device=device ) if tensor_cov is not None: self.mpm_state.particle_init_cov = wp.from_numpy( tensor_cov.reshape(-1).detach().clone().cpu().numpy(), dtype=float, device=device, ) if self.mpm_model.update_cov_with_F: self.mpm_state.particle_cov = self.mpm_state.particle_init_cov # initial velocity is default to zero wp.launch( kernel=set_vec3_to_zero, dim=self.n_particles, inputs=[self.mpm_state.particle_v], device=device, ) if tensor_velocity is not None: warp_velocity = torch2warp_vec3( tensor_velocity.detach().clone(), dvc=device ) self.mpm_state.particle_v = warp_velocity # initial deformation gradient is set to identity wp.launch( kernel=set_mat33_to_identity, dim=self.n_particles, inputs=[self.mpm_state.particle_F_trial], device=device, ) # initial trial deformation gradient is set to identity print("Particles initialized from torch data.") print("Total particles: ", self.n_particles) # must give density. 
mass will be updated as density * volume def set_parameters(self, device="cuda:0", **kwargs): self.set_parameters_dict(device, kwargs) def set_parameters_dict(self, kwargs={}, device="cuda:0"): if "material" in kwargs: if kwargs["material"] == "jelly": self.mpm_model.material = 0 elif kwargs["material"] == "metal": self.mpm_model.material = 1 elif kwargs["material"] == "sand": self.mpm_model.material = 2 elif kwargs["material"] == "foam": self.mpm_model.material = 3 elif kwargs["material"] == "snow": self.mpm_model.material = 4 elif kwargs["material"] == "plasticine": self.mpm_model.material = 5 else: raise TypeError("Undefined material type") if "grid_lim" in kwargs: self.mpm_model.grid_lim = kwargs["grid_lim"] if "n_grid" in kwargs: self.mpm_model.n_grid = kwargs["n_grid"] self.mpm_model.grid_dim_x = self.mpm_model.n_grid self.mpm_model.grid_dim_y = self.mpm_model.n_grid self.mpm_model.grid_dim_z = self.mpm_model.n_grid ( self.mpm_model.dx, self.mpm_model.inv_dx, ) = self.mpm_model.grid_lim / self.mpm_model.n_grid, float( self.mpm_model.n_grid / self.mpm_model.grid_lim ) self.mpm_state.grid_m = wp.zeros( shape=(self.mpm_model.n_grid, self.mpm_model.n_grid, self.mpm_model.n_grid), dtype=float, device=device, ) self.mpm_state.grid_v_in = wp.zeros( shape=(self.mpm_model.n_grid, self.mpm_model.n_grid, self.mpm_model.n_grid), dtype=wp.vec3, device=device, ) self.mpm_state.grid_v_out = wp.zeros( shape=(self.mpm_model.n_grid, self.mpm_model.n_grid, self.mpm_model.n_grid), dtype=wp.vec3, device=device, ) if "E" in kwargs: wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[self.mpm_model.E, kwargs["E"]], device=device, ) if "nu" in kwargs: wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[self.mpm_model.nu, kwargs["nu"]], device=device, ) if "yield_stress" in kwargs: val = kwargs["yield_stress"] wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[self.mpm_model.yield_stress, val], device=device, ) if 
"hardening" in kwargs: self.mpm_model.hardening = kwargs["hardening"] if "xi" in kwargs: self.mpm_model.xi = kwargs["xi"] if "friction_angle" in kwargs: self.mpm_model.friction_angle = kwargs["friction_angle"] sin_phi = wp.sin(self.mpm_model.friction_angle / 180.0 * 3.14159265) self.mpm_model.alpha = wp.sqrt(2.0 / 3.0) * 2.0 * sin_phi / (3.0 - sin_phi) if "g" in kwargs: self.mpm_model.gravitational_accelaration = wp.vec3( kwargs["g"][0], kwargs["g"][1], kwargs["g"][2] ) if "density" in kwargs: density_value = kwargs["density"] wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[self.mpm_state.particle_density, density_value], device=device, ) wp.launch( kernel=get_float_array_product, dim=self.n_particles, inputs=[ self.mpm_state.particle_density, self.mpm_state.particle_vol, self.mpm_state.particle_mass, ], device=device, ) if "rpic_damping" in kwargs: self.mpm_model.rpic_damping = kwargs["rpic_damping"] if "plastic_viscosity" in kwargs: self.mpm_model.plastic_viscosity = kwargs["plastic_viscosity"] if "softening" in kwargs: self.mpm_model.softening = kwargs["softening"] if "grid_v_damping_scale" in kwargs: self.mpm_model.grid_v_damping_scale = kwargs["grid_v_damping_scale"] if "additional_material_params" in kwargs: for params in kwargs["additional_material_params"]: param_modifier = MaterialParamsModifier() param_modifier.point = wp.vec3(params["point"]) param_modifier.size = wp.vec3(params["size"]) param_modifier.density = params["density"] param_modifier.E = params["E"] param_modifier.nu = params["nu"] wp.launch( kernel=apply_additional_params, dim=self.n_particles, inputs=[self.mpm_state, self.mpm_model, param_modifier], device=device, ) wp.launch( kernel=get_float_array_product, dim=self.n_particles, inputs=[ self.mpm_state.particle_density, self.mpm_state.particle_vol, self.mpm_state.particle_mass, ], device=device, ) def finalize_mu_lam(self, device="cuda:0"): wp.launch( kernel=compute_mu_lam_from_E_nu, dim=self.n_particles, 
inputs=[self.mpm_state, self.mpm_model], device=device, ) def p2g2p(self, step, dt, device="cuda:0"): grid_size = ( self.mpm_model.grid_dim_x, self.mpm_model.grid_dim_y, self.mpm_model.grid_dim_z, ) wp.launch( kernel=zero_grid, dim=(grid_size), inputs=[self.mpm_state, self.mpm_model], device=device, ) # apply pre-p2g operations on particles for k in range(len(self.pre_p2g_operations)): wp.launch( kernel=self.pre_p2g_operations[k], dim=self.n_particles, inputs=[self.time, dt, self.mpm_state, self.impulse_params[k]], device=device, ) # apply dirichlet particle v modifier for k in range(len(self.particle_velocity_modifiers)): wp.launch( kernel=self.particle_velocity_modifiers[k], dim=self.n_particles, inputs=[ self.time, self.mpm_state, self.particle_velocity_modifier_params[k], ], device=device, ) # compute stress = stress(returnMap(F_trial)) with wp.ScopedTimer( "compute_stress_from_F_trial", synchronize=True, print=False, dict=self.time_profile, ): wp.launch( kernel=compute_stress_from_F_trial, dim=self.n_particles, inputs=[self.mpm_state, self.mpm_model, dt], device=device, ) # F and stress are updated # p2g with wp.ScopedTimer( "p2g", synchronize=True, print=False, dict=self.time_profile, ): wp.launch( kernel=p2g_apic_with_stress, dim=self.n_particles, inputs=[self.mpm_state, self.mpm_model, dt], device=device, ) # apply p2g' # grid update with wp.ScopedTimer( "grid_update", synchronize=True, print=False, dict=self.time_profile ): wp.launch( kernel=grid_normalization_and_gravity, dim=(grid_size), inputs=[self.mpm_state, self.mpm_model, dt], device=device, ) if self.mpm_model.grid_v_damping_scale < 1.0: wp.launch( kernel=add_damping_via_grid, dim=(grid_size), inputs=[self.mpm_state, self.mpm_model.grid_v_damping_scale], device=device, ) # apply BC on grid with wp.ScopedTimer( "apply_BC_on_grid", synchronize=True, print=False, dict=self.time_profile ): for k in range(len(self.grid_postprocess)): wp.launch( kernel=self.grid_postprocess[k], dim=grid_size, inputs=[ 
self.time, dt, self.mpm_state, self.mpm_model, self.collider_params[k], ], device=device, ) if self.modify_bc[k] is not None: self.modify_bc[k](self.time, dt, self.collider_params[k]) # g2p with wp.ScopedTimer( "g2p", synchronize=True, print=False, dict=self.time_profile ): wp.launch( kernel=g2p, dim=self.n_particles, inputs=[self.mpm_state, self.mpm_model, dt], device=device, ) # x, v, C, F_trial are updated #### CFL check #### # particle_v = self.mpm_state.particle_v.numpy() # if np.max(np.abs(particle_v)) > self.mpm_model.dx / dt: # print("max particle v: ", np.max(np.abs(particle_v))) # print("max allowed v: ", self.mpm_model.dx / dt) # print("does not allow v*dt>dx") # input() #### CFL check #### self.time = self.time + dt # set particle densities to all_particle_densities, def reset_densities_and_update_masses( self, all_particle_densities, device="cuda:0" ): all_particle_densities = all_particle_densities.clone().detach() self.mpm_state.particle_density = torch2warp_float( all_particle_densities, dvc=device ) wp.launch( kernel=get_float_array_product, dim=self.n_particles, inputs=[ self.mpm_state.particle_density, self.mpm_state.particle_vol, self.mpm_state.particle_mass, ], device=device, ) # clone = True makes a copy, not necessarily needed def import_particle_x_from_torch(self, tensor_x, clone=True, device="cuda:0"): if tensor_x is not None: if clone: tensor_x = tensor_x.clone().detach() self.mpm_state.particle_x = torch2warp_vec3(tensor_x, dvc=device) # clone = True makes a copy, not necessarily needed def import_particle_v_from_torch(self, tensor_v, clone=True, device="cuda:0"): if tensor_v is not None: if clone: tensor_v = tensor_v.clone().detach() self.mpm_state.particle_v = torch2warp_vec3(tensor_v, dvc=device) # clone = True makes a copy, not necessarily needed def import_particle_F_from_torch(self, tensor_F, clone=True, device="cuda:0"): if tensor_F is not None: if clone: tensor_F = tensor_F.clone().detach() tensor_F = torch.reshape(tensor_F, (-1, 
3, 3)) # arranged by rowmajor self.mpm_state.particle_F = torch2warp_mat33(tensor_F, dvc=device) # clone = True makes a copy, not necessarily needed def import_particle_C_from_torch(self, tensor_C, clone=True, device="cuda:0"): if tensor_C is not None: if clone: tensor_C = tensor_C.clone().detach() tensor_C = torch.reshape(tensor_C, (-1, 3, 3)) # arranged by rowmajor self.mpm_state.particle_C = torch2warp_mat33(tensor_C, dvc=device) def export_particle_x_to_torch(self): return wp.to_torch(self.mpm_state.particle_x) def export_particle_v_to_torch(self): return wp.to_torch(self.mpm_state.particle_v) def export_particle_F_to_torch(self): F_tensor = wp.to_torch(self.mpm_state.particle_F) F_tensor = F_tensor.reshape(-1, 9) return F_tensor def export_particle_R_to_torch(self, device="cuda:0"): with wp.ScopedTimer( "compute_R_from_F", synchronize=True, print=False, dict=self.time_profile, ): wp.launch( kernel=compute_R_from_F, dim=self.n_particles, inputs=[self.mpm_state, self.mpm_model], device=device, ) R_tensor = wp.to_torch(self.mpm_state.particle_R) R_tensor = R_tensor.reshape(-1, 9) return R_tensor def export_particle_C_to_torch(self): C_tensor = wp.to_torch(self.mpm_state.particle_C) C_tensor = C_tensor.reshape(-1, 9) return C_tensor def export_particle_cov_to_torch(self, device="cuda:0"): if not self.mpm_model.update_cov_with_F: with wp.ScopedTimer( "compute_cov_from_F", synchronize=True, print=False, dict=self.time_profile, ): wp.launch( kernel=compute_cov_from_F, dim=self.n_particles, inputs=[self.mpm_state, self.mpm_model], device=device, ) cov = wp.to_torch(self.mpm_state.particle_cov) return cov def print_time_profile(self): print("MPM Time profile:") for key, value in self.time_profile.items(): print(key, sum(value)) # a surface specified by a point and the normal vector def add_surface_collider( self, point, normal, surface="sticky", friction=0.0, start_time=0.0, end_time=999.0, ): point = list(point) # Normalize normal normal_scale = 1.0 / 
wp.sqrt(float(sum(x**2 for x in normal))) normal = list(normal_scale * x for x in normal) collider_param = Dirichlet_collider() collider_param.start_time = start_time collider_param.end_time = end_time collider_param.point = wp.vec3(point[0], point[1], point[2]) collider_param.normal = wp.vec3(normal[0], normal[1], normal[2]) if surface == "sticky" and friction != 0: raise ValueError("friction must be 0 on sticky surfaces.") if surface == "sticky": collider_param.surface_type = 0 elif surface == "slip": collider_param.surface_type = 1 elif surface == "cut": collider_param.surface_type = 11 else: collider_param.surface_type = 2 # frictional collider_param.friction = friction self.collider_params.append(collider_param) @wp.kernel def collide( time: float, dt: float, state: MPMStateStruct, model: MPMModelStruct, param: Dirichlet_collider, ): grid_x, grid_y, grid_z = wp.tid() if time >= param.start_time and time < param.end_time: offset = wp.vec3( float(grid_x) * model.dx - param.point[0], float(grid_y) * model.dx - param.point[1], float(grid_z) * model.dx - param.point[2], ) n = wp.vec3(param.normal[0], param.normal[1], param.normal[2]) dotproduct = wp.dot(offset, n) if dotproduct < 0.0: if param.surface_type == 0: state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( 0.0, 0.0, 0.0 ) elif param.surface_type == 11: if ( float(grid_z) * model.dx < 0.4 or float(grid_z) * model.dx > 0.53 ): state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( 0.0, 0.0, 0.0 ) else: v_in = state.grid_v_out[grid_x, grid_y, grid_z] state.grid_v_out[grid_x, grid_y, grid_z] = ( wp.vec3(v_in[0], 0.0, v_in[2]) * 0.3 ) else: v = state.grid_v_out[grid_x, grid_y, grid_z] normal_component = wp.dot(v, n) if param.surface_type == 1: v = ( v - normal_component * n ) # Project out all normal component else: v = ( v - wp.min(normal_component, 0.0) * n ) # Project out only inward normal component if normal_component < 0.0 and wp.length(v) > 1e-20: v = wp.max( 0.0, wp.length(v) + normal_component * 
param.friction ) * wp.normalize( v ) # apply friction here state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( 0.0, 0.0, 0.0 ) self.grid_postprocess.append(collide) self.modify_bc.append(None) # a cubiod is a rectangular cube' # centered at `point` # dimension is x: point[0]±size[0] # y: point[1]±size[1] # z: point[2]±size[2] # all grid nodes lie within the cubiod will have their speed set to velocity # the cuboid itself is also moving with const speed = velocity # set the speed to zero to fix BC def set_velocity_on_cuboid( self, point, size, velocity, start_time=0.0, end_time=999.0, reset=0, ): point = list(point) collider_param = Dirichlet_collider() collider_param.start_time = start_time collider_param.end_time = end_time collider_param.point = wp.vec3(point[0], point[1], point[2]) collider_param.size = size collider_param.velocity = wp.vec3(velocity[0], velocity[1], velocity[2]) # collider_param.threshold = threshold collider_param.reset = reset self.collider_params.append(collider_param) @wp.kernel def collide( time: float, dt: float, state: MPMStateStruct, model: MPMModelStruct, param: Dirichlet_collider, ): grid_x, grid_y, grid_z = wp.tid() if time >= param.start_time and time < param.end_time: offset = wp.vec3( float(grid_x) * model.dx - param.point[0], float(grid_y) * model.dx - param.point[1], float(grid_z) * model.dx - param.point[2], ) if ( wp.abs(offset[0]) < param.size[0] and wp.abs(offset[1]) < param.size[1] and wp.abs(offset[2]) < param.size[2] ): state.grid_v_out[grid_x, grid_y, grid_z] = param.velocity elif param.reset == 1: if time < param.end_time + 15.0 * dt: state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0) def modify(time, dt, param: Dirichlet_collider): if time >= param.start_time and time < param.end_time: param.point = wp.vec3( param.point[0] + dt * param.velocity[0], param.point[1] + dt * param.velocity[1], param.point[2] + dt * param.velocity[2], ) # param.point + dt * param.velocity self.grid_postprocess.append(collide) 
        self.modify_bc.append(modify)

    def add_bounding_box(self, start_time=0.0, end_time=999.0):
        # Keep material inside the simulation domain: within a 3-cell band on
        # each of the six grid faces, zero the velocity component that points
        # out of the domain. Active while start_time <= t < end_time.
        collider_param = Dirichlet_collider()
        collider_param.start_time = start_time
        collider_param.end_time = end_time
        self.collider_params.append(collider_param)

        @wp.kernel
        def collide(
            time: float,
            dt: float,
            state: MPMStateStruct,
            model: MPMModelStruct,
            param: Dirichlet_collider,
        ):
            grid_x, grid_y, grid_z = wp.tid()
            padding = 3  # thickness (in cells) of the clamped boundary band
            if time >= param.start_time and time < param.end_time:
                # -x face: cancel velocity pointing out of the domain
                if grid_x < padding and state.grid_v_out[grid_x, grid_y, grid_z][0] < 0:
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                # +x face
                if (
                    grid_x >= model.grid_dim_x - padding
                    and state.grid_v_out[grid_x, grid_y, grid_z][0] > 0
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                # -y face
                if grid_y < padding and state.grid_v_out[grid_x, grid_y, grid_z][1] < 0:
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                # +y face
                if (
                    grid_y >= model.grid_dim_y - padding
                    and state.grid_v_out[grid_x, grid_y, grid_z][1] > 0
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                # -z face
                if grid_z < padding and state.grid_v_out[grid_x, grid_y, grid_z][2] < 0:
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        0.0,
                    )
                # +z face
                if (
                    grid_z >= model.grid_dim_z - padding
                    and state.grid_v_out[grid_x, grid_y, grid_z][2] > 0
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        0.0,
                    )

        self.grid_postprocess.append(collide)
        self.modify_bc.append(None)  # this collider has no host-side state to advance

    # particle_v += force/particle_mass * dt
    # this is applied from start_dt, ends after num_dt p2g2p's
    # particle velocity is changed before p2g at each timestep
    def add_impulse_on_particles(
        self,
        force,
        dt,
        point=[1, 1, 1],
        size=[1, 1, 1],
        num_dt=1,
        start_time=0.0,
        device="cuda:0",
    ):
        # NOTE(review): mutable list defaults for point/size are shared across
        # calls; they are only read here, but verify they are never mutated.
        impulse_param = Impulse_modifier()
        impulse_param.start_time = start_time
        impulse_param.end_time = start_time + dt * num_dt
        impulse_param.point = wp.vec3(point[0], point[1], point[2])
        impulse_param.size = wp.vec3(size[0], size[1], size[2])
        impulse_param.mask = wp.zeros(shape=self.n_particles, dtype=int, device=device)
        impulse_param.force = wp.vec3(
            force[0],
            force[1],
            force[2],
        )
        # select the particles that receive the impulse into impulse_param.mask
        wp.launch(
            kernel=selection_add_impulse_on_particles,
            dim=self.n_particles,
            inputs=[self.mpm_state, impulse_param],
            device=device,
        )
        self.impulse_params.append(impulse_param)

        @wp.kernel
        def apply_force(
            time: float, dt: float, state: MPMStateStruct, param: Impulse_modifier
        ):
            p = wp.tid()
            if time >= param.start_time and time < param.end_time:
                if param.mask[p] == 1:
                    # acceleration = F / m, integrated over one substep
                    impulse = wp.vec3(
                        param.force[0] / state.particle_mass[p],
                        param.force[1] / state.particle_mass[p],
                        param.force[2] / state.particle_mass[p],
                    )
                    state.particle_v[p] = state.particle_v[p] + impulse * dt

        self.pre_p2g_operations.append(apply_force)

    def enforce_particle_velocity_translation(
        self, point, size, velocity, start_time, end_time, device="cuda:0"
    ):
        # Pin all particles inside the box [point ± size] to a constant
        # velocity while start_time <= t < end_time.
        # first select certain particles based on position
        velocity_modifier_params = ParticleVelocityModifier()
        velocity_modifier_params.point = wp.vec3(point[0], point[1], point[2])
        velocity_modifier_params.size = wp.vec3(size[0], size[1], size[2])
        velocity_modifier_params.velocity = wp.vec3(
            velocity[0], velocity[1], velocity[2]
        )
        velocity_modifier_params.start_time = start_time
        velocity_modifier_params.end_time = end_time
        velocity_modifier_params.mask = wp.zeros(
            shape=self.n_particles, dtype=int, device=device
        )
        wp.launch(
            kernel=selection_enforce_particle_velocity_translation,
            dim=self.n_particles,
            inputs=[self.mpm_state, velocity_modifier_params],
            device=device,
        )
        self.particle_velocity_modifier_params.append(velocity_modifier_params)

        @wp.kernel
        def modify_particle_v_before_p2g(
            time: float,
            state: MPMStateStruct,
            velocity_modifier_params: ParticleVelocityModifier,
        ):
            p = wp.tid()
            if (
                time >= velocity_modifier_params.start_time
                and time < velocity_modifier_params.end_time
            ):
                if velocity_modifier_params.mask[p] == 1:
                    state.particle_v[p] = velocity_modifier_params.velocity

        self.particle_velocity_modifiers.append(modify_particle_v_before_p2g)

    # define a cylinder with center point, half_height, radius, normal
    # particles within the cylinder are rotating along the normal direction
    # may also have a translational velocity along the normal direction
    def enforce_particle_velocity_rotation(
        self,
        point,
        normal,
        half_height_and_radius,
        rotation_scale,
        translation_scale,
        start_time,
        end_time,
        device="cuda:0",
    ):
        # normalize the cylinder axis direction
        normal_scale = 1.0 / wp.sqrt(
            float(normal[0] ** 2 + normal[1] ** 2 + normal[2] ** 2)
        )
        normal = list(normal_scale * x for x in normal)

        velocity_modifier_params = ParticleVelocityModifier()
        velocity_modifier_params.point = wp.vec3(point[0], point[1], point[2])
        velocity_modifier_params.half_height_and_radius = wp.vec2(
            half_height_and_radius[0], half_height_and_radius[1]
        )
        velocity_modifier_params.normal = wp.vec3(normal[0], normal[1], normal[2])
        # Build an in-plane orthonormal basis (horizontal_axis_1/2)
        # perpendicular to the axis via Gram-Schmidt projection.
        horizontal_1 = wp.vec3(1.0, 1.0, 1.0)
        # NOTE(review): this guard looks inverted — the fallback triggers when
        # (1,1,1) is already nearly perpendicular to the axis, while an axis
        # parallel to (1,1,1) would make the projection below degenerate.
        # Confirm the intended condition.
        if wp.abs(wp.dot(velocity_modifier_params.normal, horizontal_1)) < 0.01:
            horizontal_1 = wp.vec3(0.72, 0.37, -0.67)
        horizontal_1 = (
            horizontal_1
            - wp.dot(horizontal_1, velocity_modifier_params.normal)
            * velocity_modifier_params.normal
        )
        horizontal_1 = horizontal_1 * (1.0 / wp.length(horizontal_1))
        horizontal_2 = wp.cross(horizontal_1, velocity_modifier_params.normal)
        velocity_modifier_params.horizontal_axis_1 = horizontal_1
        velocity_modifier_params.horizontal_axis_2 = horizontal_2

        velocity_modifier_params.rotation_scale = rotation_scale
        velocity_modifier_params.translation_scale = translation_scale
velocity_modifier_params.start_time = start_time velocity_modifier_params.end_time = end_time velocity_modifier_params.mask = wp.zeros( shape=self.n_particles, dtype=int, device=device ) wp.launch( kernel=selection_enforce_particle_velocity_cylinder, dim=self.n_particles, inputs=[self.mpm_state, velocity_modifier_params], device=device, ) self.particle_velocity_modifier_params.append(velocity_modifier_params) @wp.kernel def modify_particle_v_before_p2g( time: float, state: MPMStateStruct, velocity_modifier_params: ParticleVelocityModifier, ): p = wp.tid() if ( time >= velocity_modifier_params.start_time and time < velocity_modifier_params.end_time ): if velocity_modifier_params.mask[p] == 1: offset = state.particle_x[p] - velocity_modifier_params.point horizontal_distance = wp.length( offset - wp.dot(offset, velocity_modifier_params.normal) * velocity_modifier_params.normal ) cosine = ( wp.dot(offset, velocity_modifier_params.horizontal_axis_1) / horizontal_distance ) theta = wp.acos(cosine) if wp.dot(offset, velocity_modifier_params.horizontal_axis_2) > 0: theta = theta else: theta = -theta axis1_scale = ( -horizontal_distance * wp.sin(theta) * velocity_modifier_params.rotation_scale ) axis2_scale = ( horizontal_distance * wp.cos(theta) * velocity_modifier_params.rotation_scale ) axis_vertical_scale = translation_scale state.particle_v[p] = ( axis1_scale * velocity_modifier_params.horizontal_axis_1 + axis2_scale * velocity_modifier_params.horizontal_axis_2 + axis_vertical_scale * velocity_modifier_params.normal ) self.particle_velocity_modifiers.append(modify_particle_v_before_p2g) # given normal direction, say [0,0,1] # gradually release grid velocities from start position to end position def release_particles_sequentially( self, normal, start_position, end_position, num_layers, start_time, end_time ): num_layers = 50 point = [0, 0, 0] size = [0, 0, 0] axis = -1 for i in range(3): if normal[i] == 0: point[i] = 1 size[i] = 1 else: axis = i point[i] = end_position 
half_length_portion = wp.abs(start_position - end_position) / num_layers end_time_portion = end_time / num_layers for i in range(num_layers): size[axis] = half_length_portion * (num_layers - i) self.enforce_particle_velocity_translation( point=point, size=size, velocity=[0, 0, 0], start_time=start_time, end_time=end_time_portion * (i + 1), ) ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/mpm_solver_warp_diff.py ================================================ import sys import os import warp as wp sys.path.append(os.path.dirname(os.path.realpath(__file__))) from engine_utils import * from diff_warp_utils import * from mpm_utils import * from typing import Optional, Union, Sequence, Any class MPM_Simulator_WARPDiff(object): # def __init__(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"): # self.initialize(n_particles, n_grid, grid_lim, device=device) # self.time_profile = {} def __init__(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"): self.initialize(n_particles, n_grid, grid_lim, device=device) self.time_profile = {} def initialize(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"): self.n_particles = n_particles self.time = 0.0 self.grid_postprocess = [] self.collider_params = [] self.modify_bc = [] self.tailored_struct_for_bc = MPMtailoredStruct() self.pre_p2g_operations = [] self.impulse_params = [] self.particle_velocity_modifiers = [] self.particle_velocity_modifier_params = [] # must give density. 
mass will be updated as density * volume def set_parameters(self, device="cuda:0", **kwargs): self.set_parameters_dict(device, kwargs) def set_parameters_dict(self, mpm_model, mpm_state, kwargs={}, device="cuda:0"): if "material" in kwargs: if kwargs["material"] == "jelly": mpm_model.material = 0 elif kwargs["material"] == "metal": mpm_model.material = 1 elif kwargs["material"] == "sand": mpm_model.material = 2 elif kwargs["material"] == "foam": mpm_model.material = 3 elif kwargs["material"] == "snow": mpm_model.material = 4 elif kwargs["material"] == "plasticine": mpm_model.material = 5 else: raise TypeError("Undefined material type") if "yield_stress" in kwargs: val = kwargs["yield_stress"] wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[mpm_model.yield_stress, val], device=device, ) if "hardening" in kwargs: mpm_model.hardening = kwargs["hardening"] if "xi" in kwargs: mpm_model.xi = kwargs["xi"] if "friction_angle" in kwargs: mpm_model.friction_angle = kwargs["friction_angle"] sin_phi = wp.sin(mpm_model.friction_angle / 180.0 * 3.14159265) mpm_model.alpha = wp.sqrt(2.0 / 3.0) * 2.0 * sin_phi / (3.0 - sin_phi) if "g" in kwargs: mpm_model.gravitational_accelaration = wp.vec3( kwargs["g"][0], kwargs["g"][1], kwargs["g"][2] ) if "density" in kwargs: density_value = kwargs["density"] wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[mpm_state.particle_density, density_value], device=device, ) wp.launch( kernel=get_float_array_product, dim=self.n_particles, inputs=[ mpm_state.particle_density, mpm_state.particle_vol, mpm_state.particle_mass, ], device=device, ) if "rpic_damping" in kwargs: mpm_model.rpic_damping = kwargs["rpic_damping"] if "plastic_viscosity" in kwargs: mpm_model.plastic_viscosity = kwargs["plastic_viscosity"] if "softening" in kwargs: mpm_model.softening = kwargs["softening"] if "grid_v_damping_scale" in kwargs: mpm_model.grid_v_damping_scale = kwargs["grid_v_damping_scale"] def set_E_nu(self, 
        mpm_model, E: float, nu: float, device="cuda:0"):
        # Fill the per-particle Young's modulus and Poisson ratio arrays
        # with the given scalar values.
        wp.launch(
            kernel=set_value_to_float_array,
            dim=self.n_particles,
            inputs=[mpm_model.E, E],
            device=device,
        )
        wp.launch(
            kernel=set_value_to_float_array,
            dim=self.n_particles,
            inputs=[mpm_model.nu, nu],
            device=device,
        )

    def p2g2p(self, mpm_model, mpm_state, step, dt, device="cuda:0"):
        # One full MPM substep: pre-p2g particle ops -> stress/return mapping
        # -> particle-to-grid -> grid update + boundary conditions ->
        # grid-to-particle. `step` is accepted but not referenced in this body.
        grid_size = (
            mpm_model.grid_dim_x,
            mpm_model.grid_dim_y,
            mpm_model.grid_dim_z,
        )
        # derive Lame parameters (mu, lam) from per-particle E and nu
        wp.launch(
            kernel=compute_mu_lam_from_E_nu,
            dim=self.n_particles,
            inputs=[mpm_state, mpm_model],
            device=device,
        )
        wp.launch(
            kernel=zero_grid,  # gradient might gone
            dim=(grid_size),
            inputs=[mpm_state, mpm_model],
            device=device,
        )

        # apply pre-p2g operations on particles
        # apply impulse force on particles..
        for k in range(len(self.pre_p2g_operations)):
            wp.launch(
                kernel=self.pre_p2g_operations[k],
                dim=self.n_particles,
                inputs=[self.time, dt, mpm_state, self.impulse_params[k]],
                device=device,
            )

        # apply dirichlet particle v modifier
        for k in range(len(self.particle_velocity_modifiers)):
            wp.launch(
                kernel=self.particle_velocity_modifiers[k],
                dim=self.n_particles,
                inputs=[
                    self.time,
                    mpm_state,
                    self.particle_velocity_modifier_params[k],
                ],
                device=device,
            )

        # compute stress = stress(returnMap(F_trial))
        with wp.ScopedTimer(
            "compute_stress_from_F_trial",
            synchronize=True,
            print=False,
            dict=self.time_profile,
        ):
            wp.launch(
                kernel=compute_stress_from_F_trial,
                dim=self.n_particles,
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )  # F and stress are updated

        # p2g
        with wp.ScopedTimer(
            "p2g",
            synchronize=True,
            print=False,
            dict=self.time_profile,
        ):
            wp.launch(
                kernel=p2g_apic_with_stress,
                dim=self.n_particles,
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )  # apply p2g'

        # grid update
        with wp.ScopedTimer(
            "grid_update", synchronize=True, print=False, dict=self.time_profile
        ):
            wp.launch(
                kernel=grid_normalization_and_gravity,
                dim=(grid_size),
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )

        if mpm_model.grid_v_damping_scale < 1.0:
            wp.launch(
                kernel=add_damping_via_grid,
                dim=(grid_size),
                inputs=[mpm_state, mpm_model.grid_v_damping_scale],
                device=device,
            )

        # apply BC on grid, collide
        with wp.ScopedTimer(
            "apply_BC_on_grid", synchronize=True, print=False, dict=self.time_profile
        ):
            for k in range(len(self.grid_postprocess)):
                wp.launch(
                    kernel=self.grid_postprocess[k],
                    dim=grid_size,
                    inputs=[
                        self.time,
                        dt,
                        mpm_state,
                        mpm_model,
                        self.collider_params[k],
                    ],
                    device=device,
                )
                # host-side update of time-varying collider state (if any)
                if self.modify_bc[k] is not None:
                    self.modify_bc[k](self.time, dt, self.collider_params[k])

        # g2p
        with wp.ScopedTimer(
            "g2p", synchronize=True, print=False, dict=self.time_profile
        ):
            wp.launch(
                kernel=g2p,
                dim=self.n_particles,
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )  # x, v, C, F_trial are updated

        #### CFL check ####
        # particle_v = self.mpm_state.particle_v.numpy()
        # if np.max(np.abs(particle_v)) > self.mpm_model.dx / dt:
        #     print("max particle v: ", np.max(np.abs(particle_v)))
        #     print("max allowed v: ", self.mpm_model.dx / dt)
        #     print("does not allow v*dt>dx")
        #     input()
        #### CFL check ####
        self.time = self.time + dt

    def print_time_profile(self):
        # Dump accumulated ScopedTimer totals (seconds per phase).
        print("MPM Time profile:")
        for key, value in self.time_profile.items():
            print(key, sum(value))

    # a surface specified by a point and the normal vector
    def add_surface_collider(
        self,
        point,
        normal,
        surface="sticky",
        friction=0.0,
        start_time=0.0,
        end_time=999.0,
    ):
        point = list(point)
        # Normalize normal
        normal_scale = 1.0 / wp.sqrt(float(sum(x**2 for x in normal)))
        normal = list(normal_scale * x for x in normal)

        collider_param = Dirichlet_collider()
        collider_param.start_time = start_time
        collider_param.end_time = end_time

        collider_param.point = wp.vec3(point[0], point[1], point[2])
        collider_param.normal = wp.vec3(normal[0], normal[1], normal[2])
        if surface == "sticky" and friction != 0:
            raise ValueError("friction must be 0 on sticky surfaces.")
        if surface == "sticky":
            collider_param.surface_type = 0
        elif surface == "slip":
            collider_param.surface_type = 1
        elif surface == "cut":
            collider_param.surface_type = 11
        else:
            collider_param.surface_type = 2  # frictional
        collider_param.friction = friction

        self.collider_params.append(collider_param)

        @wp.kernel
        def collide(
            time: float,
            dt: float,
            state: MPMStateStruct,
            model: MPMModelStruct,
            param: Dirichlet_collider,
        ):
            grid_x, grid_y, grid_z = wp.tid()
            if time >= param.start_time and time < param.end_time:
                offset = wp.vec3(
                    float(grid_x) * model.dx - param.point[0],
                    float(grid_y) * model.dx - param.point[1],
                    float(grid_z) * model.dx - param.point[2],
                )
                n = wp.vec3(param.normal[0], param.normal[1], param.normal[2])
                dotproduct = wp.dot(offset, n)

                if dotproduct < 0.0:  # grid node lies behind the surface
                    if param.surface_type == 0:
                        # sticky: kill all velocity at the node
                        state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                            0.0, 0.0, 0.0
                        )
                    elif param.surface_type == 11:
                        # "cut": freeze everywhere except a fixed z band where
                        # only damped in-plane motion is kept
                        if (
                            float(grid_z) * model.dx < 0.4
                            or float(grid_z) * model.dx > 0.53
                        ):
                            state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                                0.0, 0.0, 0.0
                            )
                        else:
                            v_in = state.grid_v_out[grid_x, grid_y, grid_z]
                            state.grid_v_out[grid_x, grid_y, grid_z] = (
                                wp.vec3(v_in[0], 0.0, v_in[2]) * 0.3
                            )
                    else:
                        v = state.grid_v_out[grid_x, grid_y, grid_z]
                        normal_component = wp.dot(v, n)
                        if param.surface_type == 1:
                            v = (
                                v - normal_component * n
                            )  # Project out all normal component
                        else:
                            v = (
                                v - wp.min(normal_component, 0.0) * n
                            )  # Project out only inward normal component
                        if normal_component < 0.0 and wp.length(v) > 1e-20:
                            v = wp.max(
                                0.0, wp.length(v) + normal_component * param.friction
                            ) * wp.normalize(
                                v
                            )  # apply friction here
                        # NOTE(review): the projected / friction-adjusted v
                        # computed above is discarded and the node is zeroed,
                        # so slip/frictional currently behave like sticky.
                        # Confirm whether this should assign v instead.
                        state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                            0.0, 0.0, 0.0
                        )

        self.grid_postprocess.append(collide)
        self.modify_bc.append(None)

    # a cubiod is a rectangular cube'
    # centered at `point`
    # dimension is x: point[0]±size[0]
    #              y: point[1]±size[1]
    #              z: point[2]±size[2]
    # all grid nodes lie within the cubiod will have their speed set to velocity
    # the cuboid itself is also moving with const speed = velocity
    # set the speed to zero to fix BC
    def set_velocity_on_cuboid(
        self,
        point,
        size,
        velocity,
        start_time=0.0,
        end_time=999.0,
        reset=0,
    ):
        point = list(point)
        collider_param = Dirichlet_collider()
        collider_param.start_time = start_time
        collider_param.end_time = end_time

        collider_param.point = wp.vec3(point[0], point[1], point[2])
        collider_param.size = size
        collider_param.velocity = wp.vec3(velocity[0], velocity[1], velocity[2])
        # collider_param.threshold = threshold
        collider_param.reset = reset
        self.collider_params.append(collider_param)

        @wp.kernel
        def collide(
            time: float,
            dt: float,
            state: MPMStateStruct,
            model: MPMModelStruct,
            param: Dirichlet_collider,
        ):
            grid_x, grid_y, grid_z = wp.tid()
            if time >= param.start_time and time < param.end_time:
                offset = wp.vec3(
                    float(grid_x) * model.dx - param.point[0],
                    float(grid_y) * model.dx - param.point[1],
                    float(grid_z) * model.dx - param.point[2],
                )
                if (
                    wp.abs(offset[0]) < param.size[0]
                    and wp.abs(offset[1]) < param.size[1]
                    and wp.abs(offset[2]) < param.size[2]
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = param.velocity
            elif param.reset == 1:
                # shortly after end_time, zero the whole grid to stop motion
                if time < param.end_time + 15.0 * dt:
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0)

        def modify(time, dt, param: Dirichlet_collider):
            # host-side update: advance the moving cuboid's center each step
            if time >= param.start_time and time < param.end_time:
                param.point = wp.vec3(
                    param.point[0] + dt * param.velocity[0],
                    param.point[1] + dt * param.velocity[1],
                    param.point[2] + dt * param.velocity[2],
                )  # param.point + dt * param.velocity

        self.grid_postprocess.append(collide)
        self.modify_bc.append(modify)

    def add_bounding_box(self, start_time=0.0, end_time=999.0):
        # Same six-face clamping as in the non-diff solver: within a 3-cell
        # boundary band, cancel the velocity component pointing outward.
        collider_param = Dirichlet_collider()
        collider_param.start_time = start_time
        collider_param.end_time = end_time
        self.collider_params.append(collider_param)

        @wp.kernel
        def collide(
            time: float,
            dt: float,
            state: MPMStateStruct,
            model: MPMModelStruct,
            param: Dirichlet_collider,
        ):
            grid_x, grid_y, grid_z = wp.tid()
            padding = 3
            if time >= param.start_time and time < param.end_time:
                if grid_x < padding and state.grid_v_out[grid_x, grid_y, grid_z][0] < 0:
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                if (
                    grid_x >= model.grid_dim_x - padding
                    and state.grid_v_out[grid_x, grid_y, grid_z][0] > 0
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                if grid_y < padding and state.grid_v_out[grid_x, grid_y, grid_z][1] < 0:
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                if (
                    grid_y >= model.grid_dim_y - padding
                    and state.grid_v_out[grid_x, grid_y, grid_z][1] > 0
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                if grid_z < padding and state.grid_v_out[grid_x, grid_y, grid_z][2] < 0:
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        0.0,
                    )
                if (
                    grid_z >= model.grid_dim_z - padding
                    and state.grid_v_out[grid_x, grid_y, grid_z][2] > 0
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        0.0,
                    )

        self.grid_postprocess.append(collide)
        self.modify_bc.append(None)

    # particle_v += force/particle_mass * dt
    # this is applied from start_dt, ends after num_dt p2g2p's
    # particle velocity is changed before p2g at each timestep
    def add_impulse_on_particles(
        self,
        mpm_state,
        force,
        dt,
        point=[1, 1, 1],
        size=[1, 1, 1],
        num_dt=1,
        start_time=0.0,
        device="cuda:0",
    ):
        # NOTE(review): mutable list defaults for point/size are shared across
        # calls; they are only read here, but verify they are never mutated.
        impulse_param = Impulse_modifier()

        impulse_param.start_time = start_time
        impulse_param.end_time = start_time + dt * num_dt

        impulse_param.point = wp.vec3(point[0], point[1], point[2])
        impulse_param.size = wp.vec3(size[0], size[1], size[2])
        impulse_param.mask = wp.zeros(shape=self.n_particles, dtype=int, device=device)
impulse_param.force = wp.vec3( force[0], force[1], force[2], ) wp.launch( kernel=selection_add_impulse_on_particles, dim=self.n_particles, inputs=[mpm_state, impulse_param], device=device, ) self.impulse_params.append(impulse_param) @wp.kernel def apply_force( time: float, dt: float, state: MPMStateStruct, param: Impulse_modifier ): p = wp.tid() if time >= param.start_time and time < param.end_time: if param.mask[p] == 1: impulse = wp.vec3( param.force[0] / state.particle_mass[p], param.force[1] / state.particle_mass[p], param.force[2] / state.particle_mass[p], ) state.particle_v[p] = state.particle_v[p] + impulse * dt self.pre_p2g_operations.append(apply_force) def enforce_particle_velocity_translation( self, mpm_state, point, size, velocity, start_time, end_time, device="cuda:0" ): # first select certain particles based on position velocity_modifier_params = ParticleVelocityModifier() velocity_modifier_params.point = wp.vec3(point[0], point[1], point[2]) velocity_modifier_params.size = wp.vec3(size[0], size[1], size[2]) velocity_modifier_params.velocity = wp.vec3( velocity[0], velocity[1], velocity[2] ) velocity_modifier_params.start_time = start_time velocity_modifier_params.end_time = end_time velocity_modifier_params.mask = wp.zeros( shape=self.n_particles, dtype=int, device=device ) wp.launch( kernel=selection_enforce_particle_velocity_translation, dim=self.n_particles, inputs=[mpm_state, velocity_modifier_params], device=device, ) self.particle_velocity_modifier_params.append(velocity_modifier_params) @wp.kernel def modify_particle_v_before_p2g( time: float, state: MPMStateStruct, velocity_modifier_params: ParticleVelocityModifier, ): p = wp.tid() if ( time >= velocity_modifier_params.start_time and time < velocity_modifier_params.end_time ): if velocity_modifier_params.mask[p] == 1: state.particle_v[p] = velocity_modifier_params.velocity self.particle_velocity_modifiers.append(modify_particle_v_before_p2g) # define a cylinder with center point, half_height, 
radius, normal # particles within the cylinder are rotating along the normal direction # may also have a translational velocity along the normal direction def enforce_particle_velocity_rotation( self, mpm_state, point, normal, half_height_and_radius, rotation_scale, translation_scale, start_time, end_time, device="cuda:0", ): normal_scale = 1.0 / wp.sqrt( float(normal[0] ** 2 + normal[1] ** 2 + normal[2] ** 2) ) normal = list(normal_scale * x for x in normal) velocity_modifier_params = ParticleVelocityModifier() velocity_modifier_params.point = wp.vec3(point[0], point[1], point[2]) velocity_modifier_params.half_height_and_radius = wp.vec2( half_height_and_radius[0], half_height_and_radius[1] ) velocity_modifier_params.normal = wp.vec3(normal[0], normal[1], normal[2]) horizontal_1 = wp.vec3(1.0, 1.0, 1.0) if wp.abs(wp.dot(velocity_modifier_params.normal, horizontal_1)) < 0.01: horizontal_1 = wp.vec3(0.72, 0.37, -0.67) horizontal_1 = ( horizontal_1 - wp.dot(horizontal_1, velocity_modifier_params.normal) * velocity_modifier_params.normal ) horizontal_1 = horizontal_1 * (1.0 / wp.length(horizontal_1)) horizontal_2 = wp.cross(horizontal_1, velocity_modifier_params.normal) velocity_modifier_params.horizontal_axis_1 = horizontal_1 velocity_modifier_params.horizontal_axis_2 = horizontal_2 velocity_modifier_params.rotation_scale = rotation_scale velocity_modifier_params.translation_scale = translation_scale velocity_modifier_params.start_time = start_time velocity_modifier_params.end_time = end_time velocity_modifier_params.mask = wp.zeros( shape=self.n_particles, dtype=int, device=device ) wp.launch( kernel=selection_enforce_particle_velocity_cylinder, dim=self.n_particles, inputs=[mpm_state, velocity_modifier_params], device=device, ) self.particle_velocity_modifier_params.append(velocity_modifier_params) @wp.kernel def modify_particle_v_before_p2g( time: float, state: MPMStateStruct, velocity_modifier_params: ParticleVelocityModifier, ): p = wp.tid() if ( time >= 
velocity_modifier_params.start_time and time < velocity_modifier_params.end_time ): if velocity_modifier_params.mask[p] == 1: offset = state.particle_x[p] - velocity_modifier_params.point horizontal_distance = wp.length( offset - wp.dot(offset, velocity_modifier_params.normal) * velocity_modifier_params.normal ) cosine = ( wp.dot(offset, velocity_modifier_params.horizontal_axis_1) / horizontal_distance ) theta = wp.acos(cosine) if wp.dot(offset, velocity_modifier_params.horizontal_axis_2) > 0: theta = theta else: theta = -theta axis1_scale = ( -horizontal_distance * wp.sin(theta) * velocity_modifier_params.rotation_scale ) axis2_scale = ( horizontal_distance * wp.cos(theta) * velocity_modifier_params.rotation_scale ) axis_vertical_scale = translation_scale state.particle_v[p] = ( axis1_scale * velocity_modifier_params.horizontal_axis_1 + axis2_scale * velocity_modifier_params.horizontal_axis_2 + axis_vertical_scale * velocity_modifier_params.normal ) self.particle_velocity_modifiers.append(modify_particle_v_before_p2g) # given normal direction, say [0,0,1] # gradually release grid velocities from start position to end position def release_particles_sequentially( self, normal, start_position, end_position, num_layers, start_time, end_time ): num_layers = 50 point = [0, 0, 0] size = [0, 0, 0] axis = -1 for i in range(3): if normal[i] == 0: point[i] = 1 size[i] = 1 else: axis = i point[i] = end_position half_length_portion = wp.abs(start_position - end_position) / num_layers end_time_portion = end_time / num_layers for i in range(num_layers): size[axis] = half_length_portion * (num_layers - i) self.enforce_particle_velocity_translation( point=point, size=size, velocity=[0, 0, 0], start_time=start_time, end_time=end_time_portion * (i + 1), ) def enforce_particle_velocity_by_mask( self, mpm_state, selection_mask:torch.Tensor, velocity, start_time, end_time, device="cuda:0" ): # first select certain particles based on position velocity_modifier_params = 
ParticleVelocityModifier() velocity_modifier_params.velocity = wp.vec3( velocity[0], velocity[1], velocity[2] ) velocity_modifier_params.start_time = start_time velocity_modifier_params.end_time = end_time velocity_modifier_params.mask = wp.from_torch(selection_mask, device=device) wp.launch( kernel=selection_enforce_particle_velocity_translation, dim=self.n_particles, inputs=[mpm_state, velocity_modifier_params], device=device, ) self.particle_velocity_modifier_params.append(velocity_modifier_params) @wp.kernel def modify_particle_v_before_p2g( time: float, state: MPMStateStruct, velocity_modifier_params: ParticleVelocityModifier, ): p = wp.tid() if ( time >= velocity_modifier_params.start_time and time < velocity_modifier_params.end_time ): if velocity_modifier_params.mask[p] == 1: state.particle_v[p] = velocity_modifier_params.velocity self.particle_velocity_modifiers.append(modify_particle_v_before_p2g) ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/mpm_utils.py ================================================ import warp as wp from diff_warp_utils import * import numpy as np import math # compute stress from F @wp.func def kirchoff_stress_FCR( F: wp.mat33, U: wp.mat33, V: wp.mat33, J: float, mu: float, lam: float ): # compute kirchoff stress for FCR model (remember tau = P F^T) R = U * wp.transpose(V) id = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) return 2.0 * mu * (F - R) * wp.transpose(F) + id * lam * J * (J - 1.0) @wp.func def kirchoff_stress_neoHookean( F: wp.mat33, U: wp.mat33, V: wp.mat33, J: float, sig: wp.vec3, mu: float, lam: float ): # compute kirchoff stress for FCR model (remember tau = P F^T) b = wp.vec3(sig[0] * sig[0], sig[1] * sig[1], sig[2] * sig[2]) b_hat = b - wp.vec3( (b[0] + b[1] + b[2]) / 3.0, (b[0] + b[1] + b[2]) / 3.0, (b[0] + b[1] + b[2]) / 3.0, ) tau = mu * J ** (-2.0 / 3.0) * b_hat + lam / 2.0 * (J * J - 1.0) * wp.vec3( 1.0, 1.0, 1.0 ) return ( U * 
wp.mat33(tau[0], 0.0, 0.0, 0.0, tau[1], 0.0, 0.0, 0.0, tau[2]) * wp.transpose(V) * wp.transpose(F) ) @wp.func def kirchoff_stress_StVK( F: wp.mat33, U: wp.mat33, V: wp.mat33, sig: wp.vec3, mu: float, lam: float ): sig = wp.vec3( wp.max(sig[0], 0.01), wp.max(sig[1], 0.01), wp.max(sig[2], 0.01) ) # add this to prevent NaN in extrem cases epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2])) log_sig_sum = wp.log(sig[0]) + wp.log(sig[1]) + wp.log(sig[2]) ONE = wp.vec3(1.0, 1.0, 1.0) tau = 2.0 * mu * epsilon + lam * log_sig_sum * ONE return ( U * wp.mat33(tau[0], 0.0, 0.0, 0.0, tau[1], 0.0, 0.0, 0.0, tau[2]) * wp.transpose(V) * wp.transpose(F) ) @wp.func def kirchoff_stress_drucker_prager( F: wp.mat33, U: wp.mat33, V: wp.mat33, sig: wp.vec3, mu: float, lam: float ): log_sig_sum = wp.log(sig[0]) + wp.log(sig[1]) + wp.log(sig[2]) center00 = 2.0 * mu * wp.log(sig[0]) * (1.0 / sig[0]) + lam * log_sig_sum * ( 1.0 / sig[0] ) center11 = 2.0 * mu * wp.log(sig[1]) * (1.0 / sig[1]) + lam * log_sig_sum * ( 1.0 / sig[1] ) center22 = 2.0 * mu * wp.log(sig[2]) * (1.0 / sig[2]) + lam * log_sig_sum * ( 1.0 / sig[2] ) center = wp.mat33(center00, 0.0, 0.0, 0.0, center11, 0.0, 0.0, 0.0, center22) return U * center * wp.transpose(V) * wp.transpose(F) @wp.func def von_mises_return_mapping(F_trial: wp.mat33, model: MPMModelStruct, p: int): U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) sig_old = wp.vec3(0.0) wp.svd3(F_trial, U, sig_old, V) sig = wp.vec3( wp.max(sig_old[0], 0.01), wp.max(sig_old[1], 0.01), wp.max(sig_old[2], 0.01) ) # add this to prevent NaN in extrem cases epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2])) temp = (epsilon[0] + epsilon[1] + epsilon[2]) / 3.0 tau = 2.0 * model.mu[p] * epsilon + model.lam[p] * ( epsilon[0] + epsilon[1] + epsilon[2] ) * wp.vec3(1.0, 1.0, 1.0) sum_tau = tau[0] + tau[1] + tau[2] cond = wp.vec3( tau[0] - sum_tau / 3.0, tau[1] - sum_tau / 3.0, tau[2] - 
sum_tau / 3.0 ) if wp.length(cond) > model.yield_stress[p]: epsilon_hat = epsilon - wp.vec3(temp, temp, temp) epsilon_hat_norm = wp.length(epsilon_hat) + 1e-6 delta_gamma = epsilon_hat_norm - model.yield_stress[p] / (2.0 * model.mu[p]) epsilon = epsilon - (delta_gamma / epsilon_hat_norm) * epsilon_hat sig_elastic = wp.mat33( wp.exp(epsilon[0]), 0.0, 0.0, 0.0, wp.exp(epsilon[1]), 0.0, 0.0, 0.0, wp.exp(epsilon[2]), ) F_elastic = U * sig_elastic * wp.transpose(V) if model.hardening == 1: model.yield_stress[p] = ( model.yield_stress[p] + 2.0 * model.mu[p] * model.xi * delta_gamma ) return F_elastic else: return F_trial @wp.func def von_mises_return_mapping_with_damage( F_trial: wp.mat33, model: MPMModelStruct, p: int ): U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) sig_old = wp.vec3(0.0) wp.svd3(F_trial, U, sig_old, V) sig = wp.vec3( wp.max(sig_old[0], 0.01), wp.max(sig_old[1], 0.01), wp.max(sig_old[2], 0.01) ) # add this to prevent NaN in extrem cases epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2])) temp = (epsilon[0] + epsilon[1] + epsilon[2]) / 3.0 tau = 2.0 * model.mu[p] * epsilon + model.lam[p] * ( epsilon[0] + epsilon[1] + epsilon[2] ) * wp.vec3(1.0, 1.0, 1.0) sum_tau = tau[0] + tau[1] + tau[2] cond = wp.vec3( tau[0] - sum_tau / 3.0, tau[1] - sum_tau / 3.0, tau[2] - sum_tau / 3.0 ) if wp.length(cond) > model.yield_stress[p]: if model.yield_stress[p] <= 0: return F_trial epsilon_hat = epsilon - wp.vec3(temp, temp, temp) epsilon_hat_norm = wp.length(epsilon_hat) + 1e-6 delta_gamma = epsilon_hat_norm - model.yield_stress[p] / (2.0 * model.mu[p]) epsilon = epsilon - (delta_gamma / epsilon_hat_norm) * epsilon_hat model.yield_stress[p] = model.yield_stress[p] - model.softening * wp.length( (delta_gamma / epsilon_hat_norm) * epsilon_hat ) if model.yield_stress[p] <= 0: model.mu[p] = 0.0 model.lam[p] = 0.0 sig_elastic = wp.mat33( wp.exp(epsilon[0]), 0.0, 0.0, 0.0, wp.exp(epsilon[1]), 
0.0, 0.0, 0.0, wp.exp(epsilon[2]), ) F_elastic = U * sig_elastic * wp.transpose(V) if model.hardening == 1: model.yield_stress[p] = ( model.yield_stress[p] + 2.0 * model.mu[p] * model.xi * delta_gamma ) return F_elastic else: return F_trial # for toothpaste @wp.func def viscoplasticity_return_mapping_with_StVK( F_trial: wp.mat33, model: MPMModelStruct, p: int, dt: float ): U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) sig_old = wp.vec3(0.0) wp.svd3(F_trial, U, sig_old, V) sig = wp.vec3( wp.max(sig_old[0], 0.01), wp.max(sig_old[1], 0.01), wp.max(sig_old[2], 0.01) ) # add this to prevent NaN in extrem cases b_trial = wp.vec3(sig[0] * sig[0], sig[1] * sig[1], sig[2] * sig[2]) epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2])) trace_epsilon = epsilon[0] + epsilon[1] + epsilon[2] epsilon_hat = epsilon - wp.vec3( trace_epsilon / 3.0, trace_epsilon / 3.0, trace_epsilon / 3.0 ) s_trial = 2.0 * model.mu[p] * epsilon_hat s_trial_norm = wp.length(s_trial) y = s_trial_norm - wp.sqrt(2.0 / 3.0) * model.yield_stress[p] if y > 0: mu_hat = model.mu[p] * (b_trial[0] + b_trial[1] + b_trial[2]) / 3.0 s_new_norm = s_trial_norm - y / ( 1.0 + model.plastic_viscosity / (2.0 * mu_hat * dt) ) s_new = (s_new_norm / s_trial_norm) * s_trial epsilon_new = 1.0 / (2.0 * model.mu[p]) * s_new + wp.vec3( trace_epsilon / 3.0, trace_epsilon / 3.0, trace_epsilon / 3.0 ) sig_elastic = wp.mat33( wp.exp(epsilon_new[0]), 0.0, 0.0, 0.0, wp.exp(epsilon_new[1]), 0.0, 0.0, 0.0, wp.exp(epsilon_new[2]), ) F_elastic = U * sig_elastic * wp.transpose(V) return F_elastic else: return F_trial @wp.func def sand_return_mapping( F_trial: wp.mat33, state: MPMStateStruct, model: MPMModelStruct, p: int ): U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) sig = wp.vec3(0.0) wp.svd3(F_trial, U, sig, V) epsilon = wp.vec3( wp.log(wp.max(wp.abs(sig[0]), 1e-14)), 
        wp.log(wp.max(wp.abs(sig[1]), 1e-14)),
        wp.log(wp.max(wp.abs(sig[2]), 1e-14)),
    )
    sigma_out = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
    tr = epsilon[0] + epsilon[1] + epsilon[2]  # + state.particle_Jp[p]
    epsilon_hat = epsilon - wp.vec3(tr / 3.0, tr / 3.0, tr / 3.0)
    epsilon_hat_norm = wp.length(epsilon_hat)
    delta_gamma = (
        epsilon_hat_norm
        + (3.0 * model.lam[p] + 2.0 * model.mu[p]) / (2.0 * model.mu[p]) * tr * model.alpha
    )
    if delta_gamma <= 0:
        # Inside the yield surface: purely elastic step, keep the trial F.
        F_elastic = F_trial
    if delta_gamma > 0 and tr > 0:
        # Volumetric expansion: project to the cone tip (no tensile resistance).
        F_elastic = U * wp.transpose(V)
    if delta_gamma > 0 and tr <= 0:
        # Compression: radial return of the deviatoric Hencky strain.
        H = epsilon - epsilon_hat * (delta_gamma / epsilon_hat_norm)
        s_new = wp.vec3(wp.exp(H[0]), wp.exp(H[1]), wp.exp(H[2]))
        F_elastic = U * wp.diag(s_new) * wp.transpose(V)
    return F_elastic


@wp.kernel
def compute_mu_lam_from_E_nu(state: MPMStateStruct, model: MPMModelStruct):
    # Convert per-particle Young's modulus / Poisson's ratio into the
    # Lame parameters used by the stress kernels.
    p = wp.tid()
    model.mu[p] = model.E[p] / (2.0 * (1.0 + model.nu[p]))
    model.lam[p] = (
        model.E[p] * model.nu[p] / ((1.0 + model.nu[p]) * (1.0 - 2.0 * model.nu[p]))
    )


@wp.kernel
def zero_grid(state: MPMStateStruct, model: MPMModelStruct):
    # Reset grid mass and velocity/momentum buffers before the next P2G pass.
    grid_x, grid_y, grid_z = wp.tid()
    state.grid_m[grid_x, grid_y, grid_z] = 0.0
    state.grid_v_in[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0)
    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0)


@wp.func
def compute_dweight(
    model: MPMModelStruct, w: wp.mat33, dw: wp.mat33, i: int, j: int, k: int
):
    # Gradient of the separable interpolation weight for grid node (i, j, k);
    # rows of w / dw hold the per-axis weights and their derivatives.
    dweight = wp.vec3(
        dw[0, i] * w[1, j] * w[2, k],
        w[0, i] * dw[1, j] * w[2, k],
        w[0, i] * w[1, j] * dw[2, k],
    )
    return dweight * model.inv_dx


@wp.func
def update_cov(state: MPMStateStruct, p: int, grad_v: wp.mat33, dt: float):
    # Advance the particle covariance (stored as 6 upper-triangular floats per
    # particle) with the velocity gradient:
    #   cov' = cov + dt * (grad_v * cov + cov * grad_v^T)
    cov_n = wp.mat33(0.0)
    cov_n[0, 0] = state.particle_cov[p * 6]
    cov_n[0, 1] = state.particle_cov[p * 6 + 1]
    cov_n[0, 2] = state.particle_cov[p * 6 + 2]
    cov_n[1, 0] = state.particle_cov[p * 6 + 1]
    cov_n[1, 1] = state.particle_cov[p * 6 + 3]
    cov_n[1, 2] = state.particle_cov[p * 6 + 4]
    cov_n[2, 0] = state.particle_cov[p * 6 + 2]
    cov_n[2, 1] = state.particle_cov[p * 6 + 4]
    cov_n[2, 2] = state.particle_cov[p * 6 + 5]
    cov_np1 = cov_n + dt * (grad_v * cov_n + cov_n * wp.transpose(grad_v))
    state.particle_cov[p * 6] = cov_np1[0, 0]
    state.particle_cov[p * 6 + 1] = cov_np1[0, 1]
    state.particle_cov[p * 6 + 2] = cov_np1[0, 2]
    state.particle_cov[p * 6 + 3] = cov_np1[1, 1]
    state.particle_cov[p * 6 + 4] = cov_np1[1, 2]
    state.particle_cov[p * 6 + 5] = cov_np1[2, 2]


@wp.kernel
def p2g_apic_with_stress(state: MPMStateStruct, model: MPMModelStruct, dt: float):
    # Particle-to-grid (APIC) transfer with internal stress forces.
    # input given to p2g: particle_stress, particle_x, particle_v, particle_C
    # output: grid_v_in, grid_m
    p = wp.tid()
    if state.particle_selection[p] == 0:
        stress = state.particle_stress[p]
        grid_pos = state.particle_x[p] * model.inv_dx
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        # Quadratic B-spline weights per axis (rows of w), and their derivatives (dw).
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )
        dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5))
        # Scatter into the 3x3x3 neighborhood of grid nodes.
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    dpos = (
                        wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx
                    ) * model.dx
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    weight = w[0, i] * w[1, j] * w[2, k]  # separable interpolation weight
                    dweight = compute_dweight(model, w, dw, i, j, k)
                    C = state.particle_C[p]
                    # if model.rpic = 0, standard apic; rpic_damping blends in
                    # the skew-symmetric (rotational) part of C.
                    C = (1.0 - model.rpic_damping) * C + model.rpic_damping / 2.0 * (
                        C - wp.transpose(C)
                    )
                    # C = (1.0 - model.rpic_damping) * state.particle_C[
                    #     p
                    # ] + model.rpic_damping / 2.0 * (
                    #     state.particle_C[p] - wp.transpose(state.particle_C[p])
                    # )
                    if model.rpic_damping < -0.001:
                        # standard pic
                        C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
                    elastic_force = -state.particle_vol[p] * stress * dweight
                    v_in_add = (
                        weight *
                        state.particle_mass[p] * (state.particle_v[p] + C * dpos)
                        + dt * elastic_force
                    )
                    wp.atomic_add(state.grid_v_in, ix, iy, iz, v_in_add)
                    wp.atomic_add(
                        state.grid_m, ix, iy, iz, weight * state.particle_mass[p]
                    )


# add gravity
@wp.kernel
def grid_normalization_and_gravity(
    state: MPMStateStruct, model: MPMModelStruct, dt: float
):
    # Convert accumulated grid momentum into velocity (divide by mass) and
    # apply gravity; nodes with (near) zero mass are skipped.
    grid_x, grid_y, grid_z = wp.tid()
    if state.grid_m[grid_x, grid_y, grid_z] > 1e-15:
        v_out = state.grid_v_in[grid_x, grid_y, grid_z] * (
            1.0 / state.grid_m[grid_x, grid_y, grid_z]
        )
        # add gravity
        v_out = v_out + dt * model.gravitational_accelaration
        state.grid_v_out[grid_x, grid_y, grid_z] = v_out


@wp.kernel
def g2p(state: MPMStateStruct, model: MPMModelStruct, dt: float):
    # Grid-to-particle transfer: gather velocity (new_v), the APIC affine
    # matrix (new_C) and the velocity gradient (new_F), then advect the
    # particle and form the trial deformation gradient.
    p = wp.tid()
    if state.particle_selection[p] == 0:
        grid_pos = state.particle_x[p] * model.inv_dx
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        # Same quadratic B-spline weights as in the P2G kernel.
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )
        dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5))
        new_v = wp.vec3(0.0, 0.0, 0.0)
        new_C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        new_F = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    dpos = wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx
                    weight = w[0, i] * w[1, j] * w[2, k]  # separable interpolation weight
                    grid_v = state.grid_v_out[ix, iy, iz]
                    new_v = new_v + grid_v * weight
                    new_C = new_C + wp.outer(grid_v, dpos) * (
                        weight * model.inv_dx * 4.0
                    )
                    dweight = compute_dweight(model, w, dw, i, j, k)
                    new_F = new_F + wp.outer(grid_v, dweight)
        state.particle_v[p] = new_v
        # state.particle_x[p] = state.particle_x[p] + dt * new_v
        # state.particle_x[p] = state.particle_x[p] + dt * state.particle_v[p]
        # Advect via atomic_add (keeps the update differentiable through warp's adjoint).
        wp.atomic_add(state.particle_x, p, dt * state.particle_v[p])
        state.particle_C[p] = new_C
        I33 = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
        # Incremental update: F_trial = (I + dt * grad_v) * F.
        F_tmp = (I33 + new_F * dt) * state.particle_F[p]
        state.particle_F_trial[p] = F_tmp
        if model.update_cov_with_F:
            update_cov(state, p, new_F, dt)


# compute (Kirchhoff) stress = stress(returnMap(F_trial))
@wp.kernel
def compute_stress_from_F_trial(
    state: MPMStateStruct, model: MPMModelStruct, dt: float
):
    # Apply the material-specific plastic return mapping to F_trial, then
    # evaluate the (symmetrized) Kirchhoff stress consumed by the next P2G.
    p = wp.tid()
    if state.particle_selection[p] == 0:
        # apply return mapping
        if model.material == 1:  # metal
            state.particle_F[p] = von_mises_return_mapping(
                state.particle_F_trial[p], model, p
            )
        elif model.material == 2:  # sand
            state.particle_F[p] = sand_return_mapping(
                state.particle_F_trial[p], state, model, p
            )
        elif model.material == 3:  # visplas, with StVk+VM, no thickening
            state.particle_F[p] = viscoplasticity_return_mapping_with_StVK(
                state.particle_F_trial[p], model, p, dt
            )
        elif model.material == 5:  # von Mises with damage/softening
            state.particle_F[p] = von_mises_return_mapping_with_damage(
                state.particle_F_trial[p], model, p
            )
        else:  # elastic, jelly
            state.particle_F[p] = state.particle_F_trial[p]
        # also compute stress here
        J = wp.determinant(state.particle_F[p])
        U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        sig = wp.vec3(0.0)
        stress = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        wp.svd3(state.particle_F[p], U, sig, V)
        if model.material == 0 or model.material == 5:
            stress = kirchoff_stress_FCR(
                state.particle_F[p], U, V, J, model.mu[p], model.lam[p]
            )
        if model.material == 1:
            stress = kirchoff_stress_StVK(
                state.particle_F[p], U, V, sig, model.mu[p], model.lam[p]
            )
        if model.material == 2:
            stress = kirchoff_stress_drucker_prager(
                state.particle_F[p], U, V, sig, model.mu[p], model.lam[p]
            )
        if model.material == 3:
            # temporarily use stvk, subject to change
            stress = kirchoff_stress_StVK(
                state.particle_F[p], U, V, sig, model.mu[p], model.lam[p]
            )
        # stress = (stress + wp.transpose(stress)) / 2.0  # enforce symmetry
        state.particle_stress[p] = (stress + wp.transpose(stress)) / 2.0


@wp.kernel
def compute_cov_from_F(state: MPMStateStruct, model: MPMModelStruct):
    # Push the initial covariance forward with the trial deformation
    # gradient: cov = F * cov_init * F^T (stored as 6 upper-triangular floats).
    p = wp.tid()
    F = state.particle_F_trial[p]
    init_cov = wp.mat33(0.0)
    init_cov[0, 0] = state.particle_init_cov[p * 6]
    init_cov[0, 1] = state.particle_init_cov[p * 6 + 1]
    init_cov[0, 2] = state.particle_init_cov[p * 6 + 2]
    init_cov[1, 0] = state.particle_init_cov[p * 6 + 1]
    init_cov[1, 1] = state.particle_init_cov[p * 6 + 3]
    init_cov[1, 2] = state.particle_init_cov[p * 6 + 4]
    init_cov[2, 0] = state.particle_init_cov[p * 6 + 2]
    init_cov[2, 1] = state.particle_init_cov[p * 6 + 4]
    init_cov[2, 2] = state.particle_init_cov[p * 6 + 5]
    cov = F * init_cov * wp.transpose(F)
    state.particle_cov[p * 6] = cov[0, 0]
    state.particle_cov[p * 6 + 1] = cov[0, 1]
    state.particle_cov[p * 6 + 2] = cov[0, 2]
    state.particle_cov[p * 6 + 3] = cov[1, 1]
    state.particle_cov[p * 6 + 4] = cov[1, 2]
    state.particle_cov[p * 6 + 5] = cov[2, 2]


@wp.kernel
def compute_R_from_F(state: MPMStateStruct, model: MPMModelStruct):
    # Extract the rotation part of F_trial via SVD (polar decomposition),
    # flipping the last columns so U and V are proper rotations (det = +1).
    p = wp.tid()
    F = state.particle_F_trial[p]
    # polar svd decomposition
    U = wp.mat33(0.0)
    V = wp.mat33(0.0)
    sig = wp.vec3(0.0)
    wp.svd3(F, U, sig, V)
    if wp.determinant(U) < 0.0:
        U[0, 2] = -U[0, 2]
        U[1, 2] = -U[1, 2]
        U[2, 2] = -U[2, 2]
    if wp.determinant(V) < 0.0:
        V[0, 2] = -V[0, 2]
        V[1, 2] = -V[1, 2]
        V[2, 2] = -V[2, 2]
    # compute rotation matrix
    R = U * wp.transpose(V)
    # NOTE(review): the transpose R^T is stored, not R — confirm callers
    # expect the inverse rotation.
    state.particle_R[p] = wp.transpose(R)


@wp.kernel
def add_damping_via_grid(state: MPMStateStruct, scale: float):
    # Uniformly scale grid velocities; scale < 1.0 damps the motion.
    grid_x, grid_y, grid_z = wp.tid()
    state.grid_v_out[grid_x, grid_y, grid_z] = (
        state.grid_v_out[grid_x, grid_y, grid_z] * scale
    )


@wp.kernel
def apply_additional_params(
    state: MPMStateStruct,
    model: MPMModelStruct,
    params_modifier: MaterialParamsModifier,
):
    # Override E, nu and density for particles inside the modifier's
    # axis-aligned box (point +/- size).
    p = wp.tid()
    pos = state.particle_x[p]
    if (
        pos[0] > params_modifier.point[0] - params_modifier.size[0]
        and pos[0] < params_modifier.point[0] + params_modifier.size[0]
        and pos[1] > params_modifier.point[1] - params_modifier.size[1]
        and pos[1] < params_modifier.point[1] + params_modifier.size[1]
        and pos[2] > params_modifier.point[2] - params_modifier.size[2]
        and pos[2] < params_modifier.point[2] + params_modifier.size[2]
    ):
        model.E[p] = params_modifier.E
        model.nu[p] = params_modifier.nu
        state.particle_density[p] = params_modifier.density


@wp.kernel
def selection_add_impulse_on_particles(
    state: MPMStateStruct, impulse_modifier: Impulse_modifier
):
    # Set mask[p] = 1 for particles inside the impulse box, 0 otherwise.
    p = wp.tid()
    offset = state.particle_x[p] - impulse_modifier.point
    if (
        wp.abs(offset[0]) < impulse_modifier.size[0]
        and wp.abs(offset[1]) < impulse_modifier.size[1]
        and wp.abs(offset[2]) < impulse_modifier.size[2]
    ):
        impulse_modifier.mask[p] = 1
    else:
        impulse_modifier.mask[p] = 0


@wp.kernel
def selection_enforce_particle_velocity_translation(
    state: MPMStateStruct, velocity_modifier: ParticleVelocityModifier
):
    # Set mask[p] = 1 for particles inside the modifier's axis-aligned box.
    p = wp.tid()
    offset = state.particle_x[p] - velocity_modifier.point
    if (
        wp.abs(offset[0]) < velocity_modifier.size[0]
        and wp.abs(offset[1]) < velocity_modifier.size[1]
        and wp.abs(offset[2]) < velocity_modifier.size[2]
    ):
        velocity_modifier.mask[p] = 1
    else:
        velocity_modifier.mask[p] = 0


@wp.kernel
def selection_enforce_particle_velocity_cylinder(
    state: MPMStateStruct, velocity_modifier: ParticleVelocityModifier
):
    # Set mask[p] = 1 for particles inside a cylinder defined by point, an
    # axis direction (normal) and half_height_and_radius = (half height, radius).
    p = wp.tid()
    offset = state.particle_x[p] - velocity_modifier.point
    vertical_distance = wp.abs(wp.dot(offset, velocity_modifier.normal))
    horizontal_distance = wp.length(
        offset - wp.dot(offset, velocity_modifier.normal) * velocity_modifier.normal
    )
    if (
        vertical_distance < velocity_modifier.half_height_and_radius[0]
        and horizontal_distance < velocity_modifier.half_height_and_radius[1]
    ):
        velocity_modifier.mask[p] = 1
    else:
        velocity_modifier.mask[p] = 0


================================================
FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/run_gaussian.py
================================================
import time
import numpy as np
from fire import Fire
import os
import warp as wp
from mpm_solver_warp import MPM_Simulator_WARP
from engine_utils import *
import torch
from tqdm import tqdm


def load_gaussians(input_dir: str = None):
    # Load Gaussian trajectories exported as .npy arrays and normalize the
    # positions into the MPM simulation box; covariances are scaled consistently.
    name_dict = {
        "position": "pos.npy",  # [T, N, 3]
        "rotation": "rot.npy",  # [T, N, 4]
        "cov": "cov.npy",  # [T, N, 6]
    }
    assert os.path.exists(input_dir), "Input directory does not exist"
    ret_dict = {}
    for key, value in name_dict.items():
        ret_dict[key] = np.load(os.path.join(input_dir, value))
    pos_max = ret_dict["position"].max()
    pos_min = ret_dict["position"].min()
    # ret_dict["position"] = (ret_dict["position"]) / (pos_max - pos_min) * 0.5 - pos_min / (pos_max - pos_min) * 0.5 + 0.1
    # ret_dict["cov"] = ret_dict["cov"] / (pos_max - pos_min) * 0.5
    scale = (pos_max - pos_min) * 2.0
    shift = -pos_min
    ret_dict["position"] = (ret_dict["position"] + shift) / scale
    ret_dict["cov"] = ret_dict["cov"] / scale
    # Inverse mapping for callers:
    # pos_new = (pos + shift) / scale
    # pos_orign = pos_new * scale - shift
    return ret_dict, scale, shift


def init_volume(xyz, grid=[-1, 1], num_grid=20):
    # Placeholder: per-particle volume estimation is not implemented here.
    pass


def run_mpm_gaussian(input_dir, output_dir=None, fps=6, device=0):
    # Simulate the loaded Gaussians with the warp MPM solver and render the
    # resulting trajectory to a video.
    wp.init()
    wp.config.verify_cuda = True
    device = "cuda:{}".format(device)
    gaussian_dict, scale, shift = load_gaussians(input_dir)
    velocity_scaling = 10
    # Finite-difference velocity between consecutive frames.
    # NOTE(review): this divides by fps; displacement * fps would be the
    # physical velocity — presumably absorbed into velocity_scaling. Confirm.
    velocity = (
        (gaussian_dict["position"][1:] - gaussian_dict["position"][:-1])
        / fps
        * velocity_scaling
    )
    velocity_abs = np.abs(velocity)
    print(
        "velocity mean-max-min",
        velocity_abs.mean(),
        velocity_abs.max(),
        velocity_abs.min(),
    )
    init_velocity = velocity[0]
    init_position = gaussian_dict["position"][0]
    init_rotation = gaussian_dict["rotation"][0]
    init_cov = gaussian_dict["cov"][0]
    tensor_init_pos = torch.from_numpy(init_position).float().to(device)
    tensor_init_cov = torch.from_numpy(init_cov).float().to(device)
    tensor_init_velocity = torch.from_numpy(init_velocity).float().to(device)
    # print(tensor_init_pos.max(), tensor_init_pos.min(), tensor_init_pos.shape)
    mpm_solver = MPM_Simulator_WARP(
        10
    )  # initialize with whatever number is fine. it will be reinitialized
    # TODO, Compute volume later
    volume_tensor = (
        torch.ones(
            init_velocity.shape[0],
        )
        * 2.5e-8  # m^3
    )
    mpm_solver.load_initial_data_from_torch(
        tensor_init_pos,
        volume_tensor,
        tensor_init_cov,
        tensor_init_velocity,
        device=device,
    )
    # mpm_solver.load_initial_data_from_torch(
    #     tensor_init_pos, volume_tensor, device=device
    # )
    position_tensor = mpm_solver.export_particle_x_to_torch()
    velo = wp.to_torch(mpm_solver.mpm_state.particle_v)
    cov = wp.to_torch(mpm_solver.mpm_state.particle_init_cov)
    print(
        "pos in box: ",
        position_tensor.max(),
        position_tensor.min(),
    )
    material_params = {
        "E": 0.0002,  # 0.1-200 MPa
        "nu": 0.4,  # > 0.35
        "material": "jelly",
        # "friction_angle": 25,
        "g": [0.0, 0.0, 0],
        "density": 1,  # kg / m^3
    }
    print("pre set")
    mpm_solver.set_parameters_dict(material_params)
    print("set")
    mpm_solver.finalize_mu_lam()  # set mu and lambda from the E and nu input
    print("finalize")
    # mpm_solver.add_surface_collider((0.0, 0.0, 0.13), (0.0,0.0,1.0), 'sticky', 0.0)
    if output_dir is None:
        output_dir = "./gaussian_sim_results"
    os.makedirs(output_dir, exist_ok=True)
    # save_data_at_frame(mpm_solver, output_dir, 0, save_to_ply=True, save_to_h5=False)
    pos_list = []
    pos = mpm_solver.export_particle_x_to_torch().clone()
    pos = (pos * scale) - shift  # back to the original world frame
    pos_list.append(pos.detach().clone())
    total_time = 20
    time_step = 0.002
    total_iters = int(total_time / time_step)
    for k in tqdm(range(1, total_iters)):
        mpm_solver.p2g2p(k, time_step, device=device)
        if k % 50 == 0:
            # Keep one frame every 50 substeps for the output video.
            pos = mpm_solver.export_particle_x_to_torch().clone()
            pos = (pos * scale) - shift
            pos_list.append(pos.detach().clone())
            print(k)
            print(pos.max().item(), pos.min().item(), pos.mean().item())
            # save_data_at_frame(mpm_solver, output_dir, k, save_to_ply=True, save_to_h5=False)
    # Encode the material setup into the output filename.
    save_name = ""
    for key, value in material_params.items():
        if key == "g":
            continue
        save_name += "{}_{}_".format(key, value)
    save_name += "_timestep_{}_vs{}_totaltime_{}".format(
        time_step, velocity_scaling, total_time
    )
    render_gaussians(pos_list, save_name)


def code_test(input_dir, device=0):
    # Quick sanity check: un-normalize the loaded positions and render them.
    device = "cuda:{}".format(device)
    gaussian_dict, scale, shift = load_gaussians(input_dir)
    pos = gaussian_dict["position"]
    pos = (pos * scale) - shift
    pos = torch.from_numpy(pos).float().to(device)
    render_gaussians(pos)


def render_gaussians(
    pos_list,
    save_name=None,
    dataset_dir="../../data/physics_dreamer/llff_flower_undistorted",
):
    # Render simulated particle positions as a video with the project's
    # Gaussian-splatting renderer (imports kept local to keep this optional).
    from motionrep.data.datasets.multiview_dataset import MultiviewImageDataset
    from motionrep.data.datasets.multiview_dataset import (
        camera_dataset_collate_fn as camera_dataset_collate_fn_img,
    )
    from motionrep.gaussian_3d.gaussian_renderer.render import render_gaussian
    from motionrep.gaussian_3d.scene import GaussianModel
    from typing import NamedTuple

    gaussian_path = os.path.join(dataset_dir, "point_cloud.ply")
    test_dataset = MultiviewImageDataset(
        dataset_dir,
        use_white_background=False,
        resolution=[576, 1024],
        use_index=list(range(5, 30, 4)),
    )
    print(
        "len of train dataset",
        len(test_dataset),
        "len of test dataset",
        len(test_dataset),
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=1,
        shuffle=False,
        drop_last=True,
        num_workers=0,
        collate_fn=camera_dataset_collate_fn_img,
    )

    class RenderPipe(NamedTuple):
        convert_SHs_python = False
        compute_cov3D_python = False
        debug = False

    class RenderParams(NamedTuple):
        render_pipe: RenderPipe
        bg_color: bool
        gaussians: GaussianModel
        camera_list: list

    gaussians = GaussianModel(3)
    camera_list = test_dataset.camera_list
    gaussians.load_ply(gaussian_path)
    gaussians.detach_grad()
    print(
        "load gaussians from: {}".format(gaussian_path),
        "... num gaussians: ",
        gaussians._xyz.shape[0],
    )
    bg_color = [1, 1, 1] if False else [0, 0, 0]
    background = torch.tensor(bg_color, dtype=torch.float32, device="cuda")
    render_pipe = RenderPipe()
    render_params = RenderParams(
        render_pipe=render_pipe,
        bg_color=background,
        gaussians=gaussians,
        camera_list=camera_list,
    )
    data = next(iter(test_dataloader))
    cam = data["cam"][0]
    ret_img_list = []
    # Frame 0 renders the untouched gaussians; frame i > 0 uses pos_list[i - 1].
    for i in range(len(pos_list) + 1):
        if i > 0:
            xyz = pos_list[i - 1]
            gaussians._xyz = xyz
        img = render_gaussian(
            cam,
            gaussians,
            render_params.render_pipe,
            background,
        )["render"]
        ret_img_list.append(img)
    # [T, C, H, W]
    video_array = torch.stack(ret_img_list, dim=0)
    video_numpy = video_array.detach().cpu().numpy() * 255
    video_numpy = np.clip(video_numpy, 0, 255).astype(np.uint8)
    video_numpy = np.transpose(video_numpy, [0, 2, 3, 1])
    from motionrep.utils.io_utils import save_video_imageio

    if save_name is None:
        save_path = "test.mp4"
    else:
        save_path = save_name + ".mp4"
    print("save video to ", save_path)
    save_video_imageio(save_path, video_numpy, fps=10)


if __name__ == "__main__":
    Fire(run_mpm_gaussian)
    # Fire(code_test)


================================================
FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/run_gaussian_static.py
================================================
import time
import numpy as np
from fire import Fire
import os
import warp as wp
from mpm_solver_warp import MPM_Simulator_WARP
from engine_utils import *
import torch
from tqdm import tqdm
from motionrep.gaussian_3d.scene import GaussianModel


def load_gaussians(input_dir: str = None):
    # Load a static Gaussian point cloud (.ply), normalize it into the sim
    # box, and synthesize an initial velocity field that pushes the upper
    # part of the object while keeping a static base fixed.
    if not input_dir.endswith("ply"):
        gaussian_path = os.path.join(input_dir, "point_cloud.ply")
    else:
        gaussian_path = input_dir
    gaussians = GaussianModel(3)
    gaussians.load_ply(gaussian_path)
    gaussians.detach_grad()
    pos = gaussians._xyz.detach().cpu().numpy()
    pos_max = pos.max()
    pos_min = pos.min()
    scale = (pos_max - pos_min) * 2.5
    shift = -pos_min + (pos_max - pos_min) * 0.25
    pos = (pos + shift) / scale
    cov = gaussians.get_covariance().detach().cpu().numpy()
    cov = cov / scale
    velocity = np.zeros_like(pos)
    height = pos[:, 2] - pos[:, 2].min()
    height_thres = 10
    # Particles above the 10th height percentile get a velocity; the rest
    # form the static base.
    velocity_mask = height > np.percentile(height, height_thres)
    static_points = pos[np.logical_not(velocity_mask)]
    static_points_mean = static_points.mean(axis=0)
    static_points_dist = static_points - static_points_mean
    max_static_offset = np.abs(static_points_dist).max(axis=0) * 0.8
    # boundary condition, set velocity to 0
    # x_velocity = np.sqrt(np.abs(pos[:, 0]) + 1e-8) * np.sign(pos[:, 0])
    # x_velocity = np.sqrt(height) * 0.1
    x_velocity = height**0.2 * 0.1
    velocity[velocity_mask, 0] = x_velocity[velocity_mask]
    velocity[velocity_mask, 1] = x_velocity[velocity_mask]
    ret_dict = {
        "position": pos,  # numpy [N, 3]
        "cov": cov,  # numpy [N, 6]
        "velocity": velocity,  # numpy [N, 3]
        "satic_center_point": static_points_mean,  # numpy [3]
        "max_static_offset": max_static_offset,  # numpy [3]
    }
    return ret_dict, scale, shift


def get_volume(xyzs: np.ndarray, resolution=128) -> np.ndarray:
    # Estimate per-particle volume by voxelizing [-1, 1]^3 and dividing each
    # occupied cell's volume evenly among the particles inside it.
    print("Compute Volume for each points")
    voxel_counts = np.zeros((resolution, resolution, resolution))
    points_xyzindex = ((xyzs + 1) / 2 * (resolution - 1)).astype(np.uint32)
    for x, y, z in points_xyzindex:
        voxel_counts[x, y, z] += 1
    points_number_in_corresponding_voxel = voxel_counts[
        points_xyzindex[:, 0], points_xyzindex[:, 1], points_xyzindex[:, 2]
    ]
    cell_volume = (2.0 / (resolution - 1)) ** 3
    points_volume = cell_volume / points_number_in_corresponding_voxel
    points_volume = points_volume.astype(np.float32)
    print(
        "mean volume",
        points_volume.mean(),
        "max volume",
        points_volume.max(),
        "min volume",
        points_volume.min(),
    )
    return points_volume


def run_mpm_gaussian(input_dir, output_dir=None, fps=6, device=0):
    # Static-scene variant: simulate a single Gaussian cloud with a synthetic
    # initial velocity and a pinned base, then render and dump the trajectory.
    wp.init()
    wp.config.verify_cuda = True
    device = "cuda:{}".format(device)
    gaussian_dict, scale, shift = load_gaussians(input_dir)
    velocity_scaling = 0.5
    init_velocity = velocity_scaling * gaussian_dict["velocity"]
    init_position = gaussian_dict["position"]
    init_cov = gaussian_dict["cov"]
    # Cache the volume estimate next to the input data.
    volume_array_path = os.path.join(input_dir, "volume_array.npy")
    if os.path.exists(volume_array_path):
        volume_tensor = torch.from_numpy(np.load(volume_array_path)).float().to(device)
    else:
        volume_array = get_volume(init_position)
        np.save(volume_array_path, volume_array)
        volume_tensor = torch.from_numpy(volume_array).float().to(device)
    tensor_init_pos = torch.from_numpy(init_position).float().to(device)
    tensor_init_cov = torch.from_numpy(init_cov).float().to(device)
    tensor_init_velocity = torch.from_numpy(init_velocity).float().to(device)
    print(
        "init position:",
        tensor_init_pos.max(),
        tensor_init_pos.min(),
        tensor_init_pos.shape,
    )
    velocity_abs = np.abs(init_velocity)
    print(
        "velocity mean-max-min",
        velocity_abs.mean(),
        velocity_abs.max(),
        velocity_abs.min(),
    )
    mpm_solver = MPM_Simulator_WARP(
        10
    )  # initialize with whatever number is fine. it will be reinitialized
    mpm_solver.load_initial_data_from_torch(
        tensor_init_pos,
        volume_tensor,
        tensor_init_cov,
        tensor_init_velocity,
        device=device,
    )
    # mpm_solver.load_initial_data_from_torch(
    #     tensor_init_pos, volume_tensor, device=device
    # )
    # set boundary conditions
    static_center_point = (
        torch.from_numpy(gaussian_dict["satic_center_point"]).float().to(device)
    )
    max_static_offset = (
        torch.from_numpy(gaussian_dict["max_static_offset"]).float().to(device)
    )
    velocity = torch.zeros_like(static_center_point)
    # Pin the static base: particles near static_center_point keep zero velocity.
    mpm_solver.enforce_particle_velocity_translation(
        static_center_point,
        max_static_offset,
        velocity,
        start_time=0,
        end_time=1000,
        device=device,
    )
    position_tensor = mpm_solver.export_particle_x_to_torch()
    velo = wp.to_torch(mpm_solver.mpm_state.particle_v)
    cov = wp.to_torch(mpm_solver.mpm_state.particle_init_cov)
    print(
        "pos in box: ",
        position_tensor.max(),
        position_tensor.min(),
    )
    material_params = {
        "E": 0.2,  # 0.1-200 MPa
        "nu": 0.1,  # > 0.35
        "material": "jelly",
        # "material": "metal",
        # "friction_angle": 25,
        "g": [0.0, 0.0, 0],
        "density": 0.2,  # kg / m^3
    }
    print("pre set")
    mpm_solver.set_parameters_dict(material_params)
    print("set")
    mpm_solver.finalize_mu_lam()  # set mu and lambda from the E and nu input
    print("finalize")
    # mpm_solver.add_surface_collider((0.0, 0.0, 0.13), (0.0,0.0,1.0), 'sticky', 0.0)
    if output_dir is None:
        output_dir = "../../output/gaussian_sim_results"
    os.makedirs(output_dir, exist_ok=True)
    # save_data_at_frame(mpm_solver, output_dir, 0, save_to_ply=True, save_to_h5=False)
    pos_list = []
    pos = mpm_solver.export_particle_x_to_torch().clone()
    pos = (pos * scale) - shift  # back to the original world frame
    pos_list.append(pos.detach().clone())
    total_time = 10
    time_step = 0.001
    total_iters = int(total_time / time_step)
    # Raw sim data dumped to a pickle alongside the rendered video.
    save_dict = {
        "pos_init": mpm_solver.export_particle_x_to_torch()
        .clone()
        .detach()
        .cpu()
        .numpy(),
        "velo_init": mpm_solver.export_particle_v_to_torch()
        .clone()
        .detach()
        .cpu()
        .numpy(),
        "pos_list": [],
    }
    for k in tqdm(range(1, total_iters)):
        mpm_solver.p2g2p(k, time_step, device=device)
        if k < 20:
            # Record the first few raw steps for debugging.
            pos = mpm_solver.export_particle_x_to_torch().clone().detach().cpu().numpy()
            save_dict["pos_list"].append(pos)
        if k % 100 == 0:
            pos = mpm_solver.export_particle_x_to_torch().clone()
            pos = (pos * scale) - shift
            pos_list.append(pos.detach().clone())
            print(k)
            print(pos.max().item(), pos.min().item(), pos.mean().item())
            # save_data_at_frame(mpm_solver, output_dir, k, save_to_ply=True, save_to_h5=False)
    # Encode the material setup into the output filename.
    save_name = ""
    for key, value in material_params.items():
        if key == "g":
            continue
        save_name += "{}_{}_".format(key, value)
    save_name += "_timestep_{}_vs{}_totaltime_{}".format(
        time_step, velocity_scaling, total_time
    )
    render_gaussians(pos_list, save_name)
    # save sim data:
    save_path = os.path.join(output_dir, save_name + ".pkl")
    import pickle

    with open(save_path, "wb") as f:
        pickle.dump(save_dict, f)


def code_test(input_dir, device=0):
    # Quick sanity check: un-normalize the loaded positions and render them.
    device = "cuda:{}".format(device)
    gaussian_dict, scale, shift = load_gaussians(input_dir)
    pos = gaussian_dict["position"]
    pos = (pos * scale) - shift
    pos = torch.from_numpy(pos).float().to(device)
    render_gaussians(pos)


def render_gaussians(
    pos_list,
    save_name=None,
    # dataset_dir="../../data/physics_dreamer/llff_flower_undistorted",
    dataset_dir="../../data/physics_dreamer/ficus",
):
    # Render simulated particle positions as a video with the project's
    # Gaussian-splatting renderer (imports kept local to keep this optional).
    from motionrep.data.datasets.multiview_dataset import MultiviewImageDataset
    from motionrep.data.datasets.multiview_dataset import (
        camera_dataset_collate_fn as camera_dataset_collate_fn_img,
    )
    from motionrep.gaussian_3d.gaussian_renderer.render import render_gaussian
    from motionrep.gaussian_3d.scene import GaussianModel
    from typing import NamedTuple

    gaussian_path = os.path.join(dataset_dir, "point_cloud.ply")
    test_dataset = MultiviewImageDataset(
        dataset_dir,
        use_white_background=False,
        resolution=[576, 1024],
        use_index=list(range(5, 30, 4)),
        scale_x_angle=1.5,
    )
    print(
        "len of train dataset",
        len(test_dataset),
        "len of test dataset",
        len(test_dataset),
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=1,
        shuffle=False,
        drop_last=True,
        num_workers=0,
        collate_fn=camera_dataset_collate_fn_img,
    )

    class RenderPipe(NamedTuple):
        convert_SHs_python = False
        compute_cov3D_python = False
        debug = False

    class RenderParams(NamedTuple):
        render_pipe: RenderPipe
        bg_color: bool
        gaussians: GaussianModel
        camera_list: list

    gaussians = GaussianModel(3)
    camera_list = test_dataset.camera_list
    gaussians.load_ply(gaussian_path)
    gaussians.detach_grad()
    print(
        "load gaussians from: {}".format(gaussian_path),
        "... num gaussians: ",
        gaussians._xyz.shape[0],
    )
    bg_color = [1, 1, 1] if False else [0, 0, 0]
    background = torch.tensor(bg_color, dtype=torch.float32, device="cuda")
    render_pipe = RenderPipe()
    render_params = RenderParams(
        render_pipe=render_pipe,
        bg_color=background,
        gaussians=gaussians,
        camera_list=camera_list,
    )
    data = next(iter(test_dataloader))
    cam = data["cam"][0]
    ret_img_list = []
    # Frame 0 renders the untouched gaussians; frame i > 0 uses pos_list[i - 1].
    for i in range(len(pos_list) + 1):
        if i > 0:
            xyz = pos_list[i - 1]
            gaussians._xyz = xyz
        img = render_gaussian(
            cam,
            gaussians,
            render_params.render_pipe,
            background,
        )["render"]
        ret_img_list.append(img)
    # [T, C, H, W]
    video_array = torch.stack(ret_img_list, dim=0)
    video_numpy = video_array.detach().cpu().numpy() * 255
    video_numpy = np.clip(video_numpy, 0, 255).astype(np.uint8)
    video_numpy = np.transpose(video_numpy, [0, 2, 3, 1])
    from motionrep.utils.io_utils import save_video_imageio

    if save_name is None:
        save_path = "output/test.mp4"
    else:
        save_path = os.path.join("output", save_name + ".mp4")
    print("save video to ", save_path)
    save_video_imageio(save_path, video_numpy, fps=10)


if __name__ == "__main__":
    Fire(run_mpm_gaussian)
    # Fire(code_test)


================================================
FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/run_sand.py
================================================
import warp as wp
from mpm_solver_warp import MPM_Simulator_WARP
from engine_utils import *
import torch

wp.init()
wp.config.verify_cuda = True
dvc = "cuda:0"
mpm_solver = MPM_Simulator_WARP(10)  # initialize with whatever number is fine. it will be reinitialized

# You can either load sampling data from an external h5 file, containing initial position (n,3) and particle_volume (n,)
mpm_solver.load_from_sampling("sand_column.h5", n_grid = 150, device=dvc)

# Or load from torch tensor (also position and volume)
# Here we borrow the data from h5, but you can use your own
# [N]
volume_tensor = torch.ones(mpm_solver.n_particles) * 2.5e-8
# torch.float32, [N, 3]
position_tensor = mpm_solver.export_particle_x_to_torch()
print(position_tensor.max(), position_tensor.min())

mpm_solver.load_initial_data_from_torch(position_tensor, volume_tensor)
print(position_tensor.shape, position_tensor.dtype, volume_tensor.shape, volume_tensor.dtype)

# Note: You must provide 'density=..' to set particle_mass = density * particle_volume
material_params = {
    'E': 2000,
    'nu': 0.2,
    "material": "sand",
    'friction_angle': 35,
    'g': [0.0, 0.0, -4.0],
    "density": 200.0
}
mpm_solver.set_parameters_dict(material_params)
mpm_solver.finalize_mu_lam()  # set mu and lambda from the E and nu input

mpm_solver.add_surface_collider((0.0, 0.0, 0.13), (0.0,0.0,1.0), 'sticky', 0.0)

from IPython import embed
# embed()
directory_to_save = './sim_results'

save_data_at_frame(mpm_solver, directory_to_save, 0, save_to_ply=True, save_to_h5=False)

for k in range(1,50):
    mpm_solver.p2g2p(k, 0.002, device=dvc)
    save_data_at_frame(mpm_solver, directory_to_save, k, save_to_ply=True, save_to_h5=False)

# extract the position, make some changes, load it back
position = mpm_solver.export_particle_x_to_torch()
# e.g. we shift the x position
position[:,0] = position[:,0] + 0.1
mpm_solver.import_particle_x_from_torch(position)

# keep running sim
for k in range(50,100):
    mpm_solver.p2g2p(k, 0.002, device=dvc)
    save_data_at_frame(mpm_solver, directory_to_save, k, save_to_ply=True, save_to_h5=False)


================================================
FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/sim_grad.py
================================================
import warp as wp
import numpy as np
import torch
import os
from mpm_solver_warp_diff import MPM_Simulator_WARPDiff
from run_gaussian_static import load_gaussians, get_volume
from tqdm import tqdm
from fire import Fire
from mpm_utils import *
from typing import Optional
import warp as wp


class MyTape(wp.Tape):
    # wp.Tape subclass that also produces adjoints for struct kernel
    # arguments by assembling a struct of per-field gradient arrays.

    # returns the adjoint of a kernel parameter
    def get_adjoint(self, a):
        if not wp.types.is_array(a) and not isinstance(a, wp.codegen.StructInstance):
            # if input is a simple type (e.g.: float, vec3, etc) then
            # no gradient needed (we only return gradients through arrays and structs)
            print("input is a simple type", type(a))
            return a
        elif wp.types.is_array(a) and a.grad:
            # keep track of all gradients used by the tape (for zeroing)
            # ignore the scalar loss since we don't want to clear its grad
            self.gradients[a] = a.grad
            return a.grad
        elif isinstance(a, wp.codegen.StructInstance):
            # Build an adjoint struct: gradient arrays for array fields,
            # pass-through values for everything else.
            adj = a._cls()
            for name, _ in a._cls.ctype._fields_:
                if name.startswith("_"):
                    continue
                if isinstance(a._cls.vars[name].type, wp.array):
                    arr = getattr(a, name)
                    if arr is None:
                        continue
                    if arr.grad:
                        grad = self.gradients[arr] = arr.grad
                    else:
                        grad = wp.zeros_like(arr)
                    setattr(adj, name, grad)
                else:
                    setattr(adj, name, getattr(a, name))
            self.gradients[a] = adj
            return adj
        return None


def test(input_dir, output_dir=None, fps=6, device=0):
    # Gradient-flow smoke test for the differentiable MPM solver.
    wp.init()
    wp.config.verify_cuda = True
    device = "cuda:{}".format(device)
    gaussian_dict, scale, shift = load_gaussians(input_dir)
    velocity_scaling = 0.5
    init_velocity = velocity_scaling * gaussian_dict["velocity"]
init_position = gaussian_dict["position"] init_cov = gaussian_dict["cov"] volume_array_path = os.path.join(input_dir, "volume_array.npy") if os.path.exists(volume_array_path): volume_tensor = torch.from_numpy(np.load(volume_array_path)).float().to(device) else: volume_array = get_volume(init_position) np.save(volume_array_path, volume_array) volume_tensor = torch.from_numpy(volume_array).float().to(device) tensor_init_pos = torch.from_numpy(init_position).float().to(device) tensor_init_cov = torch.from_numpy(init_cov).float().to(device) tensor_init_velocity = torch.from_numpy(init_velocity).float().to(device) mpm_solver = MPM_Simulator_WARPDiff(10) # initialize with whatever number is fine. it will be reintialized tensor_init_pos.requires_grad = True tensor_init_cov.requires_grad = False tensor_init_velocity.requires_grad = True mpm_solver.load_initial_data_from_torch( tensor_init_pos, volume_tensor, tensor_init_cov, tensor_init_velocity, device=device, ) mpm_solver.mpm_state.particle_x = wp.from_numpy(init_position, dtype=wp.vec3, requires_grad=True, device=device) mpm_solver.mpm_state.particle_v = wp.from_numpy(init_velocity, dtype=wp.vec3, requires_grad=True, device=device) # set boundary conditions static_center_point = ( torch.from_numpy(gaussian_dict["satic_center_point"]).float().to(device) ) max_static_offset = ( torch.from_numpy(gaussian_dict["max_static_offset"]).float().to(device) ) velocity = torch.zeros_like(static_center_point) # mpm_solver.enforce_particle_velocity_translation(static_center_point, max_static_offset, velocity, # start_time=0, end_time=1000, device=device) material_params = { "E": 2.0, # 0.1-200 MPa "nu": 0.1, # > 0.35 "material": "jelly", # "material": "metal", # "friction_angle": 25, "g": [0.0, 0.0, 0], "density": 0.02, # kg / m^3 } print("pre set") mpm_solver.set_parameters_dict(material_params) print("set") mpm_solver.finalize_mu_lam() # set mu and lambda from the E and nu input print("finalize") total_time = 0.1 time_step = 0.01 
total_iters = int(total_time / time_step) total_iters = 3 loss = torch.zeros(1, device=device) loss = wp.from_torch(loss, requires_grad=True) E_tensor = (torch.ones(velocity.shape[0]) * 2.0).contiguous().to(device) nu_tensor = (torch.ones(velocity.shape[0]) * 0.1).contiguous().to(device) E_warp = wp.from_torch(E_tensor, requires_grad=True) nu_warp = wp.from_torch(nu_tensor, requires_grad=True) mpm_solver.set_require_grad() dt = time_step # from IPython import embed; embed() with tape: # mpm_solver.reset_material(E_warp, nu_warp, device=device) # for k in tqdm(range(1, total_iters)): # mpm_solver.p2g2p(k, time_step, device=device) wp.launch( kernel=g2p_test, dim=mpm_solver.n_particles, inputs=[mpm_solver.mpm_state, mpm_solver.mpm_model, dt], device=device, ) # x, v, C, F_trial are updated # wp.launch(position_loss_kernel, dim=mpm_solver.n_particles, inputs=[mpm_solver.mpm_state, loss], device=device) for i in range(2): # wp.launch(position_loss_kernel, dim=mpm_solver.n_particles, inputs=[mpm_solver.mpm_state, loss], device=device) wp.launch(position_loss_kernel, dim=mpm_solver.n_particles, inputs=[mpm_state, loss], device=device) # wp.launch(position_loss_kernel_raw, dim=mpm_solver.n_particles, inputs=[mpm_state.particle_x, loss], device=device) tape.backward(loss) # 75120.86 print(loss) # model_grad = tape.gradients[mpm_solver.mpm_model] # state_grad = tape.gradients[mpm_solver.mpm_state] # v_grad = state_grad.particle_v # x_grad = state_grad.particle_x v_grad = mpm_solver.mpm_state.particle_v.grad x_grad = mpm_solver.mpm_state.particle_x.grad # E_grad = wp.to_torch(tape.gradients[E_warp]) print(x_grad) from IPython import embed; embed() @wp.kernel def g2p_test(state: MPMStateStruct, model: MPMModelStruct, dt: float): p = wp.tid() if state.particle_selection[p] == 0: grid_pos = state.particle_x[p] * model.inv_dx base_pos_x = wp.int(grid_pos[0] - 0.5) base_pos_y = wp.int(grid_pos[1] - 0.5) base_pos_z = wp.int(grid_pos[2] - 0.5) fx = grid_pos - wp.vec3( 
wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z) ) wa = wp.vec3(1.5) - fx wb = fx - wp.vec3(1.0) wc = fx - wp.vec3(0.5) w = wp.mat33( wp.cw_mul(wa, wa) * 0.5, wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75), wp.cw_mul(wc, wc) * 0.5, ) dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5)) new_v = wp.vec3(0.0, 0.0, 0.0) new_C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) new_F = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) for i in range(0, 3): for j in range(0, 3): for k in range(0, 3): ix = base_pos_x + i iy = base_pos_y + j iz = base_pos_z + k dpos = wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx weight = w[0, i] * w[1, j] * w[2, k] # tricubic interpolation grid_v = state.grid_v_out[ix, iy, iz] new_v = new_v + grid_v * weight new_C = new_C + wp.outer(grid_v, dpos) * ( weight * model.inv_dx * 4.0 ) dweight = compute_dweight(model, w, dw, i, j, k) new_F = new_F + wp.outer(grid_v, dweight) state.particle_v[p] = new_v # state.particle_x[p] = state.particle_x[p] + dt * new_v # state.particle_x[p] = state.particle_x[p] + dt * state.particle_v[p] wp.atomic_add(state.particle_x, p, dt * state.particle_v[p]) state.particle_C[p] = new_C I33 = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) F_tmp = (I33 + new_F * dt) * state.particle_F[p] state.particle_F_trial[p] = F_tmp if model.update_cov_with_F: update_cov(state, p, new_F, dt) @wp.kernel def position_loss_kernel(mpm_state: MPMStateStruct, loss: wp.array(dtype=float)): tid = wp.tid() pos = mpm_state.particle_x[tid] wp.atomic_add(loss, 0, pos[0] + pos[1] + pos[2]) # wp.atomic_add(loss, 0, mpm_state.particle_x[tid][0]) @wp.kernel def position_loss_kernel_raw(particle_x: wp.array(dtype=wp.vec3), loss: wp.array(dtype=float)): tid = wp.tid() pos = particle_x[tid] wp.atomic_add(loss, 0, pos[0] + pos[1] + pos[2]) # wp.atomic_add(loss, 0, mpm_state.particle_x[tid][0]) if __name__ == "__main__": Fire(test) 
================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/solver_grad_test.py ================================================ import warp as wp import numpy as np import torch import os from mpm_solver_warp_diff import MPM_Simulator_WARPDiff from run_gaussian_static import load_gaussians, get_volume from tqdm import tqdm from fire import Fire from diff_warp_utils import MPMStateStruct, MPMModelStruct from warp_rewrite import MyTape from mpm_utils import * import random def test(input_dir, output_dir=None, fps=6, device=0): seed = 42 random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) wp.init() wp.config.verify_cuda = True device = "cuda:{}".format(device) gaussian_dict, scale, shift = load_gaussians(input_dir) velocity_scaling = 0.5 init_velocity = velocity_scaling * gaussian_dict["velocity"] init_position = gaussian_dict["position"] init_cov = gaussian_dict["cov"] volume_array_path = os.path.join(input_dir, "volume_array.npy") if os.path.exists(volume_array_path): volume_array = np.load(volume_array_path) volume_tensor = torch.from_numpy(volume_array).float().to(device) else: volume_array = get_volume(init_position) np.save(volume_array_path, volume_array) volume_tensor = torch.from_numpy(volume_array).float().to(device) tensor_init_pos = torch.from_numpy(init_position).float().to(device) tensor_init_cov = torch.from_numpy(init_cov).float().to(device) tensor_init_velocity = torch.from_numpy(init_velocity).float().to(device) material_params = { "E": 2.0, # 0.1-200 MPa "nu": 0.1, # > 0.35 "material": "jelly", # "material": "metal", # "friction_angle": 25, "g": [0.0, 0.0, 0], "density": 0.02, # kg / m^3 } n_particles = tensor_init_pos.shape[0] mpm_state = MPMStateStruct() mpm_state.init(init_position.shape[0], device=device, requires_grad=True) mpm_state.from_torch( tensor_init_pos, volume_tensor, tensor_init_cov, tensor_init_velocity, device=device, requires_grad=True, n_grid=100, grid_lim=1.0, ) 
mpm_model = MPMModelStruct() mpm_model.init(n_particles, device=device, requires_grad=True) mpm_model.init_other_params(n_grid=100, grid_lim=1.0, device=device) E_tensor = (torch.ones(n_particles) * material_params["E"]).contiguous().to(device) nu_tensor = ( (torch.ones(n_particles) * material_params["nu"]).contiguous().to(device) ) mpm_model.from_torch(E_tensor, nu_tensor, device=device, requires_grad=True) mpm_solver = MPM_Simulator_WARPDiff( n_particles, n_grid=100, grid_lim=1.0, device=device ) mpm_solver.set_parameters_dict(mpm_model, mpm_state, material_params) # set boundary conditions static_center_point = ( torch.from_numpy(gaussian_dict["satic_center_point"]).float().to(device) ) max_static_offset = ( torch.from_numpy(gaussian_dict["max_static_offset"]).float().to(device) ) velocity = torch.zeros_like(static_center_point) mpm_solver.enforce_particle_velocity_translation( mpm_state, static_center_point, max_static_offset, velocity, start_time=0, end_time=1000, device=device, ) mpm_state.set_require_grad(True) total_time = 0.1 time_step = 0.001 total_iters = int(total_time / time_step) total_iters = 3 loss = torch.zeros(1, device=device) loss = wp.from_torch(loss, requires_grad=True) dt = time_step tape = MyTape() # wp.Tape() with tape: # for k in tqdm(range(1, total_iters)): k = 1 # mpm_solver.p2g2p(k, time_step, device=device) for k in range(10): mpm_solver.p2g2p(mpm_model, mpm_state, k, time_step, device=device) wp.launch( position_loss_kernel, dim=n_particles, inputs=[mpm_state, loss], device=device, ) print(loss, "pre backward") tape.backward(loss) # 75120.86 print(loss) v_grad = mpm_state.particle_v.grad x_grad = mpm_state.particle_x.grad e_grad = mpm_model.E.grad e_grad_torch = wp.to_torch(e_grad) grid_v_grad = mpm_state.grid_v_out.grad grid_v_in_grad = mpm_state.grid_v_in.grad print(x_grad) from IPython import embed embed() @wp.kernel def position_loss_kernel(mpm_state: MPMStateStruct, loss: wp.array(dtype=float)): tid = wp.tid() pos = 
mpm_state.particle_x[tid] wp.atomic_add(loss, 0, pos[0] + pos[1] + pos[2]) # wp.atomic_add(loss, 0, mpm_state.particle_x[tid][0]) if __name__ == "__main__": Fire(test) ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/test_inverse_sim.py ================================================ import warp as wp import numpy as np import torch import os from mpm_solver_warp_diff import MPM_Simulator_WARPDiff from run_gaussian_static import load_gaussians, get_volume from tqdm import tqdm from fire import Fire from diff_warp_utils import MPMStateStruct, MPMModelStruct from warp_rewrite import MyTape from mpm_utils import * import random import pickle def test( input_dir, pickle_path="output/E_0.2_nu_0.1_material_jelly_density_0.2__timestep_0.001_vs0.5_totaltime_10.pkl", output_dir=None, fps=6, device=0, ): seed = 42 random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) wp.init() wp.config.verify_cuda = True device = "cuda:{}".format(device) gaussian_dict, scale, shift = load_gaussians(input_dir) velocity_scaling = 0.5 init_velocity = velocity_scaling * gaussian_dict["velocity"] init_position = gaussian_dict["position"] init_cov = gaussian_dict["cov"] volume_array_path = os.path.join(input_dir, "volume_array.npy") if os.path.exists(volume_array_path): volume_array = np.load(volume_array_path) volume_tensor = torch.from_numpy(volume_array).float().to(device) else: volume_array = get_volume(init_position) np.save(volume_array_path, volume_array) volume_tensor = torch.from_numpy(volume_array).float().to(device) tensor_init_pos = torch.from_numpy(init_position).float().to(device) tensor_init_cov = torch.from_numpy(init_cov).float().to(device) tensor_init_velocity = torch.from_numpy(init_velocity).float().to(device) material_params = { "E": 2.5, # 0.1-200 MPa "nu": 0.2, # > 0.35 "material": "jelly", # "material": "metal", # "friction_angle": 25, "g": [0.0, 0.0, 0], "density": 0.2, # kg / m^3 } n_particles = 
tensor_init_pos.shape[0] mpm_state = MPMStateStruct() mpm_state.init(init_position.shape[0], device=device, requires_grad=True) mpm_state.from_torch( tensor_init_pos.clone(), volume_tensor, tensor_init_cov, tensor_init_velocity.clone(), device=device, requires_grad=True, n_grid=100, grid_lim=1.0, ) mpm_model = MPMModelStruct() mpm_model.init(n_particles, device=device, requires_grad=True) mpm_model.init_other_params(n_grid=100, grid_lim=1.0, device=device) E_tensor = (torch.ones(n_particles) * material_params["E"]).contiguous().to(device) nu_tensor = ( (torch.ones(n_particles) * material_params["nu"]).contiguous().to(device) ) mpm_model.from_torch(E_tensor, nu_tensor, device=device, requires_grad=True) mpm_solver = MPM_Simulator_WARPDiff( n_particles, n_grid=100, grid_lim=1.0, device=device ) mpm_solver.set_parameters_dict(mpm_model, mpm_state, material_params) # set boundary conditions static_center_point = ( torch.from_numpy(gaussian_dict["satic_center_point"]).float().to(device) ) max_static_offset = ( torch.from_numpy(gaussian_dict["max_static_offset"]).float().to(device) ) velocity = torch.zeros_like(static_center_point) mpm_solver.enforce_particle_velocity_translation( mpm_state, static_center_point, max_static_offset, velocity, start_time=0, end_time=1000, device=device, ) mpm_state.set_require_grad(True) total_time = 0.02 time_step = 0.001 total_iters = int(total_time / time_step) total_iters = 10 dt = time_step with open(pickle_path, "rb") as f: gt_dict = pickle.load(f) sim_sub_step = 10 gt_pos_numpy_list = gt_dict["pos_list"] pos_gt_1 = gt_pos_numpy_list[sim_sub_step - 1] pos_gt_1_warp = wp.from_numpy( pos_gt_1, dtype=wp.vec3, device=device, requires_grad=True ) E_cur = material_params["E"] nu_cur = material_params["nu"] init_lr = 3e-6 total_train_steps = 2000 for train_step in range(total_train_steps): learning_rate = ( (total_train_steps - train_step + 1) / total_train_steps * init_lr ) tape = MyTape() # wp.Tape() with tape: # for k in tqdm(range(1, 
total_iters)): k = 0 mpm_solver.time = 0.0 mpm_solver.set_E_nu(mpm_model, E_cur, nu_cur, device=device) for k in range(sim_sub_step): mpm_solver.p2g2p(mpm_model, mpm_state, k, time_step, device=device) loss = torch.zeros(1, device=device) loss = wp.from_torch(loss, requires_grad=True) wp.launch( position_loss_kernel, dim=n_particles, inputs=[mpm_state, pos_gt_1_warp, loss], device=device, ) tape.backward(loss) # 75120.86 E_grad = wp.from_torch(torch.zeros(1, device=device), requires_grad=False) nu_grad = wp.from_torch(torch.zeros(1, device=device), requires_grad=False) wp.launch( aggregate_grad, dim=n_particles, inputs=[ E_grad, mpm_model.E.grad, ], device=device, ) wp.launch( aggregate_grad, dim=n_particles, inputs=[nu_grad, mpm_model.nu.grad], device=device, ) E_cur = E_cur - wp.to_torch(E_grad).item() * learning_rate nu_cur = nu_cur - wp.to_torch(nu_grad).item() * learning_rate # clip: E_cur = max(1e-5, min(E_cur, 200)) nu_cur = max(1e-2, min(nu_cur, 0.449)) tape.zero() print( loss, "pre backward", E_cur, nu_cur, E_grad, nu_grad, ) mpm_state.reset_state( tensor_init_pos.clone(), tensor_init_cov, tensor_init_velocity.clone(), device=device, requires_grad=True, ) # might need to set mpm_model.yield_stress from IPython import embed # embed() @wp.kernel def position_loss_kernel( mpm_state: MPMStateStruct, gt_pos: wp.array(dtype=wp.vec3), loss: wp.array(dtype=float), ): tid = wp.tid() pos = mpm_state.particle_x[tid] pos_gt = gt_pos[tid] # l1_diff = wp.abs(pos - pos_gt) l2 = wp.length(pos - pos_gt) wp.atomic_add(loss, 0, l2) @wp.kernel def step_kernel(x: wp.array(dtype=float), grad: wp.array(dtype=float), alpha: float): tid = wp.tid() # gradient descent step x[tid] = x[tid] - grad[tid] * alpha @wp.kernel def aggregate_grad(x: wp.array(dtype=float), grad: wp.array(dtype=float)): tid = wp.tid() # gradient descent step wp.atomic_add(x, 0, grad[tid]) if __name__ == "__main__": Fire(test) ================================================ FILE: 
projects/uncleaned_train/thirdparty_code/warp_mpm/backup/test_sim.py ================================================ import warp as wp import numpy as np import torch @wp.struct class MPMStateStruct: ###### essential ##### # particle particle_x: wp.array(dtype=wp.vec3) # current position particle_v: wp.array(dtype=wp.vec3) # particle velocity particle_vol: wp.array(dtype=float) # current volume particle_F: wp.array(dtype=wp.mat33) # particle elastic deformation gradient grid_v_out: wp.array( dtype=wp.vec3, ndim=3 ) # grid node momentum/velocity, after grid update class MPM_Simulator_WARPDiff: def __init__(self, x, v, vol, device): self.mpm_state = MPMStateStruct() self.mpm_state.particle_x = wp.array(x, dtype=wp.vec3, requires_grad=True, device=device) self.mpm_state.particle_v = wp.array(v, dtype=wp.vec3, requires_grad=True, device=device) self.mpm_state.particle_vol = wp.array(vol, dtype=float, requires_grad=False, device=device) self.mpm_state.particle_F = wp.array(np.zeros((100, 3, 3), dtype=np.float32), dtype=wp.mat33, requires_grad=True, device=device) self.mpm_state.grid_v_out = wp.array(np.zeros((100, 100, 100, 3), dtype=np.float32), dtype=wp.vec3, requires_grad=True, device=device) @wp.kernel def vec3_add(mpm_state: MPMStateStruct, selection: wp.array(dtype=wp.float32), dt: float): tid = wp.tid() # new_v = wp.vec3(1.0, 1.0, 1.0) # velocity[tid] = new_v velocity = mpm_state.particle_v if selection[tid] == 0: # no problem. static condition/loop no problem! for i in range(2): for j in range(2): # x[tid] = x[tid] + velocity[tid] * dt # x[tid] = wp.add(x[tid], velocity[tid]) # same as above. 
wrong gradient wp.atomic_add(mpm_state.particle_x, tid, velocity[tid] * mpm_state.particle_vol[tid]) # x[tid] += velocity[tid] * dt # error, no support of += @wp.kernel def loss_kernel(mpm_state: MPMStateStruct, loss: wp.array(dtype=float)): tid = wp.tid() pos = mpm_state.particle_x[tid] wp.atomic_add(loss, 0, pos[0]) def main(): wp.init() wp.config.verify_cuda = True device = 0 device = "cuda:{}".format(device) x = np.random.rand(100, 3).astype(np.float32) velocity = np.random.rand(100, 3).astype(np.float32) dt = 0.1 # mpm_state = MPMStateStruct() # mpm_state.particle_x = wp.array(x, device=device, dtype=wp.vec3, requires_grad=True) # mpm_state.particle_v = wp.array(velocity, device=device, dtype=wp.vec3, requires_grad=True) # mpm_state.particle_vol = wp.full(shape=100, value=1, device=device, dtype=wp.float32, requires_grad=False) mpm_solver = MPM_Simulator_WARPDiff(x, velocity, np.ones(100, dtype=np.float32), device=device) selection = wp.zeros(100, device=device, dtype=wp.float32) loss = torch.zeros(1, device=device) loss = wp.from_torch(loss, requires_grad=True) tape = wp.Tape() with tape: for j in range(2): wp.launch(vec3_add, dim=100, inputs=[mpm_solver.mpm_state, selection, dt], device=device) wp.launch(loss_kernel, dim=100, inputs=[mpm_solver.mpm_state, loss], device=device) tape.backward(loss) v_grad = mpm_solver.mpm_state.particle_v.grad x_grad = mpm_solver.mpm_state.particle_x.grad print(v_grad, loss) from IPython import embed; embed() if __name__ == "__main__": main() ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/warp_rewrite.py ================================================ import warp as wp import ctypes from warp.torch import ( dtype_from_torch, device_from_torch, dtype_is_compatible, from_torch, ) def from_torch_safe(t, dtype=None, requires_grad=None, grad=None): """Wrap a PyTorch tensor to a Warp array without copying the data. Args: t (torch.Tensor): The torch tensor to wrap. 
dtype (warp.dtype, optional): The target data type of the resulting Warp array. Defaults to the tensor value type mapped to a Warp array value type. requires_grad (bool, optional): Whether the resulting array should wrap the tensor's gradient, if it exists (the grad tensor will be allocated otherwise). Defaults to the tensor's `requires_grad` value. Returns: warp.array: The wrapped array. """ if dtype is None: dtype = dtype_from_torch(t.dtype) elif not dtype_is_compatible(t.dtype, dtype): raise RuntimeError(f"Incompatible data types: {t.dtype} and {dtype}") # get size of underlying data type to compute strides ctype_size = ctypes.sizeof(dtype._type_) shape = tuple(t.shape) strides = tuple(s * ctype_size for s in t.stride()) # if target is a vector or matrix type # then check if trailing dimensions match # the target type and update the shape if hasattr(dtype, "_shape_"): dtype_shape = dtype._shape_ dtype_dims = len(dtype._shape_) if dtype_dims > len(shape) or dtype_shape != shape[-dtype_dims:]: raise RuntimeError( f"Could not convert Torch tensor with shape {shape} to Warp array with dtype={dtype}, ensure that source inner shape is {dtype_shape}" ) # ensure the inner strides are contiguous stride = ctype_size for i in range(dtype_dims): if strides[-i - 1] != stride: raise RuntimeError( f"Could not convert Torch tensor with shape {shape} to Warp array with dtype={dtype}, because the source inner strides are not contiguous" ) stride *= dtype_shape[-i - 1] shape = tuple(shape[:-dtype_dims]) or (1,) strides = tuple(strides[:-dtype_dims]) or (ctype_size,) requires_grad = t.requires_grad if requires_grad is None else requires_grad if grad is not None: if not isinstance(grad, wp.array): import torch if isinstance(grad, torch.Tensor): grad = from_torch(grad, dtype=dtype) else: raise ValueError(f"Invalid gradient type: {type(grad)}") elif requires_grad: # wrap the tensor gradient, allocate if necessary if t.grad is None: # allocate a zero-filled gradient tensor if it 
doesn't exist import torch t.grad = torch.zeros_like(t, requires_grad=False) grad = from_torch(t.grad, dtype=dtype) a = wp.types.array( ptr=t.data_ptr(), dtype=dtype, shape=shape, strides=strides, device=device_from_torch(t.device), copy=False, owner=False, grad=grad, requires_grad=requires_grad, ) # save a reference to the source tensor, otherwise it will be deallocated a._tensor = t return a class MyTape(wp.Tape): # returns the adjoint of a kernel parameter def get_adjoint(self, a): if not wp.types.is_array(a) and not isinstance(a, wp.codegen.StructInstance): # if input is a simple type (e.g.: float, vec3, etc) then # no gradient needed (we only return gradients through arrays and structs) return a elif wp.types.is_array(a) and a.grad: # keep track of all gradients used by the tape (for zeroing) # ignore the scalar loss since we don't want to clear its grad self.gradients[a] = a.grad return a.grad elif isinstance(a, wp.codegen.StructInstance): adj = a._cls() for name, _ in a._cls.ctype._fields_: if name.startswith("_"): continue if isinstance(a._cls.vars[name].type, wp.array): arr = getattr(a, name) if arr is None: continue if arr.grad: grad = self.gradients[arr] = arr.grad else: grad = wp.zeros_like(arr) setattr(adj, name, grad) else: setattr(adj, name, getattr(a, name)) self.gradients[a] = adj return adj return None ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup/warp_utils.py ================================================ import warp as wp import warp.torch import torch @wp.struct class MPMModelStruct: ####### essential ####### grid_lim: float n_particles: int n_grid: int dx: float inv_dx: float grid_dim_x: int grid_dim_y: int grid_dim_z: int mu: wp.array(dtype=float) lam: wp.array(dtype=float) E: wp.array(dtype=float) nu: wp.array(dtype=float) material: int ######## for plasticity #### yield_stress: wp.array(dtype=float) friction_angle: float alpha: float gravitational_accelaration: wp.vec3 
hardening: float xi: float plastic_viscosity: float softening: float ####### for damping rpic_damping: float grid_v_damping_scale: float ####### for PhysGaussian: covariance update_cov_with_F: int @wp.struct class MPMStateStruct: ###### essential ##### # particle particle_x: wp.array(dtype=wp.vec3) # current position particle_v: wp.array(dtype=wp.vec3) # particle velocity particle_F: wp.array(dtype=wp.mat33) # particle elastic deformation gradient particle_init_cov: wp.array(dtype=float) # initial covariance matrix particle_cov: wp.array(dtype=float) # current covariance matrix particle_F_trial: wp.array( dtype=wp.mat33 ) # apply return mapping on this to obtain elastic def grad particle_R: wp.array(dtype=wp.mat33) # rotation matrix particle_stress: wp.array(dtype=wp.mat33) # Kirchoff stress, elastic stress particle_C: wp.array(dtype=wp.mat33) particle_vol: wp.array(dtype=float) # current volume particle_mass: wp.array(dtype=float) # mass particle_density: wp.array(dtype=float) # density particle_Jp: wp.array(dtype=float) particle_selection: wp.array(dtype=int) # only particle_selection[p] = 0 will be simulated # grid grid_m: wp.array(dtype=float, ndim=3) grid_v_in: wp.array(dtype=wp.vec3, ndim=3) # grid node momentum/velocity grid_v_out: wp.array( dtype=wp.vec3, ndim=3 ) # grid node momentum/velocity, after grid update # for various boundary conditions @wp.struct class Dirichlet_collider: point: wp.vec3 normal: wp.vec3 direction: wp.vec3 start_time: float end_time: float friction: float surface_type: int velocity: wp.vec3 threshold: float reset: int index: int x_unit: wp.vec3 y_unit: wp.vec3 radius: float v_scale: float width: float height: float length: float R: float size: wp.vec3 horizontal_axis_1: wp.vec3 horizontal_axis_2: wp.vec3 half_height_and_radius: wp.vec2 @wp.struct class Impulse_modifier: # this needs to be changed for each different BC! 
point: wp.vec3 normal: wp.vec3 start_time: float end_time: float force: wp.vec3 forceTimesDt: wp.vec3 numsteps: int point: wp.vec3 size: wp.vec3 mask: wp.array(dtype=int) @wp.struct class MPMtailoredStruct: # this needs to be changed for each different BC! point: wp.vec3 normal: wp.vec3 start_time: float end_time: float friction: float surface_type: int velocity: wp.vec3 threshold: float reset: int point_rotate: wp.vec3 normal_rotate: wp.vec3 x_unit: wp.vec3 y_unit: wp.vec3 radius: float v_scale: float width: float point_plane: wp.vec3 normal_plane: wp.vec3 velocity_plane: wp.vec3 threshold_plane: float @wp.struct class MaterialParamsModifier: point: wp.vec3 size: wp.vec3 E: float nu: float density: float @wp.struct class ParticleVelocityModifier: point: wp.vec3 normal: wp.vec3 half_height_and_radius: wp.vec2 rotation_scale: float translation_scale: float size: wp.vec3 horizontal_axis_1: wp.vec3 horizontal_axis_2: wp.vec3 start_time: float end_time: float velocity: wp.vec3 mask: wp.array(dtype=int) @wp.kernel def set_vec3_to_zero(target_array: wp.array(dtype=wp.vec3)): tid = wp.tid() target_array[tid] = wp.vec3(0.0, 0.0, 0.0) @wp.kernel def set_mat33_to_identity(target_array: wp.array(dtype=wp.mat33)): tid = wp.tid() target_array[tid] = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) @wp.kernel def add_identity_to_mat33(target_array: wp.array(dtype=wp.mat33)): tid = wp.tid() target_array[tid] = wp.add( target_array[tid], wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) ) @wp.kernel def subtract_identity_to_mat33(target_array: wp.array(dtype=wp.mat33)): tid = wp.tid() target_array[tid] = wp.sub( target_array[tid], wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) ) @wp.kernel def add_vec3_to_vec3( first_array: wp.array(dtype=wp.vec3), second_array: wp.array(dtype=wp.vec3) ): tid = wp.tid() first_array[tid] = wp.add(first_array[tid], second_array[tid]) @wp.kernel def set_value_to_float_array(target_array: wp.array(dtype=float), value: float): tid = 
wp.tid() target_array[tid] = value @wp.kernel def set_warpvalue_to_float_array(target_array: wp.array(dtype=float), value: warp.types.float32): tid = wp.tid() target_array[tid] = value @wp.kernel def get_float_array_product( arrayA: wp.array(dtype=float), arrayB: wp.array(dtype=float), arrayC: wp.array(dtype=float), ): tid = wp.tid() arrayC[tid] = arrayA[tid] * arrayB[tid] def torch2warp_quat(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"): assert t.is_contiguous() if t.dtype != torch.float32 and t.dtype != torch.int32: raise RuntimeError( "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type" ) assert t.shape[1] == 4 a = warp.types.array( ptr=t.data_ptr(), dtype=wp.quat, shape=t.shape[0], copy=False, owner=False, requires_grad=t.requires_grad, # device=t.device.type) device=dvc, ) a.tensor = t return a def torch2warp_float(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"): assert t.is_contiguous() if t.dtype != torch.float32 and t.dtype != torch.int32: raise RuntimeError( "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type" ) a = warp.types.array( ptr=t.data_ptr(), dtype=warp.types.float32, shape=t.shape[0], copy=False, owner=False, requires_grad=t.requires_grad, # device=t.device.type) device=dvc, ) a.tensor = t return a def torch2warp_vec3(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"): assert t.is_contiguous() if t.dtype != torch.float32 and t.dtype != torch.int32: raise RuntimeError( "Error aliasing Torch tensor to Warp array. 
Torch tensor must be float32 or int32 type" ) assert t.shape[1] == 3 a = warp.types.array( ptr=t.data_ptr(), dtype=wp.vec3, shape=t.shape[0], copy=False, owner=False, requires_grad=t.requires_grad, # device=t.device.type) device=dvc, ) a.tensor = t return a def torch2warp_mat33(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"): assert t.is_contiguous() if t.dtype != torch.float32 and t.dtype != torch.int32: raise RuntimeError( "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type" ) assert t.shape[1] == 3 a = warp.types.array( ptr=t.data_ptr(), dtype=wp.mat33, shape=t.shape[0], copy=False, owner=False, requires_grad=t.requires_grad, # device=t.device.type) device=dvc, ) a.tensor = t return a ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup_jan10/gaussian_sim_utils.py ================================================ import numpy as np def get_volume(xyzs: np.ndarray, resolution=128) -> np.ndarray: # set a grid in the range of [-1, 1], with resolution voxel_counts = np.zeros((resolution, resolution, resolution)) points_xyzindex = ((xyzs + 1) / 2 * (resolution - 1)).astype(np.uint32) cell_volume = (2.0 / (resolution - 1)) ** 3 for x, y, z in points_xyzindex: voxel_counts[x, y, z] += 1 points_number_in_corresponding_voxel = voxel_counts[ points_xyzindex[:, 0], points_xyzindex[:, 1], points_xyzindex[:, 2] ] points_volume = cell_volume / points_number_in_corresponding_voxel points_volume = points_volume.astype(np.float32) return points_volume ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup_jan10/mpm_data_structure.py ================================================ import warp as wp import warp.torch import torch from typing import Optional, Union, Sequence, Any from torch import Tensor import os import sys sys.path.append(os.path.dirname(os.path.realpath(__file__))) from warp_utils import from_torch_safe 
# State container for the differentiable MPM solver: per-particle quantities
# (position, velocity, deformation gradients, stress, covariance,
# mass/volume/density) plus the background Eulerian grid fields.
@wp.struct
class MPMStateStruct(object):
    ###### essential #####
    # particle
    particle_x: wp.array(dtype=wp.vec3)  # current position
    particle_v: wp.array(dtype=wp.vec3)  # particle velocity
    particle_F: wp.array(dtype=wp.mat33)  # particle elastic deformation gradient
    particle_init_cov: wp.array(dtype=float)  # initial covariance matrix
    particle_cov: wp.array(dtype=float)  # current covariance matrix
    particle_F_trial: wp.array(
        dtype=wp.mat33
    )  # apply return mapping on this to obtain elastic def grad
    particle_R: wp.array(dtype=wp.mat33)  # rotation matrix
    particle_stress: wp.array(dtype=wp.mat33)  # Kirchoff stress, elastic stress
    particle_C: wp.array(dtype=wp.mat33)  # presumably the APIC affine matrix; confirm
    particle_vol: wp.array(dtype=float)  # current volume
    particle_mass: wp.array(dtype=float)  # mass
    particle_density: wp.array(dtype=float)  # density
    particle_Jp: wp.array(dtype=float)
    particle_selection: wp.array(
        dtype=int
    )  # only particle_selection[p] = 0 will be simulated

    # grid
    grid_m: wp.array(dtype=float, ndim=3)
    grid_v_in: wp.array(dtype=wp.vec3, ndim=3)  # grid node momentum/velocity
    grid_v_out: wp.array(
        dtype=wp.vec3, ndim=3
    )  # grid node momentum/velocity, after grid update

    def init(
        self,
        shape: Union[Sequence[int], int],
        device: wp.context.Devicelike = None,
        requires_grad=False,
    ) -> None:
        # shape default is int. number of particles
        # Allocate all per-particle buffers; grid buffers get a small
        # placeholder size here and are re-allocated by init_grid().
        self.particle_x = wp.empty(
            shape, dtype=wp.vec3, device=device, requires_grad=requires_grad
        )
        self.particle_v = wp.zeros(
            shape, dtype=wp.vec3, device=device, requires_grad=requires_grad
        )
        self.particle_F = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        # Covariance buffers hold 6 scalars per particle (cf. the commented
        # tensor_cov.reshape(-1, 6) check in from_torch).
        self.particle_init_cov = wp.zeros(
            shape * 6, dtype=float, device=device, requires_grad=False
        )
        self.particle_cov = wp.zeros(
            shape * 6, dtype=float, device=device, requires_grad=False
        )
        self.particle_F_trial = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_R = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_stress = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_C = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_vol = wp.zeros(
            shape, dtype=float, device=device, requires_grad=False
        )
        self.particle_mass = wp.zeros(
            shape, dtype=float, device=device, requires_grad=False
        )
        self.particle_density = wp.zeros(
            shape, dtype=float, device=device, requires_grad=False
        )
        self.particle_Jp = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )
        self.particle_selection = wp.zeros(
            shape, dtype=int, device=device, requires_grad=requires_grad
        )

        # grid: will init later
        self.grid_m = wp.empty(
            (10, 10, 10), dtype=float, device=device, requires_grad=requires_grad
        )
        self.grid_v_in = wp.zeros(
            (10, 10, 10), dtype=wp.vec3, device=device, requires_grad=requires_grad
        )
        self.grid_v_out = wp.zeros(
            (10, 10, 10), dtype=wp.vec3, device=device, requires_grad=requires_grad
        )

    def init_grid(
        self, grid_res: int, device: wp.context.Devicelike = None, requires_grad=False
    ):
        # (Re-)allocate the dense grid_res^3 background grid buffers.
        self.grid_m = wp.zeros(
            (grid_res, grid_res, grid_res),
            dtype=float,
            device=device,
            requires_grad=False,
        )
        self.grid_v_in = wp.zeros(
            (grid_res, grid_res, grid_res),
            dtype=wp.vec3,
            device=device,
            requires_grad=requires_grad,
        )
        self.grid_v_out = wp.zeros(
            (grid_res, grid_res, grid_res),
            dtype=wp.vec3,
            device=device,
            requires_grad=requires_grad,
        )

    def from_torch(
        self,
        tensor_x: Tensor,
        tensor_volume: Tensor,
        tensor_cov: Optional[Tensor] = None,
        tensor_velocity: Optional[Tensor] = None,
        n_grid: int = 100,
        grid_lim=1.0,
        device="cuda:0",
        requires_grad=True,
    ):
        # Initialize particle buffers from torch tensors and allocate the grid.
        # NOTE(review): grid_lim is accepted but unused here — confirm intent.
        num_dim, n_particles = tensor_x.shape[1], tensor_x.shape[0]
        assert tensor_x.shape[0] == tensor_volume.shape[0]
        # assert tensor_x.shape[0] == tensor_cov.reshape(-1, 6).shape[0]

        self.init_grid(grid_res=n_grid, device=device, requires_grad=requires_grad)
        if tensor_x is not None:
            self.particle_x = from_torch_safe(
                tensor_x.contiguous().detach().clone(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )
        if tensor_volume is not None:
            print(self.particle_vol.shape, tensor_volume.shape)
            volume_numpy = tensor_volume.detach().cpu().numpy()
            self.particle_vol = wp.from_numpy(
                volume_numpy, dtype=float, device=device, requires_grad=False
            )
        if tensor_cov is not None:
            cov_numpy = tensor_cov.reshape(-1).detach().clone().cpu().numpy()
            self.particle_cov = wp.from_numpy(
                cov_numpy, dtype=float, device=device, requires_grad=False
            )
            # Remember the loaded covariance as the initial covariance.
            self.particle_init_cov = self.particle_cov
        if tensor_velocity is not None:
            self.particle_v = from_torch_safe(
                tensor_velocity.contiguous().detach().clone(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )
        # initial deformation gradient is set to identity
        wp.launch(
            kernel=set_mat33_to_identity,
            dim=n_particles,
            inputs=[self.particle_F_trial],
            device=device,
        )
        # initial trial deformation gradient is set to identity
        print("Particles initialized from torch data.")
        print("Total particles: ", n_particles)

    def reset_state(
        self,
        tensor_x: Tensor,
        tensor_cov: Optional[Tensor] = None,
        tensor_velocity: Optional[Tensor] = None,
        tensor_density: Optional[Tensor] = None,
        selection_mask: Optional[Tensor] = None,
        device="cuda:0",
        requires_grad=True,
    ):
        # reset p_c, p_v, p_C, p_F_trial
        num_dim, n_particles = tensor_x.shape[1], tensor_x.shape[0]
        # assert tensor_x.shape[0] == tensor_cov.reshape(-1, 6).shape[0]
        if tensor_x is not None:
            self.particle_x = from_torch_safe(
                tensor_x.contiguous().detach(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )

        if tensor_cov is not None:
            cov_numpy = tensor_cov.reshape(-1).detach().clone().cpu().numpy()
            self.particle_cov = wp.from_numpy(
                cov_numpy, dtype=float, device=device, requires_grad=False
            )
            # NOTE(review): the next line immediately replaces the covariance
            # just loaded from tensor_cov with the stored initial covariance,
            # making the load above dead — confirm whether that is intended.
            self.particle_cov = self.particle_init_cov

        if tensor_velocity is not None:
            self.particle_v = from_torch_safe(
                tensor_velocity.contiguous().detach().clone(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )

        if tensor_density is not None and selection_mask is not None:
            wp_density = from_torch_safe(
                tensor_density.contiguous().detach().clone(),
                dtype=wp.float32,
                requires_grad=False,
            )
            # 1 indicate we need to simulate this particle
            wp_selection_mask = from_torch_safe(
                selection_mask.contiguous().detach().clone().type(torch.int),
                dtype=wp.int32,
                requires_grad=False,
            )
            # Overwrite density only where the selection mask is 1.
            wp.launch(
                kernel=set_float_vec_to_vec_wmask,
                dim=n_particles,
                inputs=[self.particle_density, wp_density, wp_selection_mask],
                device=device,
            )

        # initial deformation gradient is set to identity
        wp.launch(
            kernel=set_mat33_to_identity,
            dim=n_particles,
            inputs=[self.particle_F_trial],
            device=device,
        )
        wp.launch(
            kernel=set_mat33_to_identity,
            dim=n_particles,
            inputs=[self.particle_F],
            device=device,
        )
        wp.launch(
            kernel=set_mat33_to_zero,
            dim=n_particles,
            inputs=[self.particle_C],
            device=device,
        )
        wp.launch(
            kernel=set_mat33_to_zero,
            dim=n_particles,
            inputs=[self.particle_stress],
            device=device,
        )
        wp.launch(
            kernel=set_mat33_to_identity,
            dim=n_particles,
            inputs=[self.particle_R],
            device=device,
        )

    def set_require_grad(self, requires_grad=True):
        # Toggle gradient tracking on the buffers used by the differentiable
        # part of the pipeline.
        self.particle_x.requires_grad = requires_grad
        self.particle_v.requires_grad = requires_grad
        self.particle_F.requires_grad = requires_grad
        self.particle_F_trial.requires_grad = requires_grad
        self.particle_stress.requires_grad = requires_grad
        self.grid_v_out.requires_grad = requires_grad
        self.grid_v_in.requires_grad = requires_grad

    def reset_density(
        self,
        tensor_density: Tensor,
        selection_mask: Optional[Tensor] = None,
        device="cuda:0",
        requires_grad=True,
    ):
        # Overwrite particle densities, but only where selection_mask == 1.
        n_particles = tensor_density.shape[0]
        if tensor_density is not None and selection_mask is not None:
            wp_density = from_torch_safe(
                tensor_density.contiguous().detach().clone(),
                dtype=wp.float32,
                requires_grad=False,
            )
            # 1 indicate we need to simulate this particle
            wp_selection_mask = from_torch_safe(
                selection_mask.contiguous().detach().clone().type(torch.int),
                dtype=wp.int32,
                requires_grad=False,
            )
            wp.launch(
                kernel=set_float_vec_to_vec_wmask,
                dim=n_particles,
                inputs=[self.particle_density, wp_density, wp_selection_mask],
                device=device,
            )


# Reduced particle-only state: no grid and no mass/density bookkeeping.
@wp.struct
class ParticleStateStruct(object):
    ###### essential #####
    # particle
    particle_x: wp.array(dtype=wp.vec3)  # current position
    particle_v: wp.array(dtype=wp.vec3)  # particle velocity
    particle_F: wp.array(dtype=wp.mat33)  # particle elastic deformation gradient
    particle_init_cov: wp.array(dtype=float)  # initial covariance matrix
    particle_cov: wp.array(dtype=float)  # current covariance matrix
    particle_F_trial: wp.array(
        dtype=wp.mat33
    )  # apply return mapping on this to obtain elastic def grad
    particle_C: wp.array(dtype=wp.mat33)
    particle_vol: wp.array(dtype=float)  # current volume
    particle_selection: wp.array(
        dtype=int
    )  # only particle_selection[p] = 0 will be simulated

    def init(
        self,
        shape: Union[Sequence[int], int],
        device: wp.context.Devicelike = None,
        requires_grad=False,
    ) -> None:
        # shape default is int.
# number of particles
        # Allocate all per-particle buffers for ParticleStateStruct.
        self.particle_x = wp.empty(
            shape, dtype=wp.vec3, device=device, requires_grad=requires_grad
        )
        self.particle_v = wp.zeros(
            shape, dtype=wp.vec3, device=device, requires_grad=requires_grad
        )
        self.particle_F = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        # 6 covariance scalars per particle (cf. the commented reshape(-1, 6)
        # check in from_torch below).
        self.particle_init_cov = wp.zeros(
            shape * 6, dtype=float, device=device, requires_grad=requires_grad
        )
        self.particle_cov = wp.zeros(
            shape * 6, dtype=float, device=device, requires_grad=requires_grad
        )
        self.particle_F_trial = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        # NOTE(review): particle_stress is not declared as a field of
        # ParticleStateStruct above — confirm this assignment is valid for
        # wp.struct instances.
        self.particle_stress = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_C = wp.zeros(
            shape, dtype=wp.mat33, device=device, requires_grad=requires_grad
        )
        self.particle_vol = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )
        self.particle_selection = wp.zeros(
            shape, dtype=int, device=device, requires_grad=requires_grad
        )

    def from_torch(
        self,
        tensor_x: Tensor,
        tensor_volume: Tensor,
        tensor_cov: Optional[Tensor] = None,
        tensor_velocity: Optional[Tensor] = None,
        n_grid: int = 100,
        grid_lim=1.0,
        device="cuda:0",
        requires_grad=True,
    ):
        # Fill particle buffers from torch tensors. tensor_x / tensor_velocity
        # must not require grad (enforced below with RuntimeError).
        num_dim, n_particles = tensor_x.shape[1], tensor_x.shape[0]
        assert tensor_x.shape[0] == tensor_volume.shape[0]
        # assert tensor_x.shape[0] == tensor_cov.reshape(-1, 6).shape[0]
        if tensor_x is not None:
            # print(self.particle_x.shape, tensor_x.shape)
            # print(tensor_x.grad)
            if tensor_x.requires_grad:
                # tensor_x.grad = torch.zeros_like(tensor_x, requires_grad=False)
                raise RuntimeError("tensor_x requires grad")

            # x_numpy = tensor_x.detach().clone().cpu().numpy()
            # self.particle_x = wp.from_numpy(x_numpy, dtype=wp.vec3, requires_grad=True, device=device)
            self.particle_x = from_torch_safe(
                tensor_x.contiguous().detach(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )

        if tensor_volume is not None:
            print(self.particle_vol.shape, tensor_volume.shape)
            volume_numpy = tensor_volume.detach().cpu().numpy()
            # self.particle_vol = wp.from_torch(tensor_volume.contiguous(), dtype=float, device=device, requires_grad=requires_grad)
            # self.particle_vol = wp.from_torch(tensor_volume.contiguous(), dtype=float, requires_grad=False)
            self.particle_vol = wp.from_numpy(
                volume_numpy, dtype=float, device=device, requires_grad=False
            )

        if tensor_cov is not None:
            cov_numpy = tensor_cov.reshape(-1).detach().clone().cpu().numpy()
            self.particle_cov = wp.from_numpy(
                cov_numpy, dtype=float, device=device, requires_grad=False
            )
            # NOTE(review): this immediately replaces the covariance loaded
            # above with particle_init_cov — confirm that is intended.
            self.particle_cov = self.particle_init_cov

        if tensor_velocity is not None:
            if tensor_velocity.requires_grad:
                tensor_velocity.grad = torch.zeros_like(
                    tensor_velocity, requires_grad=False
                )
                # NOTE(review): error message says "tensor_x" but this guards
                # tensor_velocity.
                raise RuntimeError("tensor_x requires grad")
            self.particle_v = from_torch_safe(
                tensor_velocity.contiguous().detach(),
                dtype=wp.vec3,
                requires_grad=requires_grad,
            )

        # initial deformation gradient is set to identity
        wp.launch(
            kernel=set_mat33_to_identity,
            dim=n_particles,
            inputs=[self.particle_F_trial],
            device=device,
        )
        # initial trial deformation gradient is set to identity
        print("Particles initialized from torch data.")
        print("Total particles: ", n_particles)

    def set_require_grad(self, requires_grad=True):
        # Toggle gradient tracking on the differentiable particle buffers.
        self.particle_x.requires_grad = requires_grad
        self.particle_v.requires_grad = requires_grad
        self.particle_F.requires_grad = requires_grad
        self.particle_F_trial.requires_grad = requires_grad
        self.particle_stress.requires_grad = requires_grad


# Global (per-simulation) model parameters plus per-particle material fields.
@wp.struct
class MPMModelStruct(object):
    ####### essential #######
    grid_lim: float  # physical extent of the grid domain
    n_particles: int
    n_grid: int  # number of grid nodes per axis
    dx: float  # grid spacing (= grid_lim / n_grid, see init_other_params)
    inv_dx: float
    grid_dim_x: int
    grid_dim_y: int
    grid_dim_z: int
    mu: wp.array(dtype=float)  # per-particle Lame mu, derived from E and nu
    lam: wp.array(dtype=float)  # per-particle Lame lambda, derived from E and nu
    E: wp.array(dtype=float)  # per-particle Young's modulus
    nu: wp.array(dtype=float)  # per-particle Poisson's ratio
    material: int

    ######## for plasticity ####
    yield_stress: wp.array(dtype=float)
    friction_angle: float
    alpha: float
    gravitational_accelaration: wp.vec3
    hardening: float
    xi: float
    plastic_viscosity: float
    softening: float
####### for damping
    rpic_damping: float
    grid_v_damping_scale: float

    ####### for PhysGaussian: covariance
    update_cov_with_F: int

    def init(
        self,
        shape: Union[Sequence[int], int],
        device: wp.context.Devicelike = None,
        requires_grad=False,
    ) -> None:
        # Allocate per-particle material parameter arrays.
        self.E = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )  # young's modulus
        self.nu = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )  # poisson's ratio
        self.mu = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )
        self.lam = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )
        self.yield_stress = wp.zeros(
            shape, dtype=float, device=device, requires_grad=requires_grad
        )

    def finalize_mu_lam(self, n_particles, device="cuda:0"):
        # Derive the Lame parameters (mu, lam) from (E, nu) on the device.
        wp.launch(
            kernel=compute_mu_lam_from_E_nu_clean,
            dim=n_particles,
            inputs=[self.mu, self.lam, self.E, self.nu],
            device=device,
        )

    def init_other_params(self, n_grid=100, grid_lim=1.0, device="cuda:0"):
        # Default scalar parameters; grid spacing follows from grid_lim/n_grid.
        self.grid_lim = grid_lim
        self.n_grid = n_grid
        self.grid_dim_x = n_grid
        self.grid_dim_y = n_grid
        self.grid_dim_z = n_grid
        (
            self.dx,
            self.inv_dx,
        ) = self.grid_lim / self.n_grid, float(
            n_grid / grid_lim
        )  # [0-1]?
        self.update_cov_with_F = False

        # material is used to switch between different elastoplastic models. 0 is jelly
        self.material = 0

        self.plastic_viscosity = 0.0
        self.softening = 0.1
        self.friction_angle = 25.0
        sin_phi = wp.sin(self.friction_angle / 180.0 * 3.14159265)
        self.alpha = wp.sqrt(2.0 / 3.0) * 2.0 * sin_phi / (3.0 - sin_phi)

        self.gravitational_accelaration = wp.vec3(0.0, 0.0, 0.0)

        self.rpic_damping = 0.0  # 0.0 if no damping (apic). -1 if pic
        self.grid_v_damping_scale = 1.1  # globally applied

    def from_torch(
        self, tensor_E: Tensor, tensor_nu: Tensor, device="cuda:0", requires_grad=False
    ):
        # Alias torch E/nu tensors as warp arrays, then compute mu/lam.
        self.E = wp.from_torch(tensor_E.contiguous(), requires_grad=requires_grad)
        self.nu = wp.from_torch(tensor_nu.contiguous(), requires_grad=requires_grad)
        n_particles = tensor_E.shape[0]
        self.finalize_mu_lam(n_particles=n_particles, device=device)

    def set_require_grad(self, requires_grad=True):
        # Toggle gradient tracking on the material parameter arrays.
        self.E.requires_grad = requires_grad
        self.nu.requires_grad = requires_grad
        self.mu.requires_grad = requires_grad
        self.lam.requires_grad = requires_grad


# for various boundary conditions
@wp.struct
class Dirichlet_collider:
    # Parameter bag shared by all grid-level Dirichlet boundary conditions;
    # each collider kernel reads only the subset of fields it needs.
    point: wp.vec3
    normal: wp.vec3
    direction: wp.vec3

    start_time: float
    end_time: float

    friction: float
    surface_type: int
    velocity: wp.vec3
    threshold: float
    reset: int
    index: int

    x_unit: wp.vec3
    y_unit: wp.vec3
    radius: float
    v_scale: float
    width: float
    height: float
    length: float
    R: float

    size: wp.vec3

    horizontal_axis_1: wp.vec3
    horizontal_axis_2: wp.vec3
    half_height_and_radius: wp.vec2


@wp.struct
class Impulse_modifier:
    # this needs to be changed for each different BC!
    point: wp.vec3
    normal: wp.vec3
    start_time: float
    end_time: float
    force: wp.vec3
    forceTimesDt: wp.vec3
    numsteps: int

    # NOTE(review): `point` is annotated twice in this struct (here and
    # above); the later annotation wins — confirm which was intended.
    point: wp.vec3
    size: wp.vec3
    mask: wp.array(dtype=int)


@wp.struct
class MPMtailoredStruct:
    # this needs to be changed for each different BC!
    point: wp.vec3
    normal: wp.vec3
    start_time: float
    end_time: float
    friction: float
    surface_type: int
    velocity: wp.vec3
    threshold: float
    reset: int

    point_rotate: wp.vec3
    normal_rotate: wp.vec3
    x_unit: wp.vec3
    y_unit: wp.vec3
    radius: float
    v_scale: float
    width: float
    point_plane: wp.vec3
    normal_plane: wp.vec3
    velocity_plane: wp.vec3
    threshold_plane: float


@wp.struct
class MaterialParamsModifier:
    # Region (box) and the material values to assign inside it.
    point: wp.vec3
    size: wp.vec3
    E: float
    nu: float
    density: float


@wp.struct
class ParticleVelocityModifier:
    # Parameters for particle-level Dirichlet velocity constraints.
    point: wp.vec3
    normal: wp.vec3
    half_height_and_radius: wp.vec2
    rotation_scale: float
    translation_scale: float

    size: wp.vec3

    horizontal_axis_1: wp.vec3
    horizontal_axis_2: wp.vec3

    start_time: float
    end_time: float

    velocity: wp.vec3

    mask: wp.array(dtype=int)


@wp.kernel
def compute_mu_lam_from_E_nu_clean(
    mu: wp.array(dtype=float),
    lam: wp.array(dtype=float),
    E: wp.array(dtype=float),
    nu: wp.array(dtype=float),
):
    # Standard conversion from Young's modulus / Poisson's ratio to Lame
    # parameters: mu = E / 2(1+nu), lam = E*nu / ((1+nu)(1-2nu)).
    p = wp.tid()
    mu[p] = E[p] / (2.0 * (1.0 + nu[p]))
    lam[p] = E[p] * nu[p] / ((1.0 + nu[p]) * (1.0 - 2.0 * nu[p]))


@wp.kernel
def set_vec3_to_zero(target_array: wp.array(dtype=wp.vec3)):
    tid = wp.tid()
    target_array[tid] = wp.vec3(0.0, 0.0, 0.0)


@wp.kernel
def set_vec3_to_vec3(
    source_array: wp.array(dtype=wp.vec3), target_array: wp.array(dtype=wp.vec3)
):
    # NOTE(review): naming is reversed — this writes target_array INTO
    # source_array (source is the destination), matching set_float_vec_to_vec.
    tid = wp.tid()
    source_array[tid] = target_array[tid]


@wp.kernel
def set_float_vec_to_vec_wmask(
    source_array: wp.array(dtype=float),
    target_array: wp.array(dtype=float),
    selection_mask: wp.array(dtype=int),
):
    # Masked copy: source_array[i] = target_array[i] only where mask == 1.
    tid = wp.tid()
    if selection_mask[tid] == 1:
        source_array[tid] = target_array[tid]


@wp.kernel
def set_float_vec_to_vec(
    source_array: wp.array(dtype=float), target_array: wp.array(dtype=float)
):
    # Copies target_array into source_array (source is the destination).
    tid = wp.tid()
    source_array[tid] = target_array[tid]


@wp.kernel
def set_mat33_to_identity(target_array: wp.array(dtype=wp.mat33)):
    tid = wp.tid()
    target_array[tid] = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)


@wp.kernel
def set_mat33_to_zero(target_array: wp.array(dtype=wp.mat33)):
    tid = wp.tid()
    target_array[tid] =
wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)


@wp.kernel
def add_identity_to_mat33(target_array: wp.array(dtype=wp.mat33)):
    # target += I
    tid = wp.tid()
    target_array[tid] = wp.add(
        target_array[tid], wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
    )


@wp.kernel
def subtract_identity_to_mat33(target_array: wp.array(dtype=wp.mat33)):
    # target -= I
    tid = wp.tid()
    target_array[tid] = wp.sub(
        target_array[tid], wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
    )


@wp.kernel
def add_vec3_to_vec3(
    first_array: wp.array(dtype=wp.vec3), second_array: wp.array(dtype=wp.vec3)
):
    # first += second, elementwise.
    tid = wp.tid()
    first_array[tid] = wp.add(first_array[tid], second_array[tid])


@wp.kernel
def set_value_to_float_array(target_array: wp.array(dtype=float), value: float):
    # Broadcast a python float into every element of target_array.
    tid = wp.tid()
    target_array[tid] = value


@wp.kernel
def set_warpvalue_to_float_array(
    target_array: wp.array(dtype=float), value: warp.types.float32
):
    # Same as set_value_to_float_array but takes a warp-typed scalar.
    tid = wp.tid()
    target_array[tid] = value


@wp.kernel
def get_float_array_product(
    arrayA: wp.array(dtype=float),
    arrayB: wp.array(dtype=float),
    arrayC: wp.array(dtype=float),
):
    # Elementwise product: arrayC[i] = arrayA[i] * arrayB[i].
    tid = wp.tid()
    arrayC[tid] = arrayA[tid] * arrayB[tid]


def torch2warp_quat(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"):
    """Alias a contiguous (N, 4) torch tensor as a warp array of wp.quat (zero copy).

    NOTE(review): the `copy` and `dtype` parameters are accepted but never used.
    """
    assert t.is_contiguous()
    if t.dtype != torch.float32 and t.dtype != torch.int32:
        raise RuntimeError(
            "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type"
        )
    assert t.shape[1] == 4
    a = warp.types.array(
        ptr=t.data_ptr(),
        dtype=wp.quat,
        shape=t.shape[0],
        copy=False,
        owner=False,
        requires_grad=t.requires_grad,
        # device=t.device.type)
        device=dvc,
    )
    # Keep a reference so the torch storage outlives the aliasing warp array.
    a.tensor = t
    return a


def torch2warp_float(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"):
    """Alias a contiguous torch tensor as a flat warp float32 array (zero copy)."""
    assert t.is_contiguous()
    if t.dtype != torch.float32 and t.dtype != torch.int32:
        raise RuntimeError(
            "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type"
        )
    a = warp.types.array(
        ptr=t.data_ptr(),
        dtype=warp.types.float32,
        shape=t.shape[0],
        copy=False,
        owner=False,
        requires_grad=t.requires_grad,
        # device=t.device.type)
        device=dvc,
    )
    a.tensor = t
    return a


def torch2warp_vec3(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"):
    """Alias a contiguous (N, 3) torch tensor as a warp array of wp.vec3 (zero copy)."""
    assert t.is_contiguous()
    if t.dtype != torch.float32 and t.dtype != torch.int32:
        raise RuntimeError(
            "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type"
        )
    assert t.shape[1] == 3
    a = warp.types.array(
        ptr=t.data_ptr(),
        dtype=wp.vec3,
        shape=t.shape[0],
        copy=False,
        owner=False,
        requires_grad=t.requires_grad,
        # device=t.device.type)
        device=dvc,
    )
    a.tensor = t
    return a


def torch2warp_mat33(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"):
    """Alias a contiguous torch tensor as a warp array of wp.mat33 (zero copy).

    Only t.shape[1] == 3 is checked — presumably t is (N, 3, 3); TODO confirm.
    """
    assert t.is_contiguous()
    if t.dtype != torch.float32 and t.dtype != torch.int32:
        raise RuntimeError(
            "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type"
        )
    assert t.shape[1] == 3
    a = warp.types.array(
        ptr=t.data_ptr(),
        dtype=wp.mat33,
        shape=t.shape[0],
        copy=False,
        owner=False,
        requires_grad=t.requires_grad,
        # device=t.device.type)
        device=dvc,
    )
    a.tensor = t
    return a


================================================
FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup_jan10/mpm_solver_diff.py
================================================
import sys
import os
import warp as wp

sys.path.append(os.path.dirname(os.path.realpath(__file__)))
from mpm_data_structure import *
from mpm_utils import *
from typing import Optional, Union, Sequence, Any


class MPMWARPDiff(object):
    """Differentiable MPM solver driver.

    Owns the lists of boundary-condition kernels/parameters and launches the
    per-substep p2g / grid-update / g2p warp kernels (see p2g2p).
    """

    # def __init__(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"):
    #     self.initialize(n_particles, n_grid, grid_lim, device=device)
    #     self.time_profile = {}

    def __init__(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"):
        self.initialize(n_particles, n_grid, grid_lim, device=device)
        self.time_profile = {}

    def initialize(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"):
        # Reset solver bookkeeping; BC kernels and their parameter structs are
        # appended later by the add_* / enforce_* methods.
        self.n_particles = n_particles
        self.time = 0.0

        self.grid_postprocess = []
        self.collider_params = []
        self.modify_bc = []

        self.tailored_struct_for_bc = MPMtailoredStruct()
        self.pre_p2g_operations = []
        self.impulse_params = []

        self.particle_velocity_modifiers = []
        self.particle_velocity_modifier_params = []

    # must give density. mass will be updated as density * volume
    def set_parameters(self, device="cuda:0", **kwargs):
        # NOTE(review): this forwards (device, kwargs) positionally into
        # set_parameters_dict(mpm_model, mpm_state, ...), so mpm_model and
        # mpm_state receive the wrong arguments — this wrapper looks
        # broken/dead; confirm no caller uses it.
        self.set_parameters_dict(device, kwargs)

    def set_parameters_dict(self, mpm_model, mpm_state, kwargs={}, device="cuda:0"):
        # Translate a config dict into model/state fields and kernel launches.
        # NOTE(review): mutable default `kwargs={}` — safe only because it is
        # never mutated here.
        if "material" in kwargs:
            if kwargs["material"] == "jelly":
                mpm_model.material = 0
            elif kwargs["material"] == "metal":
                mpm_model.material = 1
            elif kwargs["material"] == "sand":
                mpm_model.material = 2
            elif kwargs["material"] == "foam":
                mpm_model.material = 3
            elif kwargs["material"] == "snow":
                mpm_model.material = 4
            elif kwargs["material"] == "plasticine":
                mpm_model.material = 5
            else:
                raise TypeError("Undefined material type")

        if "yield_stress" in kwargs:
            val = kwargs["yield_stress"]
            wp.launch(
                kernel=set_value_to_float_array,
                dim=self.n_particles,
                inputs=[mpm_model.yield_stress, val],
                device=device,
            )
        if "hardening" in kwargs:
            mpm_model.hardening = kwargs["hardening"]
        if "xi" in kwargs:
            mpm_model.xi = kwargs["xi"]
        if "friction_angle" in kwargs:
            # Recompute the Drucker-Prager alpha from the new friction angle.
            mpm_model.friction_angle = kwargs["friction_angle"]
            sin_phi = wp.sin(mpm_model.friction_angle / 180.0 * 3.14159265)
            mpm_model.alpha = wp.sqrt(2.0 / 3.0) * 2.0 * sin_phi / (3.0 - sin_phi)

        if "g" in kwargs:
            mpm_model.gravitational_accelaration = wp.vec3(
                kwargs["g"][0], kwargs["g"][1], kwargs["g"][2]
            )

        if "density" in kwargs:
            # Broadcast the density, then update mass = density * volume.
            density_value = kwargs["density"]
            wp.launch(
                kernel=set_value_to_float_array,
                dim=self.n_particles,
                inputs=[mpm_state.particle_density, density_value],
                device=device,
            )
            wp.launch(
                kernel=get_float_array_product,
                dim=self.n_particles,
                inputs=[
                    mpm_state.particle_density,
                    mpm_state.particle_vol,
                    mpm_state.particle_mass,
                ],
                device=device,
            )
        if
"rpic_damping" in kwargs:
            mpm_model.rpic_damping = kwargs["rpic_damping"]
        if "plastic_viscosity" in kwargs:
            mpm_model.plastic_viscosity = kwargs["plastic_viscosity"]
        if "softening" in kwargs:
            mpm_model.softening = kwargs["softening"]
        if "grid_v_damping_scale" in kwargs:
            mpm_model.grid_v_damping_scale = kwargs["grid_v_damping_scale"]

    def set_E_nu(self, mpm_model, E: float, nu: float, device="cuda:0"):
        # E/nu may be python floats (broadcast to all particles) or warp
        # arrays (copied elementwise).
        if isinstance(E, float):
            wp.launch(
                kernel=set_value_to_float_array,
                dim=self.n_particles,
                inputs=[mpm_model.E, E],
                device=device,
            )
        else:
            # E is warp array
            wp.launch(
                kernel=set_float_vec_to_vec,
                dim=self.n_particles,
                inputs=[mpm_model.E, E],
                device=device,
            )
        if isinstance(nu, float):
            wp.launch(
                kernel=set_value_to_float_array,
                dim=self.n_particles,
                inputs=[mpm_model.nu, nu],
                device=device,
            )
        else:
            wp.launch(
                kernel=set_float_vec_to_vec,
                dim=self.n_particles,
                inputs=[mpm_model.nu, nu],
                device=device,
            )

    def p2g2p(self, mpm_model, mpm_state, step, dt, device="cuda:0"):
        # One full MPM substep: particle-to-grid transfer, grid update with
        # boundary conditions, then grid-to-particle transfer. Mutates
        # mpm_state in place and advances self.time by dt.
        grid_size = (
            mpm_model.grid_dim_x,
            mpm_model.grid_dim_y,
            mpm_model.grid_dim_z,
        )

        # TODO, move this outside of the loop!!
        wp.launch(
            kernel=compute_mu_lam_from_E_nu,
            dim=self.n_particles,
            inputs=[mpm_state, mpm_model],
            device=device,
        )

        wp.launch(
            kernel=zero_grid,  # gradient might gone
            dim=(grid_size),
            inputs=[mpm_state, mpm_model],
            device=device,
        )

        # apply pre-p2g operations on particles
        # apply impulse force on particles..
        for k in range(len(self.pre_p2g_operations)):
            wp.launch(
                kernel=self.pre_p2g_operations[k],
                dim=self.n_particles,
                inputs=[self.time, dt, mpm_state, self.impulse_params[k]],
                device=device,
            )

        # apply dirichlet particle v modifier
        for k in range(len(self.particle_velocity_modifiers)):
            wp.launch(
                kernel=self.particle_velocity_modifiers[k],
                dim=self.n_particles,
                inputs=[
                    self.time,
                    mpm_state,
                    self.particle_velocity_modifier_params[k],
                ],
                device=device,
            )

        # compute stress = stress(returnMap(F_trial))
        # F_trail => F
        # TODO: this is overite..
        # F, SVD(F), lam, mu => Stress.
        # TODO: this is overite..
        with wp.ScopedTimer(
            "compute_stress_from_F_trial",
            synchronize=True,
            print=False,
            dict=self.time_profile,
        ):
            wp.launch(
                kernel=compute_stress_from_F_trial,
                dim=self.n_particles,
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )  # F and stress are updated

        # p2g
        with wp.ScopedTimer(
            "p2g",
            synchronize=True,
            print=False,
            dict=self.time_profile,
        ):
            wp.launch(
                kernel=p2g_apic_with_stress,
                dim=self.n_particles,
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )  # apply p2g'

        # grid update
        with wp.ScopedTimer(
            "grid_update", synchronize=True, print=False, dict=self.time_profile
        ):
            wp.launch(
                kernel=grid_normalization_and_gravity,
                dim=(grid_size),
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )

            if mpm_model.grid_v_damping_scale < 1.0:
                wp.launch(
                    kernel=add_damping_via_grid,
                    dim=(grid_size),
                    inputs=[mpm_state, mpm_model.grid_v_damping_scale],
                    device=device,
                )

        # apply BC on grid, collide
        with wp.ScopedTimer(
            "apply_BC_on_grid", synchronize=True, print=False, dict=self.time_profile
        ):
            for k in range(len(self.grid_postprocess)):
                wp.launch(
                    kernel=self.grid_postprocess[k],
                    dim=grid_size,
                    inputs=[
                        self.time,
                        dt,
                        mpm_state,
                        mpm_model,
                        self.collider_params[k],
                    ],
                    device=device,
                )
                # Host-side parameter update (e.g. advecting a moving BC box).
                if self.modify_bc[k] is not None:
                    self.modify_bc[k](self.time, dt, self.collider_params[k])

        # g2p
        with wp.ScopedTimer(
            "g2p", synchronize=True, print=False, dict=self.time_profile
        ):
            wp.launch(
                kernel=g2p,
                dim=self.n_particles,
                inputs=[mpm_state, mpm_model, dt],
                device=device,
            )  # x, v, C, F_trial are updated

        #### CFL check ####
        # particle_v = self.mpm_state.particle_v.numpy()
        # if np.max(np.abs(particle_v)) > self.mpm_model.dx / dt:
        #     print("max particle v: ", np.max(np.abs(particle_v)))
        #     print("max allowed v: ", self.mpm_model.dx / dt)
        #     print("does not allow v*dt>dx")
        #     input()
        #### CFL check ####

        with wp.ScopedTimer(
            "clip_particle_x", synchronize=True, print=False, dict=self.time_profile
        ):
            wp.launch(
                kernel=clip_particle_x,
                dim=self.n_particles,
                inputs=[mpm_state,
mpm_model], device=device, ) self.time = self.time + dt def print_time_profile(self): print("MPM Time profile:") for key, value in self.time_profile.items(): print(key, sum(value)) # a surface specified by a point and the normal vector def add_surface_collider( self, point, normal, surface="sticky", friction=0.0, start_time=0.0, end_time=999.0, ): point = list(point) # Normalize normal normal_scale = 1.0 / wp.sqrt(float(sum(x**2 for x in normal))) normal = list(normal_scale * x for x in normal) collider_param = Dirichlet_collider() collider_param.start_time = start_time collider_param.end_time = end_time collider_param.point = wp.vec3(point[0], point[1], point[2]) collider_param.normal = wp.vec3(normal[0], normal[1], normal[2]) if surface == "sticky" and friction != 0: raise ValueError("friction must be 0 on sticky surfaces.") if surface == "sticky": collider_param.surface_type = 0 elif surface == "slip": collider_param.surface_type = 1 elif surface == "cut": collider_param.surface_type = 11 else: collider_param.surface_type = 2 # frictional collider_param.friction = friction self.collider_params.append(collider_param) @wp.kernel def collide( time: float, dt: float, state: MPMStateStruct, model: MPMModelStruct, param: Dirichlet_collider, ): grid_x, grid_y, grid_z = wp.tid() if time >= param.start_time and time < param.end_time: offset = wp.vec3( float(grid_x) * model.dx - param.point[0], float(grid_y) * model.dx - param.point[1], float(grid_z) * model.dx - param.point[2], ) n = wp.vec3(param.normal[0], param.normal[1], param.normal[2]) dotproduct = wp.dot(offset, n) if dotproduct < 0.0: if param.surface_type == 0: state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( 0.0, 0.0, 0.0 ) elif param.surface_type == 11: if ( float(grid_z) * model.dx < 0.4 or float(grid_z) * model.dx > 0.53 ): state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( 0.0, 0.0, 0.0 ) else: v_in = state.grid_v_out[grid_x, grid_y, grid_z] state.grid_v_out[grid_x, grid_y, grid_z] = ( wp.vec3(v_in[0], 
0.0, v_in[2]) * 0.3 ) else: v = state.grid_v_out[grid_x, grid_y, grid_z] normal_component = wp.dot(v, n) if param.surface_type == 1: v = ( v - normal_component * n ) # Project out all normal component else: v = ( v - wp.min(normal_component, 0.0) * n ) # Project out only inward normal component if normal_component < 0.0 and wp.length(v) > 1e-20: v = wp.max( 0.0, wp.length(v) + normal_component * param.friction ) * wp.normalize( v ) # apply friction here state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( 0.0, 0.0, 0.0 ) self.grid_postprocess.append(collide) self.modify_bc.append(None) # a cubiod is a rectangular cube' # centered at `point` # dimension is x: point[0]±size[0] # y: point[1]±size[1] # z: point[2]±size[2] # all grid nodes lie within the cubiod will have their speed set to velocity # the cuboid itself is also moving with const speed = velocity # set the speed to zero to fix BC def set_velocity_on_cuboid( self, point, size, velocity, start_time=0.0, end_time=999.0, reset=0, ): point = list(point) collider_param = Dirichlet_collider() collider_param.start_time = start_time collider_param.end_time = end_time collider_param.point = wp.vec3(point[0], point[1], point[2]) collider_param.size = size collider_param.velocity = wp.vec3(velocity[0], velocity[1], velocity[2]) # collider_param.threshold = threshold collider_param.reset = reset self.collider_params.append(collider_param) @wp.kernel def collide( time: float, dt: float, state: MPMStateStruct, model: MPMModelStruct, param: Dirichlet_collider, ): grid_x, grid_y, grid_z = wp.tid() if time >= param.start_time and time < param.end_time: offset = wp.vec3( float(grid_x) * model.dx - param.point[0], float(grid_y) * model.dx - param.point[1], float(grid_z) * model.dx - param.point[2], ) if ( wp.abs(offset[0]) < param.size[0] and wp.abs(offset[1]) < param.size[1] and wp.abs(offset[2]) < param.size[2] ): state.grid_v_out[grid_x, grid_y, grid_z] = param.velocity elif param.reset == 1: if time < param.end_time + 15.0 
* dt: state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0) def modify(time, dt, param: Dirichlet_collider): if time >= param.start_time and time < param.end_time: param.point = wp.vec3( param.point[0] + dt * param.velocity[0], param.point[1] + dt * param.velocity[1], param.point[2] + dt * param.velocity[2], ) # param.point + dt * param.velocity self.grid_postprocess.append(collide) self.modify_bc.append(modify) def add_bounding_box(self, start_time=0.0, end_time=999.0): collider_param = Dirichlet_collider() collider_param.start_time = start_time collider_param.end_time = end_time self.collider_params.append(collider_param) @wp.kernel def collide( time: float, dt: float, state: MPMStateStruct, model: MPMModelStruct, param: Dirichlet_collider, ): grid_x, grid_y, grid_z = wp.tid() padding = 3 if time >= param.start_time and time < param.end_time: if grid_x < padding and state.grid_v_out[grid_x, grid_y, grid_z][0] < 0: state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( 0.0, state.grid_v_out[grid_x, grid_y, grid_z][1], state.grid_v_out[grid_x, grid_y, grid_z][2], ) if ( grid_x >= model.grid_dim_x - padding and state.grid_v_out[grid_x, grid_y, grid_z][0] > 0 ): state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( 0.0, state.grid_v_out[grid_x, grid_y, grid_z][1], state.grid_v_out[grid_x, grid_y, grid_z][2], ) if grid_y < padding and state.grid_v_out[grid_x, grid_y, grid_z][1] < 0: state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( state.grid_v_out[grid_x, grid_y, grid_z][0], 0.0, state.grid_v_out[grid_x, grid_y, grid_z][2], ) if ( grid_y >= model.grid_dim_y - padding and state.grid_v_out[grid_x, grid_y, grid_z][1] > 0 ): state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( state.grid_v_out[grid_x, grid_y, grid_z][0], 0.0, state.grid_v_out[grid_x, grid_y, grid_z][2], ) if grid_z < padding and state.grid_v_out[grid_x, grid_y, grid_z][2] < 0: state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( state.grid_v_out[grid_x, grid_y, grid_z][0], state.grid_v_out[grid_x, 
                        grid_y, grid_z][1], 0.0,
                    )
                # high-z face
                if (
                    grid_z >= model.grid_dim_z - padding
                    and state.grid_v_out[grid_x, grid_y, grid_z][2] > 0
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        0.0,
                    )

        # a static bounding box has no time-varying state -> no modify callback
        self.grid_postprocess.append(collide)
        self.modify_bc.append(None)

    # particle_v += force/particle_mass * dt
    # this is applied from start_dt, ends after num_dt p2g2p's
    # particle velocity is changed before p2g at each timestep
    def add_impulse_on_particles(
        self,
        mpm_state,
        force,
        dt,
        point=[1, 1, 1],
        size=[1, 1, 1],
        num_dt=1,
        start_time=0.0,
        device="cuda:0",
    ):
        """Apply a constant force impulse to particles inside an axis-aligned box.

        NOTE(review): ``point``/``size`` are mutable list defaults; they are only
        read here, but confirm no caller mutates them.
        """
        impulse_param = Impulse_modifier()

        impulse_param.start_time = start_time
        # active for exactly num_dt simulation steps
        impulse_param.end_time = start_time + dt * num_dt
        impulse_param.point = wp.vec3(point[0], point[1], point[2])
        impulse_param.size = wp.vec3(size[0], size[1], size[2])
        impulse_param.mask = wp.zeros(shape=self.n_particles, dtype=int, device=device)

        impulse_param.force = wp.vec3(
            force[0],
            force[1],
            force[2],
        )
        # mark particles currently inside the box (mask[p] = 1)
        wp.launch(
            kernel=selection_add_impulse_on_particles,
            dim=self.n_particles,
            inputs=[mpm_state, impulse_param],
            device=device,
        )

        self.impulse_params.append(impulse_param)

        @wp.kernel
        def apply_force(
            time: float, dt: float, state: MPMStateStruct, param: Impulse_modifier
        ):
            p = wp.tid()
            if time >= param.start_time and time < param.end_time:
                if param.mask[p] == 1:
                    # a = F / m, applied as dv = a * dt
                    impulse = wp.vec3(
                        param.force[0] / state.particle_mass[p],
                        param.force[1] / state.particle_mass[p],
                        param.force[2] / state.particle_mass[p],
                    )
                    state.particle_v[p] = state.particle_v[p] + impulse * dt

        self.pre_p2g_operations.append(apply_force)

    def enforce_particle_velocity_translation(
        self, mpm_state, point, size, velocity, start_time, end_time, device="cuda:0"
    ):
        # first select certain particles based on position
        velocity_modifier_params = ParticleVelocityModifier()
        velocity_modifier_params.point = wp.vec3(point[0], point[1], point[2])
        velocity_modifier_params.size = wp.vec3(size[0], size[1], size[2])
        velocity_modifier_params.velocity = wp.vec3(
            velocity[0], velocity[1], velocity[2]
        )
        velocity_modifier_params.start_time = start_time
        velocity_modifier_params.end_time = end_time
        velocity_modifier_params.mask = wp.zeros(
            shape=self.n_particles, dtype=int, device=device
        )
        # mark particles inside the selection box (mask[p] = 1)
        wp.launch(
            kernel=selection_enforce_particle_velocity_translation,
            dim=self.n_particles,
            inputs=[mpm_state, velocity_modifier_params],
            device=device,
        )
        self.particle_velocity_modifier_params.append(velocity_modifier_params)

        @wp.kernel
        def modify_particle_v_before_p2g(
            time: float,
            state: MPMStateStruct,
            velocity_modifier_params: ParticleVelocityModifier,
        ):
            # overwrite the velocity of every selected particle while active
            p = wp.tid()
            if (
                time >= velocity_modifier_params.start_time
                and time < velocity_modifier_params.end_time
            ):
                if velocity_modifier_params.mask[p] == 1:
                    state.particle_v[p] = velocity_modifier_params.velocity

        self.particle_velocity_modifiers.append(modify_particle_v_before_p2g)

    # define a cylinder with center point, half_height, radius, normal
    # particles within the cylinder are rotating along the normal direction
    # may also have a translational velocity along the normal direction
    def enforce_particle_velocity_rotation(
        self,
        mpm_state,
        point,
        normal,
        half_height_and_radius,
        rotation_scale,
        translation_scale,
        start_time,
        end_time,
        device="cuda:0",
    ):
        # normalize the cylinder axis
        normal_scale = 1.0 / wp.sqrt(
            float(normal[0] ** 2 + normal[1] ** 2 + normal[2] ** 2)
        )
        normal = list(normal_scale * x for x in normal)

        velocity_modifier_params = ParticleVelocityModifier()
        velocity_modifier_params.point = wp.vec3(point[0], point[1], point[2])
        velocity_modifier_params.half_height_and_radius = wp.vec2(
            half_height_and_radius[0], half_height_and_radius[1]
        )
        velocity_modifier_params.normal = wp.vec3(normal[0], normal[1], normal[2])
        # Build an orthonormal in-plane frame (horizontal_axis_1/2)
        # perpendicular to the cylinder axis.
        horizontal_1 = wp.vec3(1.0, 1.0, 1.0)
        if wp.abs(wp.dot(velocity_modifier_params.normal, horizontal_1)) < 0.01:
            # fallback seed when (1,1,1) is (nearly) parallel to the axis
            horizontal_1 = wp.vec3(0.72, 0.37, -0.67)
        # Gram-Schmidt: remove the axis component, then normalize.
        horizontal_1 = (
            horizontal_1
            - wp.dot(horizontal_1, velocity_modifier_params.normal)
            * velocity_modifier_params.normal
        )
        horizontal_1 = horizontal_1 * (1.0 / wp.length(horizontal_1))
        horizontal_2 = wp.cross(horizontal_1, velocity_modifier_params.normal)
        velocity_modifier_params.horizontal_axis_1 = horizontal_1
        velocity_modifier_params.horizontal_axis_2 = horizontal_2

        velocity_modifier_params.rotation_scale = rotation_scale
        velocity_modifier_params.translation_scale = translation_scale
        velocity_modifier_params.start_time = start_time
        velocity_modifier_params.end_time = end_time
        velocity_modifier_params.mask = wp.zeros(
            shape=self.n_particles, dtype=int, device=device
        )
        # mark particles inside the cylinder (mask[p] = 1)
        wp.launch(
            kernel=selection_enforce_particle_velocity_cylinder,
            dim=self.n_particles,
            inputs=[mpm_state, velocity_modifier_params],
            device=device,
        )
        self.particle_velocity_modifier_params.append(velocity_modifier_params)

        @wp.kernel
        def modify_particle_v_before_p2g(
            time: float,
            state: MPMStateStruct,
            velocity_modifier_params: ParticleVelocityModifier,
        ):
            p = wp.tid()
            if (
                time >= velocity_modifier_params.start_time
                and time < velocity_modifier_params.end_time
            ):
                if velocity_modifier_params.mask[p] == 1:
                    offset = state.particle_x[p] - velocity_modifier_params.point
                    # distance from the cylinder axis
                    horizontal_distance = wp.length(
                        offset
                        - wp.dot(offset, velocity_modifier_params.normal)
                        * velocity_modifier_params.normal
                    )
                    # angle of the particle in the (axis_1, axis_2) plane
                    cosine = (
                        wp.dot(offset, velocity_modifier_params.horizontal_axis_1)
                        / horizontal_distance
                    )
                    theta = wp.acos(cosine)
                    if wp.dot(offset, velocity_modifier_params.horizontal_axis_2) > 0:
                        theta = theta
                    else:
                        theta = -theta
                    # tangential velocity of a rigid rotation about the axis
                    axis1_scale = (
                        -horizontal_distance
                        * wp.sin(theta)
                        * velocity_modifier_params.rotation_scale
                    )
                    axis2_scale = (
                        horizontal_distance
                        * wp.cos(theta)
                        * velocity_modifier_params.rotation_scale
                    )
                    # NOTE(review): this reads the Python closure variable
                    # `translation_scale` (baked in when the kernel is defined)
                    # rather than velocity_modifier_params.translation_scale,
                    # unlike rotation_scale above -- confirm this is intended.
                    axis_vertical_scale = translation_scale
                    state.particle_v[p] = (
                        axis1_scale * velocity_modifier_params.horizontal_axis_1
                        + axis2_scale * velocity_modifier_params.horizontal_axis_2
                        + axis_vertical_scale * velocity_modifier_params.normal
                    )
self.particle_velocity_modifiers.append(modify_particle_v_before_p2g) # given normal direction, say [0,0,1] # gradually release grid velocities from start position to end position def release_particles_sequentially( self, normal, start_position, end_position, num_layers, start_time, end_time ): num_layers = 50 point = [0, 0, 0] size = [0, 0, 0] axis = -1 for i in range(3): if normal[i] == 0: point[i] = 1 size[i] = 1 else: axis = i point[i] = end_position half_length_portion = wp.abs(start_position - end_position) / num_layers end_time_portion = end_time / num_layers for i in range(num_layers): size[axis] = half_length_portion * (num_layers - i) self.enforce_particle_velocity_translation( point=point, size=size, velocity=[0, 0, 0], start_time=start_time, end_time=end_time_portion * (i + 1), ) def enforce_particle_velocity_by_mask( self, mpm_state, selection_mask: torch.Tensor, velocity, start_time, end_time, ): # first select certain particles based on position velocity_modifier_params = ParticleVelocityModifier() velocity_modifier_params.velocity = wp.vec3( velocity[0], velocity[1], velocity[2] ) velocity_modifier_params.start_time = start_time velocity_modifier_params.end_time = end_time velocity_modifier_params.mask = wp.from_torch(selection_mask) self.particle_velocity_modifier_params.append(velocity_modifier_params) @wp.kernel def modify_particle_v_before_p2g( time: float, state: MPMStateStruct, velocity_modifier_params: ParticleVelocityModifier, ): p = wp.tid() if ( time >= velocity_modifier_params.start_time and time < velocity_modifier_params.end_time ): if velocity_modifier_params.mask[p] == 1: state.particle_v[p] = velocity_modifier_params.velocity self.particle_velocity_modifiers.append(modify_particle_v_before_p2g) def restart_and_compute_F_C(self, mpm_model, mpm_state, target_pos, device): grid_size = ( mpm_model.grid_dim_x, mpm_model.grid_dim_y, mpm_model.grid_dim_z, ) wp.launch( kernel=zero_grid, # gradient might gone dim=(grid_size), 
            inputs=[mpm_state, mpm_model],
            device=device,
        )
        # P2G: scatter mass-weighted displacement (target - current) to the grid
        wp.launch(
            set_F_C_p2g,
            dim=self.n_particles,
            inputs=[mpm_state, mpm_model, target_pos],
            device=device,
        )
        # normalize by grid mass (dt = 0, so the gravity term is zero)
        wp.launch(
            kernel=grid_normalization_and_gravity,
            dim=(grid_size),
            inputs=[mpm_state, mpm_model, 0],
            device=device,
        )
        # G2P: gather the displacement field back to rebuild F_trial (and cov)
        wp.launch(
            set_F_C_g2p,
            dim=self.n_particles,
            inputs=[mpm_state, mpm_model],
            device=device,
        )
        wp.launch(
            kernel=zero_grid,  # gradient might gone
            dim=(grid_size),
            inputs=[mpm_state, mpm_model],
            device=device,
        )

        # set position to target_pos
        wp.launch(
            kernel=set_vec3_to_vec3,
            dim=self.n_particles,
            inputs=[mpm_state.particle_x, target_pos],
            device=device,
        )


================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup_jan10/mpm_utils.py ================================================
import warp as wp
from mpm_data_structure import *
import numpy as np
import math


# compute stress from F
@wp.func
def kirchoff_stress_FCR(
    F: wp.mat33, U: wp.mat33, V: wp.mat33, J: float, mu: float, lam: float
):
    # compute kirchoff stress for FCR model (remember tau = P F^T)
    # R is the rotation from the polar decomposition of F (via its SVD)
    R = U * wp.transpose(V)
    id = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
    return 2.0 * mu * (F - R) * wp.transpose(F) + id * lam * J * (J - 1.0)


@wp.func
def kirchoff_stress_neoHookean(
    F: wp.mat33, U: wp.mat33, V: wp.mat33, J: float, sig: wp.vec3, mu: float, lam: float
):
    # compute kirchoff stress for the neo-Hookean model (remember tau = P F^T)
    # b holds the squared singular values of F (principal values of F F^T);
    # b_hat is its deviatoric part
    b = wp.vec3(sig[0] * sig[0], sig[1] * sig[1], sig[2] * sig[2])
    b_hat = b - wp.vec3(
        (b[0] + b[1] + b[2]) / 3.0,
        (b[0] + b[1] + b[2]) / 3.0,
        (b[0] + b[1] + b[2]) / 3.0,
    )
    tau = mu * J ** (-2.0 / 3.0) * b_hat + lam / 2.0 * (J * J - 1.0) * wp.vec3(
        1.0, 1.0, 1.0
    )
    # rotate the principal-frame stress back with U/V
    return (
        U
        * wp.mat33(tau[0], 0.0, 0.0, 0.0, tau[1], 0.0, 0.0, 0.0, tau[2])
        * wp.transpose(V)
        * wp.transpose(F)
    )


@wp.func
def kirchoff_stress_StVK(
    F: wp.mat33, U: wp.mat33, V: wp.mat33, sig: wp.vec3, mu: float, lam: float
):
    # StVK in Hencky (log) strain: tau = 2 mu eps + lam tr(eps) 1
    sig = wp.vec3(
        wp.max(sig[0], 0.01), wp.max(sig[1], 0.01), wp.max(sig[2],
            0.01)
    )  # add this to prevent NaN in extrem cases
    epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2]))
    log_sig_sum = wp.log(sig[0]) + wp.log(sig[1]) + wp.log(sig[2])
    ONE = wp.vec3(1.0, 1.0, 1.0)
    tau = 2.0 * mu * epsilon + lam * log_sig_sum * ONE
    return (
        U
        * wp.mat33(tau[0], 0.0, 0.0, 0.0, tau[1], 0.0, 0.0, 0.0, tau[2])
        * wp.transpose(V)
        * wp.transpose(F)
    )


@wp.func
def kirchoff_stress_drucker_prager(
    F: wp.mat33, U: wp.mat33, V: wp.mat33, sig: wp.vec3, mu: float, lam: float
):
    log_sig_sum = wp.log(sig[0]) + wp.log(sig[1]) + wp.log(sig[2])
    # diagonal of the principal-frame stress (log-strain elasticity scaled
    # by 1/sig per principal direction)
    center00 = 2.0 * mu * wp.log(sig[0]) * (1.0 / sig[0]) + lam * log_sig_sum * (
        1.0 / sig[0]
    )
    center11 = 2.0 * mu * wp.log(sig[1]) * (1.0 / sig[1]) + lam * log_sig_sum * (
        1.0 / sig[1]
    )
    center22 = 2.0 * mu * wp.log(sig[2]) * (1.0 / sig[2]) + lam * log_sig_sum * (
        1.0 / sig[2]
    )
    center = wp.mat33(center00, 0.0, 0.0, 0.0, center11, 0.0, 0.0, 0.0, center22)
    return U * center * wp.transpose(V) * wp.transpose(F)


@wp.func
def von_mises_return_mapping(F_trial: wp.mat33, model: MPMModelStruct, p: int):
    # Radial return mapping in log-strain space: if the deviatoric stress
    # exceeds the yield stress, project the trial strain back onto the yield
    # surface and rebuild the elastic deformation gradient.
    U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    sig_old = wp.vec3(0.0)
    wp.svd3(F_trial, U, sig_old, V)
    sig = wp.vec3(
        wp.max(sig_old[0], 0.01), wp.max(sig_old[1], 0.01), wp.max(sig_old[2], 0.01)
    )  # add this to prevent NaN in extrem cases
    epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2]))
    temp = (epsilon[0] + epsilon[1] + epsilon[2]) / 3.0
    tau = 2.0 * model.mu[p] * epsilon + model.lam[p] * (
        epsilon[0] + epsilon[1] + epsilon[2]
    ) * wp.vec3(1.0, 1.0, 1.0)
    sum_tau = tau[0] + tau[1] + tau[2]
    # deviatoric part of tau: its norm measures distance to the yield surface
    cond = wp.vec3(
        tau[0] - sum_tau / 3.0, tau[1] - sum_tau / 3.0, tau[2] - sum_tau / 3.0
    )
    if wp.length(cond) > model.yield_stress[p]:
        epsilon_hat = epsilon - wp.vec3(temp, temp, temp)
        epsilon_hat_norm = wp.length(epsilon_hat) + 1e-6
        delta_gamma = epsilon_hat_norm - model.yield_stress[p] / (2.0 * model.mu[p])
        # scale the deviatoric strain back onto the yield surface
        epsilon = epsilon - (delta_gamma / epsilon_hat_norm) *
epsilon_hat
        sig_elastic = wp.mat33(
            wp.exp(epsilon[0]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon[1]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon[2]),
        )
        # rebuild the elastic deformation gradient from the projected strains
        F_elastic = U * sig_elastic * wp.transpose(V)
        if model.hardening == 1:
            # isotropic hardening: expand the yield surface
            model.yield_stress[p] = (
                model.yield_stress[p] + 2.0 * model.mu[p] * model.xi * delta_gamma
            )
        return F_elastic
    else:
        return F_trial


@wp.func
def von_mises_return_mapping_with_damage(
    F_trial: wp.mat33, model: MPMModelStruct, p: int
):
    # Same radial return as von_mises_return_mapping, but the yield stress
    # softens with accumulated plastic flow; once it reaches zero the particle
    # is treated as fully damaged (mu = lam = 0).
    U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    sig_old = wp.vec3(0.0)
    wp.svd3(F_trial, U, sig_old, V)
    sig = wp.vec3(
        wp.max(sig_old[0], 0.01), wp.max(sig_old[1], 0.01), wp.max(sig_old[2], 0.01)
    )  # add this to prevent NaN in extrem cases
    epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2]))
    temp = (epsilon[0] + epsilon[1] + epsilon[2]) / 3.0
    tau = 2.0 * model.mu[p] * epsilon + model.lam[p] * (
        epsilon[0] + epsilon[1] + epsilon[2]
    ) * wp.vec3(1.0, 1.0, 1.0)
    sum_tau = tau[0] + tau[1] + tau[2]
    cond = wp.vec3(
        tau[0] - sum_tau / 3.0, tau[1] - sum_tau / 3.0, tau[2] - sum_tau / 3.0
    )
    if wp.length(cond) > model.yield_stress[p]:
        if model.yield_stress[p] <= 0:
            # already fully damaged: no elastic response left to project
            return F_trial
        epsilon_hat = epsilon - wp.vec3(temp, temp, temp)
        epsilon_hat_norm = wp.length(epsilon_hat) + 1e-6
        delta_gamma = epsilon_hat_norm - model.yield_stress[p] / (2.0 * model.mu[p])
        epsilon = epsilon - (delta_gamma / epsilon_hat_norm) * epsilon_hat
        # softening: shrink the yield surface by the plastic strain increment
        model.yield_stress[p] = model.yield_stress[p] - model.softening * wp.length(
            (delta_gamma / epsilon_hat_norm) * epsilon_hat
        )
        if model.yield_stress[p] <= 0:
            model.mu[p] = 0.0
            model.lam[p] = 0.0
        sig_elastic = wp.mat33(
            wp.exp(epsilon[0]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon[1]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon[2]),
        )
        F_elastic = U * sig_elastic * wp.transpose(V)
        if model.hardening == 1:
            model.yield_stress[p] = (
                model.yield_stress[p] + 2.0 * model.mu[p] * model.xi * delta_gamma
            )
        return F_elastic
    else:
        return F_trial


# for toothpaste
@wp.func
def
viscoplasticity_return_mapping_with_StVK(
    F_trial: wp.mat33, model: MPMModelStruct, p: int, dt: float
):
    # Viscoplastic return mapping on top of StVK (log-strain): deviatoric
    # stress in excess of the yield surface relaxes at a rate controlled by
    # model.plastic_viscosity (larger viscosity -> slower relaxation).
    U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    sig_old = wp.vec3(0.0)
    wp.svd3(F_trial, U, sig_old, V)
    sig = wp.vec3(
        wp.max(sig_old[0], 0.01), wp.max(sig_old[1], 0.01), wp.max(sig_old[2], 0.01)
    )  # add this to prevent NaN in extrem cases
    b_trial = wp.vec3(sig[0] * sig[0], sig[1] * sig[1], sig[2] * sig[2])
    epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2]))
    trace_epsilon = epsilon[0] + epsilon[1] + epsilon[2]
    epsilon_hat = epsilon - wp.vec3(
        trace_epsilon / 3.0, trace_epsilon / 3.0, trace_epsilon / 3.0
    )
    # deviatoric trial stress and its magnitude
    s_trial = 2.0 * model.mu[p] * epsilon_hat
    s_trial_norm = wp.length(s_trial)
    y = s_trial_norm - wp.sqrt(2.0 / 3.0) * model.yield_stress[p]
    if y > 0:
        mu_hat = model.mu[p] * (b_trial[0] + b_trial[1] + b_trial[2]) / 3.0
        # viscous relaxation of the excess stress toward the yield surface
        s_new_norm = s_trial_norm - y / (
            1.0 + model.plastic_viscosity / (2.0 * mu_hat * dt)
        )
        s_new = (s_new_norm / s_trial_norm) * s_trial
        # back to log-strain, restoring the volumetric part
        epsilon_new = 1.0 / (2.0 * model.mu[p]) * s_new + wp.vec3(
            trace_epsilon / 3.0, trace_epsilon / 3.0, trace_epsilon / 3.0
        )
        sig_elastic = wp.mat33(
            wp.exp(epsilon_new[0]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon_new[1]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon_new[2]),
        )
        F_elastic = U * sig_elastic * wp.transpose(V)
        return F_elastic
    else:
        return F_trial


@wp.func
def sand_return_mapping(
    F_trial: wp.mat33, state: MPMStateStruct, model: MPMModelStruct, p: int
):
    # Drucker-Prager style return mapping for granular material.
    U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    sig = wp.vec3(0.0)
    wp.svd3(F_trial, U, sig, V)

    epsilon = wp.vec3(
        wp.log(wp.max(wp.abs(sig[0]), 1e-14)),
        wp.log(wp.max(wp.abs(sig[1]), 1e-14)),
        wp.log(wp.max(wp.abs(sig[2]), 1e-14)),
    )
    sigma_out = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
    tr = epsilon[0] + epsilon[1] + epsilon[2]  # + state.particle_Jp[p]
    epsilon_hat = epsilon - wp.vec3(tr / 3.0, tr / 3.0, tr / 3.0)
    epsilon_hat_norm = wp.length(epsilon_hat)
    delta_gamma = (
        epsilon_hat_norm
        + (3.0 * model.lam[p] + 2.0 * model.mu[p])
        / (2.0 * model.mu[p])
        * tr
        * model.alpha
    )

    if delta_gamma <= 0:
        # inside the yield cone: fully elastic
        F_elastic = F_trial

    if delta_gamma > 0 and tr > 0:
        # volumetric expansion: project to the cone tip
        F_elastic = U * wp.transpose(V)

    if delta_gamma > 0 and tr <= 0:
        # compression: shift log-strains back onto the cone surface
        H = epsilon - epsilon_hat * (delta_gamma / epsilon_hat_norm)
        s_new = wp.vec3(wp.exp(H[0]), wp.exp(H[1]), wp.exp(H[2]))
        F_elastic = U * wp.diag(s_new) * wp.transpose(V)
    return F_elastic


@wp.kernel
def compute_mu_lam_from_E_nu(state: MPMStateStruct, model: MPMModelStruct):
    # standard Lame-parameter conversion from Young's modulus / Poisson ratio
    p = wp.tid()
    model.mu[p] = model.E[p] / (2.0 * (1.0 + model.nu[p]))
    model.lam[p] = (
        model.E[p] * model.nu[p] / ((1.0 + model.nu[p]) * (1.0 - 2.0 * model.nu[p]))
    )


@wp.kernel
def zero_grid(state: MPMStateStruct, model: MPMModelStruct):
    # reset grid mass and both velocity buffers before the next P2G transfer
    grid_x, grid_y, grid_z = wp.tid()
    state.grid_m[grid_x, grid_y, grid_z] = 0.0
    state.grid_v_in[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0)
    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0)


@wp.func
def compute_dweight(
    model: MPMModelStruct, w: wp.mat33, dw: wp.mat33, i: int, j: int, k: int
):
    # gradient of the tensor-product interpolation weight for node offset (i, j, k)
    dweight = wp.vec3(
        dw[0, i] * w[1, j] * w[2, k],
        w[0, i] * dw[1, j] * w[2, k],
        w[0, i] * w[1, j] * dw[2, k],
    )
    return dweight * model.inv_dx


@wp.func
def update_cov(state: MPMStateStruct, p: int, grad_v: wp.mat33, dt: float):
    # Evolve the per-particle covariance (stored as 6 upper-triangular floats)
    # with the velocity gradient: cov' = cov + dt (grad_v cov + cov grad_v^T).
    cov_n = wp.mat33(0.0)
    cov_n[0, 0] = state.particle_cov[p * 6]
    cov_n[0, 1] = state.particle_cov[p * 6 + 1]
    cov_n[0, 2] = state.particle_cov[p * 6 + 2]
    cov_n[1, 0] = state.particle_cov[p * 6 + 1]
    cov_n[1, 1] = state.particle_cov[p * 6 + 3]
    cov_n[1, 2] = state.particle_cov[p * 6 + 4]
    cov_n[2, 0] = state.particle_cov[p * 6 + 2]
    cov_n[2, 1] = state.particle_cov[p * 6 + 4]
    cov_n[2, 2] = state.particle_cov[p * 6 + 5]

    cov_np1 = cov_n + dt * (grad_v * cov_n + cov_n * wp.transpose(grad_v))

    state.particle_cov[p * 6] = cov_np1[0, 0]
    state.particle_cov[p * 6 + 1] = cov_np1[0, 1]
    state.particle_cov[p * 6 + 2] = cov_np1[0, 2]
    state.particle_cov[p * 6 + 3] =
cov_np1[1, 1]
    state.particle_cov[p * 6 + 4] = cov_np1[1, 2]
    state.particle_cov[p * 6 + 5] = cov_np1[2, 2]


@wp.kernel
def p2g_apic_with_stress(state: MPMStateStruct, model: MPMModelStruct, dt: float):
    # input given to p2g: particle_stress
    # particle_x
    # particle_v
    # particle_C
    # output: grid_v_in, grid_m
    p = wp.tid()
    if state.particle_selection[p] == 0:
        stress = state.particle_stress[p]
        grid_pos = state.particle_x[p] * model.inv_dx
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        # quadratic B-spline interpolation weights, 3 nodes per axis
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )
        dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5))
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    dpos = (
                        wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx
                    ) * model.dx
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    weight = w[0, i] * w[1, j] * w[2, k]  # tricubic interpolation
                    dweight = compute_dweight(model, w, dw, i, j, k)
                    C = state.particle_C[p]
                    # if model.rpic = 0, standard apic
                    # RPIC damping blends C toward its skew-symmetric part
                    C = (1.0 - model.rpic_damping) * C + model.rpic_damping / 2.0 * (
                        C - wp.transpose(C)
                    )
                    # C = (1.0 - model.rpic_damping) * state.particle_C[
                    #     p
                    # ] + model.rpic_damping / 2.0 * (
                    #     state.particle_C[p] - wp.transpose(state.particle_C[p])
                    # )
                    if model.rpic_damping < -0.001:
                        # standard pic
                        C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

                    # internal force contribution: -V0 * stress * grad(weight)
                    elastic_force = -state.particle_vol[p] * stress * dweight
                    # scatter momentum (APIC affine term included) plus impulse
                    v_in_add = (
                        weight * state.particle_mass[p] * (state.particle_v[p] + C * dpos)
                        + dt * elastic_force
                    )
                    wp.atomic_add(state.grid_v_in, ix, iy, iz, v_in_add)
                    wp.atomic_add(
                        state.grid_m, ix, iy, iz, weight * state.particle_mass[p]
                    )


# add gravity
@wp.kernel
def grid_normalization_and_gravity(
    state:
    MPMStateStruct, model: MPMModelStruct, dt: float
):
    # convert accumulated grid momentum into velocity and apply gravity
    grid_x, grid_y, grid_z = wp.tid()
    if state.grid_m[grid_x, grid_y, grid_z] > 1e-15:
        # threshold avoids dividing by (near-)zero mass on empty cells
        v_out = state.grid_v_in[grid_x, grid_y, grid_z] * (
            1.0 / state.grid_m[grid_x, grid_y, grid_z]
        )
        # add gravity
        v_out = v_out + dt * model.gravitational_accelaration
        state.grid_v_out[grid_x, grid_y, grid_z] = v_out


@wp.kernel
def g2p(state: MPMStateStruct, model: MPMModelStruct, dt: float):
    # Gather grid velocities back to particles: update particle velocity,
    # the APIC affine matrix C, position, and the trial deformation gradient.
    p = wp.tid()
    if state.particle_selection[p] == 0:
        grid_pos = state.particle_x[p] * model.inv_dx
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )
        dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5))
        new_v = wp.vec3(0.0, 0.0, 0.0)
        new_C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        new_F = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    dpos = wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx
                    weight = w[0, i] * w[1, j] * w[2, k]  # tricubic interpolation
                    grid_v = state.grid_v_out[ix, iy, iz]
                    new_v = new_v + grid_v * weight
                    # APIC affine-velocity accumulation (inv_dx * 4 factor)
                    new_C = new_C + wp.outer(grid_v, dpos) * (
                        weight * model.inv_dx * 4.0
                    )
                    dweight = compute_dweight(model, w, dw, i, j, k)
                    # velocity-gradient accumulation for the F update
                    new_F = new_F + wp.outer(grid_v, dweight)

        state.particle_v[p] = new_v
        # state.particle_x[p] = state.particle_x[p] + dt * new_v
        # state.particle_x[p] = state.particle_x[p] + dt * state.particle_v[p]
        # wp.atomic_add(state.particle_x, p, dt * state.particle_v[p])  # old one is this..
wp.atomic_add(state.particle_x, p, dt * new_v) # debug # new_x = state.particle_x[p] + dt * state.particle_v[p] # state.particle_x[p] = new_x state.particle_C[p] = new_C I33 = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) F_tmp = (I33 + new_F * dt) * state.particle_F[p] state.particle_F_trial[p] = F_tmp # debug for jelly # wp.atomic_add(state.particle_F_trial, p, new_F * dt * state.particle_F[p]) if model.update_cov_with_F: update_cov(state, p, new_F, dt) @wp.kernel def clip_particle_x(state: MPMStateStruct, model: MPMModelStruct): p = wp.tid() posx = state.particle_x[p] if state.particle_selection[p] == 0: dx = 1.0 / model.inv_dx a_min = dx * 2.0 a_max = model.grid_lim - dx * 2.0 new_x = wp.vec3(wp.clamp(posx[0], a_min, a_max), wp.clamp(posx[1], a_min, a_max), wp.clamp(posx[2], a_min, a_max)) delta_x = new_x - posx wp.atomic_add(state.particle_x, p, delta_x) # compute (Kirchhoff) stress = stress(returnMap(F_trial)) @wp.kernel def compute_stress_from_F_trial( state: MPMStateStruct, model: MPMModelStruct, dt: float ): p = wp.tid() if state.particle_selection[p] == 0: # apply return mapping if model.material == 1: # metal state.particle_F[p] = von_mises_return_mapping( state.particle_F_trial[p], model, p ) elif model.material == 2: # sand state.particle_F[p] = sand_return_mapping( state.particle_F_trial[p], state, model, p ) elif model.material == 3: # visplas, with StVk+VM, no thickening state.particle_F[p] = viscoplasticity_return_mapping_with_StVK( state.particle_F_trial[p], model, p, dt ) elif model.material == 5: state.particle_F[p] = von_mises_return_mapping_with_damage( state.particle_F_trial[p], model, p ) else: # elastic, jelly state.particle_F[p] = state.particle_F_trial[p] # also compute stress here J = wp.determinant(state.particle_F[p]) U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) sig = wp.vec3(0.0) stress = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) 
        wp.svd3(state.particle_F[p], U, sig, V)

        # dispatch to the constitutive model matching the return mapping above
        if model.material == 0 or model.material == 5:
            stress = kirchoff_stress_FCR(
                state.particle_F[p], U, V, J, model.mu[p], model.lam[p]
            )
        if model.material == 1:
            stress = kirchoff_stress_StVK(
                state.particle_F[p], U, V, sig, model.mu[p], model.lam[p]
            )
        if model.material == 2:
            stress = kirchoff_stress_drucker_prager(
                state.particle_F[p], U, V, sig, model.mu[p], model.lam[p]
            )
        if model.material == 3:
            # temporarily use stvk, subject to change
            stress = kirchoff_stress_StVK(
                state.particle_F[p], U, V, sig, model.mu[p], model.lam[p]
            )

        # stress = (stress + wp.transpose(stress)) / 2.0  # enfore symmetry
        state.particle_stress[p] = (stress + wp.transpose(stress)) / 2.0


@wp.kernel
def compute_cov_from_F(state: MPMStateStruct, model: MPMModelStruct):
    # Push the initial covariance forward by the deformation gradient:
    # cov = F cov_0 F^T (stored as 6 upper-triangular floats per particle).
    p = wp.tid()

    F = state.particle_F_trial[p]

    init_cov = wp.mat33(0.0)
    init_cov[0, 0] = state.particle_init_cov[p * 6]
    init_cov[0, 1] = state.particle_init_cov[p * 6 + 1]
    init_cov[0, 2] = state.particle_init_cov[p * 6 + 2]
    init_cov[1, 0] = state.particle_init_cov[p * 6 + 1]
    init_cov[1, 1] = state.particle_init_cov[p * 6 + 3]
    init_cov[1, 2] = state.particle_init_cov[p * 6 + 4]
    init_cov[2, 0] = state.particle_init_cov[p * 6 + 2]
    init_cov[2, 1] = state.particle_init_cov[p * 6 + 4]
    init_cov[2, 2] = state.particle_init_cov[p * 6 + 5]

    cov = F * init_cov * wp.transpose(F)

    state.particle_cov[p * 6] = cov[0, 0]
    state.particle_cov[p * 6 + 1] = cov[0, 1]
    state.particle_cov[p * 6 + 2] = cov[0, 2]
    state.particle_cov[p * 6 + 3] = cov[1, 1]
    state.particle_cov[p * 6 + 4] = cov[1, 2]
    state.particle_cov[p * 6 + 5] = cov[2, 2]


@wp.kernel
def compute_R_from_F(state: MPMStateStruct, model: MPMModelStruct):
    # extract the rotation part of F via polar decomposition (through SVD)
    p = wp.tid()
    F = state.particle_F_trial[p]

    # polar svd decomposition
    U = wp.mat33(0.0)
    V = wp.mat33(0.0)
    sig = wp.vec3(0.0)
    wp.svd3(F, U, sig, V)

    # flip the last column where needed so both factors are proper rotations
    if wp.determinant(U) < 0.0:
        U[0, 2] = -U[0, 2]
        U[1, 2] = -U[1, 2]
        U[2, 2] = -U[2, 2]
    if wp.determinant(V) < 0.0:
        V[0, 2] = -V[0, 2]
        V[1, 2] = -V[1, 2]
        V[2, 2] = -V[2, 2]

    # compute rotation matrix
    R = U * wp.transpose(V)
    # NOTE(review): the transpose of R (i.e. V U^T) is stored -- confirm
    # downstream consumers expect this convention.
    state.particle_R[p] = wp.transpose(R)


@wp.kernel
def add_damping_via_grid(state: MPMStateStruct, scale: float):
    grid_x, grid_y, grid_z = wp.tid()
    # state.grid_v_out[grid_x, grid_y, grid_z] = (
    #     state.grid_v_out[grid_x, grid_y, grid_z] * scale
    # )
    # v -= (1 - scale) * v, written as atomic_sub instead of a plain scale
    wp.atomic_sub(state.grid_v_out, grid_x, grid_y, grid_z, (1.0 - scale) * state.grid_v_out[grid_x, grid_y, grid_z])


@wp.kernel
def apply_additional_params(
    state: MPMStateStruct,
    model: MPMModelStruct,
    params_modifier: MaterialParamsModifier,
):
    # override material parameters for particles inside an axis-aligned box
    p = wp.tid()
    pos = state.particle_x[p]
    if (
        pos[0] > params_modifier.point[0] - params_modifier.size[0]
        and pos[0] < params_modifier.point[0] + params_modifier.size[0]
        and pos[1] > params_modifier.point[1] - params_modifier.size[1]
        and pos[1] < params_modifier.point[1] + params_modifier.size[1]
        and pos[2] > params_modifier.point[2] - params_modifier.size[2]
        and pos[2] < params_modifier.point[2] + params_modifier.size[2]
    ):
        model.E[p] = params_modifier.E
        model.nu[p] = params_modifier.nu
        state.particle_density[p] = params_modifier.density


@wp.kernel
def selection_add_impulse_on_particles(
    state: MPMStateStruct, impulse_modifier: Impulse_modifier
):
    # mask[p] = 1 for particles inside the impulse box, else 0
    p = wp.tid()
    offset = state.particle_x[p] - impulse_modifier.point
    if (
        wp.abs(offset[0]) < impulse_modifier.size[0]
        and wp.abs(offset[1]) < impulse_modifier.size[1]
        and wp.abs(offset[2]) < impulse_modifier.size[2]
    ):
        impulse_modifier.mask[p] = 1
    else:
        impulse_modifier.mask[p] = 0


@wp.kernel
def selection_enforce_particle_velocity_translation(
    state: MPMStateStruct, velocity_modifier: ParticleVelocityModifier
):
    # mask[p] = 1 for particles inside the selection box, else 0
    p = wp.tid()
    offset = state.particle_x[p] - velocity_modifier.point
    if (
        wp.abs(offset[0]) < velocity_modifier.size[0]
        and wp.abs(offset[1]) < velocity_modifier.size[1]
        and wp.abs(offset[2]) < velocity_modifier.size[2]
    ):
        velocity_modifier.mask[p] = 1
    else:
        velocity_modifier.mask[p] = 0


@wp.kernel
def selection_enforce_particle_velocity_cylinder(
    state: MPMStateStruct,
    velocity_modifier: ParticleVelocityModifier
):
    # mask[p] = 1 for particles inside the cylinder whose axis passes through
    # `point` along `normal`, with (half-height, radius) in
    # half_height_and_radius; else 0.
    p = wp.tid()
    offset = state.particle_x[p] - velocity_modifier.point
    vertical_distance = wp.abs(wp.dot(offset, velocity_modifier.normal))
    horizontal_distance = wp.length(
        offset - wp.dot(offset, velocity_modifier.normal) * velocity_modifier.normal
    )
    if (
        vertical_distance < velocity_modifier.half_height_and_radius[0]
        and horizontal_distance < velocity_modifier.half_height_and_radius[1]
    ):
        velocity_modifier.mask[p] = 1
    else:
        velocity_modifier.mask[p] = 0


@wp.kernel
def compute_position_l2_loss(
    mpm_state: MPMStateStruct,
    gt_pos: wp.array(dtype=wp.vec3),
    loss: wp.array(dtype=float),
):
    # accumulate sum over particles of ||x - x_gt|| into loss[0]
    tid = wp.tid()
    pos = mpm_state.particle_x[tid]
    pos_gt = gt_pos[tid]

    # l1_diff = wp.abs(pos - pos_gt)
    l2 = wp.length(pos - pos_gt)
    wp.atomic_add(loss, 0, l2)


@wp.kernel
def aggregate_grad(x: wp.array(dtype=float), grad: wp.array(dtype=float)):
    # sum all entries of grad into x[0]
    tid = wp.tid()
    # gradient descent step
    wp.atomic_add(x, 0, grad[tid])


@wp.kernel
def set_F_C_p2g(state: MPMStateStruct, model: MPMModelStruct, target_pos: wp.array(dtype=wp.vec3)):
    # P2G pass used by restart: scatter the mass-weighted displacement
    # (target_pos - particle_x) into grid_v_in and mass into grid_m.
    p = wp.tid()
    if state.particle_selection[p] == 0:
        grid_pos = state.particle_x[p] * model.inv_dx
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )

        # p2g for displacement
        particle_disp = target_pos[p] - state.particle_x[p]
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    weight = w[0, i] * w[1, j] * w[2, k]  # tricubic interpolation
                    v_in_add = weight * state.particle_mass[p] * particle_disp
                    wp.atomic_add(state.grid_v_in, ix, iy, iz, v_in_add)
                    wp.atomic_add(
                        state.grid_m, ix, iy, iz, weight *
                        state.particle_mass[p]
                    )


@wp.kernel
def set_F_C_g2p(state: MPMStateStruct, model: MPMModelStruct):
    # G2P pass used by restart: read the normalized displacement field from
    # grid_v_out and rebuild the displacement gradient to form F_trial.
    p = wp.tid()
    if state.particle_selection[p] == 0:
        grid_pos = state.particle_x[p] * model.inv_dx
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )
        dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5))
        new_C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        new_F = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

        # g2p for C and F
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    dpos = wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx
                    weight = w[0, i] * w[1, j] * w[2, k]  # tricubic interpolation
                    grid_v = state.grid_v_out[ix, iy, iz]
                    new_C = new_C + wp.outer(grid_v, dpos) * (
                        weight * model.inv_dx * 4.0
                    )
                    dweight = compute_dweight(model, w, dw, i, j, k)
                    new_F = new_F + wp.outer(grid_v, dweight)

        # C should still be zero..
# state.particle_C[p] = new_C I33 = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) F_tmp = (I33 + new_F) state.particle_F_trial[p] = F_tmp if model.update_cov_with_F: update_cov(state, p, new_F, 1.0) @wp.kernel def compute_posloss_with_grad( mpm_state: MPMStateStruct, gt_pos: wp.array(dtype=wp.vec3), grad: wp.array(dtype=wp.vec3), dt: float, loss: wp.array(dtype=float), ): tid = wp.tid() pos = mpm_state.particle_x[tid] pos_gt = gt_pos[tid] # l1_diff = wp.abs(pos - pos_gt) l2 = wp.length(pos - (pos_gt - grad[tid] * dt)) wp.atomic_add(loss, 0, l2) ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/backup_jan10/warp_utils.py ================================================ import warp as wp import ctypes from typing import Optional from warp.torch import ( dtype_from_torch, device_from_torch, dtype_is_compatible, from_torch, ) def from_torch_safe(t, dtype=None, requires_grad=None, grad=None): """Wrap a PyTorch tensor to a Warp array without copying the data. Args: t (torch.Tensor): The torch tensor to wrap. dtype (warp.dtype, optional): The target data type of the resulting Warp array. Defaults to the tensor value type mapped to a Warp array value type. requires_grad (bool, optional): Whether the resulting array should wrap the tensor's gradient, if it exists (the grad tensor will be allocated otherwise). Defaults to the tensor's `requires_grad` value. Returns: warp.array: The wrapped array. 
""" if dtype is None: dtype = dtype_from_torch(t.dtype) elif not dtype_is_compatible(t.dtype, dtype): raise RuntimeError(f"Incompatible data types: {t.dtype} and {dtype}") # get size of underlying data type to compute strides ctype_size = ctypes.sizeof(dtype._type_) shape = tuple(t.shape) strides = tuple(s * ctype_size for s in t.stride()) # if target is a vector or matrix type # then check if trailing dimensions match # the target type and update the shape if hasattr(dtype, "_shape_"): dtype_shape = dtype._shape_ dtype_dims = len(dtype._shape_) if dtype_dims > len(shape) or dtype_shape != shape[-dtype_dims:]: raise RuntimeError( f"Could not convert Torch tensor with shape {shape} to Warp array with dtype={dtype}, ensure that source inner shape is {dtype_shape}" ) # ensure the inner strides are contiguous stride = ctype_size for i in range(dtype_dims): if strides[-i - 1] != stride: raise RuntimeError( f"Could not convert Torch tensor with shape {shape} to Warp array with dtype={dtype}, because the source inner strides are not contiguous" ) stride *= dtype_shape[-i - 1] shape = tuple(shape[:-dtype_dims]) or (1,) strides = tuple(strides[:-dtype_dims]) or (ctype_size,) requires_grad = t.requires_grad if requires_grad is None else requires_grad if grad is not None: if not isinstance(grad, wp.array): import torch if isinstance(grad, torch.Tensor): grad = from_torch(grad, dtype=dtype) else: raise ValueError(f"Invalid gradient type: {type(grad)}") elif requires_grad: # wrap the tensor gradient, allocate if necessary if t.grad is None: # allocate a zero-filled gradient tensor if it doesn't exist import torch t.grad = torch.zeros_like(t, requires_grad=False) grad = from_torch(t.grad, dtype=dtype) a = wp.types.array( ptr=t.data_ptr(), dtype=dtype, shape=shape, strides=strides, device=device_from_torch(t.device), copy=False, owner=False, grad=grad, requires_grad=requires_grad, ) # save a reference to the source tensor, otherwise it will be deallocated a._tensor = t return a 
class MyTape(wp.Tape): # returns the adjoint of a kernel parameter def get_adjoint(self, a): if not wp.types.is_array(a) and not isinstance(a, wp.codegen.StructInstance): # if input is a simple type (e.g.: float, vec3, etc) then # no gradient needed (we only return gradients through arrays and structs) return a elif wp.types.is_array(a) and a.grad: # keep track of all gradients used by the tape (for zeroing) # ignore the scalar loss since we don't want to clear its grad self.gradients[a] = a.grad return a.grad elif isinstance(a, wp.codegen.StructInstance): adj = a._cls() for name, _ in a._cls.ctype._fields_: if name.startswith("_"): continue if isinstance(a._cls.vars[name].type, wp.array): arr = getattr(a, name) if arr is None: continue if arr.grad: grad = self.gradients[arr] = arr.grad else: grad = wp.zeros_like(arr) setattr(adj, name, grad) else: setattr(adj, name, getattr(a, name)) self.gradients[a] = adj return adj return None # from https://github.com/PingchuanMa/NCLaw/blob/main/nclaw/warp/tape.py class CondTape(object): def __init__(self, tape: Optional[MyTape], cond: bool = True) -> None: self.tape = tape self.cond = cond def __enter__(self): if self.tape is not None and self.cond: self.tape.__enter__() def __exit__(self, exc_type, exc_value, traceback): if self.tape is not None and self.cond: self.tape.__exit__(exc_type, exc_value, traceback) ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/gaussian_sim_utils.py ================================================ import numpy as np def get_volume(xyzs: np.ndarray, resolution=128) -> np.ndarray: # set a grid in the range of [-1, 1], with resolution voxel_counts = np.zeros((resolution, resolution, resolution)) points_xyzindex = ((xyzs + 1) / 2 * (resolution - 1)).astype(np.uint32) cell_volume = (2.0 / (resolution - 1)) ** 3 for x, y, z in points_xyzindex: voxel_counts[x, y, z] += 1 points_number_in_corresponding_voxel = voxel_counts[ points_xyzindex[:, 0], 
points_xyzindex[:, 1], points_xyzindex[:, 2] ] points_volume = cell_volume / points_number_in_corresponding_voxel points_volume = points_volume.astype(np.float32) # some statistics num_non_empyt_voxels = np.sum(voxel_counts > 0) max_points_in_voxel = np.max(voxel_counts) min_points_in_voxel = np.min(voxel_counts) avg_points_in_voxel = np.sum(voxel_counts) / num_non_empyt_voxels print("Number of non-empty voxels: ", num_non_empyt_voxels) print("Max points in voxel: ", max_points_in_voxel) print("Min points in voxel: ", min_points_in_voxel) print("Avg points in voxel: ", avg_points_in_voxel) return points_volume ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/mpm_data_structure.py ================================================ import warp as wp import warp.torch import torch from typing import Optional, Union, Sequence, Any from torch import Tensor import os import sys sys.path.append(os.path.dirname(os.path.realpath(__file__))) from warp_utils import from_torch_safe @wp.struct class MPMStateStruct(object): ###### essential ##### # particle particle_x: wp.array(dtype=wp.vec3) # current position particle_v: wp.array(dtype=wp.vec3) # particle velocity particle_F: wp.array(dtype=wp.mat33) # particle elastic deformation gradient particle_cov: wp.array(dtype=float) # current covariance matrix particle_F_trial: wp.array( dtype=wp.mat33 ) # apply return mapping on this to obtain elastic def grad particle_stress: wp.array(dtype=wp.mat33) # Kirchoff stress, elastic stress particle_C: wp.array(dtype=wp.mat33) particle_vol: wp.array(dtype=float) # current volume particle_mass: wp.array(dtype=float) # mass particle_density: wp.array(dtype=float) # density particle_selection: wp.array( dtype=int ) # only particle_selection[p] = 0 will be simulated # grid grid_m: wp.array(dtype=float, ndim=3) grid_v_in: wp.array(dtype=wp.vec3, ndim=3) # grid node momentum/velocity grid_v_out: wp.array( dtype=wp.vec3, ndim=3 ) # grid node 
momentum/velocity, after grid update def init( self, shape: Union[Sequence[int], int], device: wp.context.Devicelike = None, requires_grad=False, ) -> None: # shape default is int. number of particles self.particle_x = wp.zeros( shape, dtype=wp.vec3, device=device, requires_grad=requires_grad ) self.particle_v = wp.zeros( shape, dtype=wp.vec3, device=device, requires_grad=requires_grad ) self.particle_F = wp.zeros( shape, dtype=wp.mat33, device=device, requires_grad=requires_grad ) self.particle_cov = wp.zeros( shape * 6, dtype=float, device=device, requires_grad=False ) self.particle_F_trial = wp.zeros( shape, dtype=wp.mat33, device=device, requires_grad=requires_grad ) self.particle_stress = wp.zeros( shape, dtype=wp.mat33, device=device, requires_grad=requires_grad ) self.particle_C = wp.zeros( shape, dtype=wp.mat33, device=device, requires_grad=requires_grad ) self.particle_vol = wp.zeros( shape, dtype=float, device=device, requires_grad=False ) self.particle_mass = wp.zeros( shape, dtype=float, device=device, requires_grad=False ) self.particle_density = wp.zeros( shape, dtype=float, device=device, requires_grad=False ) self.particle_selection = wp.zeros( shape, dtype=int, device=device, requires_grad=False ) # grid: will init later self.grid_m = wp.zeros( (10, 10, 10), dtype=float, device=device, requires_grad=requires_grad ) self.grid_v_in = wp.zeros( (10, 10, 10), dtype=wp.vec3, device=device, requires_grad=requires_grad ) self.grid_v_out = wp.zeros( (10, 10, 10), dtype=wp.vec3, device=device, requires_grad=requires_grad ) def init_grid( self, grid_res: int, device: wp.context.Devicelike = None, requires_grad=False ): self.grid_m = wp.zeros( (grid_res, grid_res, grid_res), dtype=float, device=device, requires_grad=False, ) self.grid_v_in = wp.zeros( (grid_res, grid_res, grid_res), dtype=wp.vec3, device=device, requires_grad=requires_grad, ) self.grid_v_out = wp.zeros( (grid_res, grid_res, grid_res), dtype=wp.vec3, device=device, requires_grad=requires_grad, 
) def from_torch( self, tensor_x: Tensor, tensor_volume: Tensor, tensor_cov: Optional[Tensor] = None, tensor_velocity: Optional[Tensor] = None, n_grid: int = 100, grid_lim=1.0, device="cuda:0", requires_grad=True, ): num_dim, n_particles = tensor_x.shape[1], tensor_x.shape[0] assert tensor_x.shape[0] == tensor_volume.shape[0] # assert tensor_x.shape[0] == tensor_cov.reshape(-1, 6).shape[0] self.init_grid(grid_res=n_grid, device=device, requires_grad=requires_grad) if tensor_x is not None: self.particle_x = from_torch_safe( tensor_x.contiguous().detach().clone(), dtype=wp.vec3, requires_grad=requires_grad, ) if tensor_volume is not None: print(self.particle_vol.shape, tensor_volume.shape) volume_numpy = tensor_volume.detach().cpu().numpy() self.particle_vol = wp.from_numpy( volume_numpy, dtype=float, device=device, requires_grad=False ) if tensor_cov is not None: cov_numpy = tensor_cov.reshape(-1).detach().clone().cpu().numpy() self.particle_cov = wp.from_numpy( cov_numpy, dtype=float, device=device, requires_grad=False ) if tensor_velocity is not None: self.particle_v = from_torch_safe( tensor_velocity.contiguous().detach().clone(), dtype=wp.vec3, requires_grad=requires_grad, ) # initial deformation gradient is set to identity wp.launch( kernel=set_mat33_to_identity, dim=n_particles, inputs=[self.particle_F_trial], device=device, ) # initial trial deformation gradient is set to identity print("Particles initialized from torch data.") print("Total particles: ", n_particles) def reset_state( self, tensor_x: Tensor, tensor_cov: Optional[Tensor] = None, tensor_velocity: Optional[Tensor] = None, tensor_density: Optional[Tensor] = None, selection_mask: Optional[Tensor] = None, device="cuda:0", requires_grad=True, ): # reset p_c, p_v, p_C, p_F_trial num_dim, n_particles = tensor_x.shape[1], tensor_x.shape[0] # assert tensor_x.shape[0] == tensor_cov.reshape(-1, 6).shape[0] if tensor_x is not None: self.particle_x = from_torch_safe( tensor_x.contiguous().detach(), 
dtype=wp.vec3, requires_grad=requires_grad, ) if tensor_cov is not None: cov_numpy = tensor_cov.reshape(-1).detach().clone().cpu().numpy() self.particle_cov = wp.from_numpy( cov_numpy, dtype=float, device=device, requires_grad=False ) if tensor_velocity is not None: self.particle_v = from_torch_safe( tensor_velocity.contiguous().detach().clone(), dtype=wp.vec3, requires_grad=requires_grad, ) if tensor_density is not None and selection_mask is not None: wp_density = from_torch_safe( tensor_density.contiguous().detach().clone(), dtype=wp.float32, requires_grad=False, ) # 1 indicate we need to simulate this particle wp_selection_mask = from_torch_safe( selection_mask.contiguous().detach().clone().type(torch.int), dtype=wp.int32, requires_grad=False, ) wp.launch( kernel=set_float_vec_to_vec_wmask, dim=n_particles, inputs=[self.particle_density, wp_density, wp_selection_mask], device=device, ) # initial deformation gradient is set to identity wp.launch( kernel=set_mat33_to_identity, dim=n_particles, inputs=[self.particle_F_trial], device=device, ) wp.launch( kernel=set_mat33_to_identity, dim=n_particles, inputs=[self.particle_F], device=device, ) wp.launch( kernel=set_mat33_to_zero, dim=n_particles, inputs=[self.particle_C], device=device, ) wp.launch( kernel=set_mat33_to_zero, dim=n_particles, inputs=[self.particle_stress], device=device, ) def continue_from_torch( self, tensor_x: Tensor, tensor_velocity: Optional[Tensor] = None, tensor_F: Optional[Tensor] = None, tensor_C: Optional[Tensor] = None, device="cuda:0", requires_grad=True, ): if tensor_x is not None: self.particle_x = from_torch_safe( tensor_x.contiguous().detach(), dtype=wp.vec3, requires_grad=requires_grad, ) if tensor_velocity is not None: self.particle_v = from_torch_safe( tensor_velocity.contiguous().detach().clone(), dtype=wp.vec3, requires_grad=requires_grad, ) if tensor_F is not None: self.particle_F_trial = from_torch_safe( tensor_F.contiguous().detach().clone(), dtype=wp.mat33, 
requires_grad=requires_grad, ) if tensor_C is not None: self.particle_C = from_torch_safe( tensor_C.contiguous().detach().clone(), dtype=wp.mat33, requires_grad=requires_grad, ) def set_require_grad(self, requires_grad=True): self.particle_x.requires_grad = requires_grad self.particle_v.requires_grad = requires_grad self.particle_F.requires_grad = requires_grad self.particle_F_trial.requires_grad = requires_grad self.particle_stress.requires_grad = requires_grad self.particle_C.requires_grad = requires_grad self.grid_v_out.requires_grad = requires_grad self.grid_v_in.requires_grad = requires_grad def reset_density( self, tensor_density: Tensor, selection_mask: Optional[Tensor] = None, device="cuda:0", requires_grad=True, update_mass=False, ): n_particles = tensor_density.shape[0] if tensor_density is not None: wp_density = from_torch_safe( tensor_density.contiguous().detach().clone(), dtype=wp.float32, requires_grad=False, ) if selection_mask is not None: # 1 indicate we need to simulate this particle wp_selection_mask = from_torch_safe( selection_mask.contiguous().detach().clone().type(torch.int), dtype=wp.int32, requires_grad=False, ) wp.launch( kernel=set_float_vec_to_vec_wmask, dim=n_particles, inputs=[self.particle_density, wp_density, wp_selection_mask], device=device, ) else: wp.launch( kernel=set_float_vec_to_vec, dim=n_particles, inputs=[self.particle_density, wp_density], device=device, ) if update_mass: num_particles = self.particle_x.shape[0] wp.launch( kernel=get_float_array_product, dim=num_particles, inputs=[ self.particle_density, self.particle_vol, self.particle_mass, ], device=device, ) def partial_clone(self, device="cuda:0", requires_grad=True): new_state = MPMStateStruct() n_particles = self.particle_x.shape[0] new_state.init(n_particles, device=device, requires_grad=requires_grad) # clone section: # new_state.particle_vol = wp.clone(self.particle_vol, requires_grad=False) # new_state.particle_density = wp.clone(self.particle_density, 
requires_grad=False) # new_state.particle_mass = wp.clone(self.particle_mass, requires_grad=False) # new_state.particle_selection = wp.clone(self.particle_selection, requires_grad=False) wp.copy(new_state.particle_vol, self.particle_vol) wp.copy(new_state.particle_density, self.particle_density) wp.copy(new_state.particle_mass, self.particle_mass) wp.copy(new_state.particle_selection, self.particle_selection) # init grid to zero with grid res. new_state.init_grid( grid_res=self.grid_v_in.shape[0], device=device, requires_grad=requires_grad ) # init some matrix to identity wp.launch( kernel=set_mat33_to_identity, dim=n_particles, inputs=[new_state.particle_F_trial], device=device, ) new_state.set_require_grad(requires_grad=requires_grad) return new_state @wp.struct class MPMModelStruct(object): ####### essential ####### grid_lim: float n_particles: int n_grid: int dx: float inv_dx: float grid_dim_x: int grid_dim_y: int grid_dim_z: int mu: wp.array(dtype=float) lam: wp.array(dtype=float) E: wp.array(dtype=float) nu: wp.array(dtype=float) material: int ######## for plasticity #### yield_stress: wp.array(dtype=float) friction_angle: float alpha: float gravitational_accelaration: wp.vec3 hardening: float xi: float plastic_viscosity: float softening: float ####### for damping rpic_damping: float grid_v_damping_scale: float ####### for PhysGaussian: covariance update_cov_with_F: int def init( self, shape: Union[Sequence[int], int], device: wp.context.Devicelike = None, requires_grad=False, ) -> None: self.E = wp.zeros( shape, dtype=float, device=device, requires_grad=requires_grad ) # young's modulus self.nu = wp.zeros( shape, dtype=float, device=device, requires_grad=requires_grad ) # poisson's ratio self.mu = wp.zeros( shape, dtype=float, device=device, requires_grad=requires_grad ) self.lam = wp.zeros( shape, dtype=float, device=device, requires_grad=requires_grad ) self.yield_stress = wp.zeros( shape, dtype=float, device=device, requires_grad=requires_grad ) def 
finalize_mu_lam(self, n_particles, device="cuda:0"): wp.launch( kernel=compute_mu_lam_from_E_nu_clean, dim=n_particles, inputs=[self.mu, self.lam, self.E, self.nu], device=device, ) def init_other_params(self, n_grid=100, grid_lim=1.0, device="cuda:0"): self.grid_lim = grid_lim self.n_grid = n_grid self.grid_dim_x = n_grid self.grid_dim_y = n_grid self.grid_dim_z = n_grid ( self.dx, self.inv_dx, ) = self.grid_lim / self.n_grid, float( n_grid / grid_lim ) # [0-1]? self.update_cov_with_F = False # material is used to switch between different elastoplastic models. 0 is jelly self.material = 0 self.plastic_viscosity = 0.0 self.softening = 0.1 self.friction_angle = 25.0 sin_phi = wp.sin(self.friction_angle / 180.0 * 3.14159265) self.alpha = wp.sqrt(2.0 / 3.0) * 2.0 * sin_phi / (3.0 - sin_phi) self.gravitational_accelaration = wp.vec3(0.0, 0.0, 0.0) self.rpic_damping = 0.0 # 0.0 if no damping (apic). -1 if pic self.grid_v_damping_scale = 1.1 # globally applied def from_torch( self, tensor_E: Tensor, tensor_nu: Tensor, device="cuda:0", requires_grad=False ): self.E = wp.from_torch(tensor_E.contiguous(), requires_grad=requires_grad) self.nu = wp.from_torch(tensor_nu.contiguous(), requires_grad=requires_grad) n_particles = tensor_E.shape[0] self.finalize_mu_lam(n_particles=n_particles, device=device) def set_require_grad(self, requires_grad=True): self.E.requires_grad = requires_grad self.nu.requires_grad = requires_grad self.mu.requires_grad = requires_grad self.lam.requires_grad = requires_grad # for various boundary conditions @wp.struct class Dirichlet_collider: point: wp.vec3 normal: wp.vec3 direction: wp.vec3 start_time: float end_time: float friction: float surface_type: int velocity: wp.vec3 threshold: float reset: int index: int x_unit: wp.vec3 y_unit: wp.vec3 radius: float v_scale: float width: float height: float length: float R: float size: wp.vec3 horizontal_axis_1: wp.vec3 horizontal_axis_2: wp.vec3 half_height_and_radius: wp.vec2 @wp.struct class 
GridCollider: point: wp.vec3 normal: wp.vec3 direction: wp.vec3 start_time: float end_time: float mask: wp.array(dtype=int, ndim=3) @wp.struct class Impulse_modifier: # this needs to be changed for each different BC! point: wp.vec3 normal: wp.vec3 start_time: float end_time: float force: wp.vec3 forceTimesDt: wp.vec3 numsteps: int point: wp.vec3 size: wp.vec3 mask: wp.array(dtype=int) @wp.struct class MPMtailoredStruct: # this needs to be changed for each different BC! point: wp.vec3 normal: wp.vec3 start_time: float end_time: float friction: float surface_type: int velocity: wp.vec3 threshold: float reset: int point_rotate: wp.vec3 normal_rotate: wp.vec3 x_unit: wp.vec3 y_unit: wp.vec3 radius: float v_scale: float width: float point_plane: wp.vec3 normal_plane: wp.vec3 velocity_plane: wp.vec3 threshold_plane: float @wp.struct class MaterialParamsModifier: point: wp.vec3 size: wp.vec3 E: float nu: float density: float @wp.struct class ParticleVelocityModifier: point: wp.vec3 normal: wp.vec3 half_height_and_radius: wp.vec2 rotation_scale: float translation_scale: float size: wp.vec3 horizontal_axis_1: wp.vec3 horizontal_axis_2: wp.vec3 start_time: float end_time: float velocity: wp.vec3 mask: wp.array(dtype=int) @wp.kernel def compute_mu_lam_from_E_nu_clean( mu: wp.array(dtype=float), lam: wp.array(dtype=float), E: wp.array(dtype=float), nu: wp.array(dtype=float), ): p = wp.tid() mu[p] = E[p] / (2.0 * (1.0 + nu[p])) lam[p] = E[p] * nu[p] / ((1.0 + nu[p]) * (1.0 - 2.0 * nu[p])) @wp.kernel def set_vec3_to_zero(target_array: wp.array(dtype=wp.vec3)): tid = wp.tid() target_array[tid] = wp.vec3(0.0, 0.0, 0.0) @wp.kernel def set_vec3_to_vec3( source_array: wp.array(dtype=wp.vec3), target_array: wp.array(dtype=wp.vec3) ): tid = wp.tid() source_array[tid] = target_array[tid] @wp.kernel def set_float_vec_to_vec_wmask( source_array: wp.array(dtype=float), target_array: wp.array(dtype=float), selection_mask: wp.array(dtype=int), ): tid = wp.tid() if selection_mask[tid] == 1: 
source_array[tid] = target_array[tid] @wp.kernel def set_float_vec_to_vec( source_array: wp.array(dtype=float), target_array: wp.array(dtype=float) ): tid = wp.tid() source_array[tid] = target_array[tid] @wp.kernel def set_mat33_to_identity(target_array: wp.array(dtype=wp.mat33)): tid = wp.tid() target_array[tid] = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) @wp.kernel def set_mat33_to_zero(target_array: wp.array(dtype=wp.mat33)): tid = wp.tid() target_array[tid] = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) @wp.kernel def add_identity_to_mat33(target_array: wp.array(dtype=wp.mat33)): tid = wp.tid() target_array[tid] = wp.add( target_array[tid], wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) ) @wp.kernel def subtract_identity_to_mat33(target_array: wp.array(dtype=wp.mat33)): tid = wp.tid() target_array[tid] = wp.sub( target_array[tid], wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) ) @wp.kernel def add_vec3_to_vec3( first_array: wp.array(dtype=wp.vec3), second_array: wp.array(dtype=wp.vec3) ): tid = wp.tid() first_array[tid] = wp.add(first_array[tid], second_array[tid]) @wp.kernel def set_value_to_float_array(target_array: wp.array(dtype=float), value: float): tid = wp.tid() target_array[tid] = value @wp.kernel def set_warpvalue_to_float_array( target_array: wp.array(dtype=float), value: warp.types.float32 ): tid = wp.tid() target_array[tid] = value @wp.kernel def get_float_array_product( arrayA: wp.array(dtype=float), arrayB: wp.array(dtype=float), arrayC: wp.array(dtype=float), ): tid = wp.tid() arrayC[tid] = arrayA[tid] * arrayB[tid] def torch2warp_quat(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"): assert t.is_contiguous() if t.dtype != torch.float32 and t.dtype != torch.int32: raise RuntimeError( "Error aliasing Torch tensor to Warp array. 
Torch tensor must be float32 or int32 type" ) assert t.shape[1] == 4 a = warp.types.array( ptr=t.data_ptr(), dtype=wp.quat, shape=t.shape[0], copy=False, owner=False, requires_grad=t.requires_grad, # device=t.device.type) device=dvc, ) a.tensor = t return a def torch2warp_float(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"): assert t.is_contiguous() if t.dtype != torch.float32 and t.dtype != torch.int32: raise RuntimeError( "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type" ) a = warp.types.array( ptr=t.data_ptr(), dtype=warp.types.float32, shape=t.shape[0], copy=False, owner=False, requires_grad=t.requires_grad, # device=t.device.type) device=dvc, ) a.tensor = t return a def torch2warp_vec3(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"): assert t.is_contiguous() if t.dtype != torch.float32 and t.dtype != torch.int32: raise RuntimeError( "Error aliasing Torch tensor to Warp array. Torch tensor must be float32 or int32 type" ) assert t.shape[1] == 3 a = warp.types.array( ptr=t.data_ptr(), dtype=wp.vec3, shape=t.shape[0], copy=False, owner=False, requires_grad=t.requires_grad, # device=t.device.type) device=dvc, ) a.tensor = t return a def torch2warp_mat33(t, copy=False, dtype=warp.types.float32, dvc="cuda:0"): assert t.is_contiguous() if t.dtype != torch.float32 and t.dtype != torch.int32: raise RuntimeError( "Error aliasing Torch tensor to Warp array. 
Torch tensor must be float32 or int32 type" ) assert t.shape[1] == 3 a = warp.types.array( ptr=t.data_ptr(), dtype=wp.mat33, shape=t.shape[0], copy=False, owner=False, requires_grad=t.requires_grad, # device=t.device.type) device=dvc, ) a.tensor = t return a ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/mpm_solver_diff.py ================================================ import sys import os import warp as wp sys.path.append(os.path.dirname(os.path.realpath(__file__))) from mpm_data_structure import * from mpm_utils import * from typing import Optional, Union, Sequence, Any, Tuple from jaxtyping import Float, Int, Shaped class MPMWARPDiff(object): # def __init__(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"): # self.initialize(n_particles, n_grid, grid_lim, device=device) # self.time_profile = {} def __init__(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"): self.initialize(n_particles, n_grid, grid_lim, device=device) self.time_profile = {} def initialize(self, n_particles, n_grid=100, grid_lim=1.0, device="cuda:0"): self.n_particles = n_particles self.time = 0.0 self.grid_postprocess = [] self.collider_params = [] self.modify_bc = [] self.tailored_struct_for_bc = MPMtailoredStruct() self.pre_p2g_operations = [] self.impulse_params = [] self.particle_velocity_modifiers = [] self.particle_velocity_modifier_params = [] # must give density. 
mass will be updated as density * volume def set_parameters(self, device="cuda:0", **kwargs): self.set_parameters_dict(device, kwargs) def set_parameters_dict(self, mpm_model, mpm_state, kwargs={}, device="cuda:0"): if "material" in kwargs: if kwargs["material"] == "jelly": mpm_model.material = 0 elif kwargs["material"] == "metal": mpm_model.material = 1 elif kwargs["material"] == "sand": mpm_model.material = 2 elif kwargs["material"] == "foam": mpm_model.material = 3 elif kwargs["material"] == "snow": mpm_model.material = 4 elif kwargs["material"] == "plasticine": mpm_model.material = 5 elif kwargs["material"] == "neo-hookean": mpm_model.material = 6 else: raise TypeError("Undefined material type") if "yield_stress" in kwargs: val = kwargs["yield_stress"] wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[mpm_model.yield_stress, val], device=device, ) if "hardening" in kwargs: mpm_model.hardening = kwargs["hardening"] if "xi" in kwargs: mpm_model.xi = kwargs["xi"] if "friction_angle" in kwargs: mpm_model.friction_angle = kwargs["friction_angle"] sin_phi = wp.sin(mpm_model.friction_angle / 180.0 * 3.14159265) mpm_model.alpha = wp.sqrt(2.0 / 3.0) * 2.0 * sin_phi / (3.0 - sin_phi) if "g" in kwargs: mpm_model.gravitational_accelaration = wp.vec3( kwargs["g"][0], kwargs["g"][1], kwargs["g"][2] ) if "density" in kwargs: density_value = kwargs["density"] wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[mpm_state.particle_density, density_value], device=device, ) wp.launch( kernel=get_float_array_product, dim=self.n_particles, inputs=[ mpm_state.particle_density, mpm_state.particle_vol, mpm_state.particle_mass, ], device=device, ) if "rpic_damping" in kwargs: mpm_model.rpic_damping = kwargs["rpic_damping"] if "plastic_viscosity" in kwargs: mpm_model.plastic_viscosity = kwargs["plastic_viscosity"] if "softening" in kwargs: mpm_model.softening = kwargs["softening"] if "grid_v_damping_scale" in kwargs: 
mpm_model.grid_v_damping_scale = kwargs["grid_v_damping_scale"] def set_E_nu(self, mpm_model, E: float, nu: float, device="cuda:0"): if isinstance(E, float): wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[mpm_model.E, E], device=device, ) else: # E is warp array wp.launch( kernel=set_float_vec_to_vec, dim=self.n_particles, inputs=[mpm_model.E, E], device=device, ) if isinstance(nu, float): wp.launch( kernel=set_value_to_float_array, dim=self.n_particles, inputs=[mpm_model.nu, nu], device=device, ) else: wp.launch( kernel=set_float_vec_to_vec, dim=self.n_particles, inputs=[mpm_model.nu, nu], device=device, ) def set_E_nu_from_torch( self, mpm_model, E: Float[Tensor, "n"] | Float[Tensor, "1"], nu: Float[Tensor, "n"] | Float[Tensor, "1"], device="cuda:0", ): if E.ndim == 0: E_inp = E.item() # float else: E_inp = from_torch_safe(E, dtype=wp.float32, requires_grad=True) if nu.ndim == 0: nu_inp = nu.item() # float else: nu_inp = from_torch_safe(nu, dtype=wp.float32, requires_grad=True) self.set_E_nu(mpm_model, E_inp, nu_inp, device=device) def prepare_mu_lam(self, mpm_model, mpm_state, device="cuda:0"): # compute mu and lam from E and nu wp.launch( kernel=compute_mu_lam_from_E_nu, dim=self.n_particles, inputs=[mpm_state, mpm_model], device=device, ) def p2g2p_differentiable( self, mpm_model, mpm_state, next_state, dt, device="cuda:0" ): """ Some boundary conditions, might not give gradient, see kernels in self.pre_p2g_operations, Usually None. self.particle_velocity_modifiers. Mostly used to freeze points self.grid_postprocess, Should apply BC here """ grid_size = ( mpm_model.grid_dim_x, mpm_model.grid_dim_y, mpm_model.grid_dim_z, ) wp.launch( kernel=zero_grid, # gradient might gone dim=(grid_size), inputs=[mpm_state, mpm_model], device=device, ) # apply pre-p2g operations on particles # apply impulse force on particles.. 
for k in range(len(self.pre_p2g_operations)): wp.launch( kernel=self.pre_p2g_operations[k], dim=self.n_particles, inputs=[self.time, dt, mpm_state, self.impulse_params[k]], device=device, ) # apply dirichlet particle v modifier for k in range(len(self.particle_velocity_modifiers)): wp.launch( kernel=self.particle_velocity_modifiers[k], dim=self.n_particles, inputs=[ self.time, mpm_state, self.particle_velocity_modifier_params[k], ], device=device, ) # compute stress = stress(returnMap(F_trial)) # F_trail => F # TODO: this is overite.. # F, SVD(F), lam, mu => Stress. # TODO: this is overite.. with wp.ScopedTimer( "compute_stress_from_F_trial", synchronize=True, print=False, dict=self.time_profile, ): wp.launch( kernel=compute_stress_from_F_trial, dim=self.n_particles, inputs=[mpm_state, mpm_model, dt], device=device, ) # F and stress are updated # p2g with wp.ScopedTimer( "p2g", synchronize=True, print=False, dict=self.time_profile, ): wp.launch( kernel=p2g_apic_with_stress, dim=self.n_particles, inputs=[mpm_state, mpm_model, dt], device=device, ) # apply p2g' # grid update with wp.ScopedTimer( "grid_update", synchronize=True, print=False, dict=self.time_profile ): wp.launch( kernel=grid_normalization_and_gravity, dim=(grid_size), inputs=[mpm_state, mpm_model, dt], device=device, ) if mpm_model.grid_v_damping_scale < 1.0: wp.launch( kernel=add_damping_via_grid, dim=(grid_size), inputs=[mpm_state, mpm_model.grid_v_damping_scale], device=device, ) # apply BC on grid, collide with wp.ScopedTimer( "apply_BC_on_grid", synchronize=True, print=False, dict=self.time_profile ): for k in range(len(self.grid_postprocess)): wp.launch( kernel=self.grid_postprocess[k], dim=grid_size, inputs=[ self.time, dt, mpm_state, mpm_model, self.collider_params[k], ], device=device, ) if self.modify_bc[k] is not None: self.modify_bc[k](self.time, dt, self.collider_params[k]) # g2p with wp.ScopedTimer( "g2p", synchronize=True, print=False, dict=self.time_profile ): wp.launch( 
kernel=g2p_differentiable, dim=self.n_particles, inputs=[mpm_state, next_state, mpm_model, dt], device=device, ) # x, v, C, F_trial are updated self.time = self.time + dt def p2g2p(self, mpm_model, mpm_state, step, dt, device="cuda:0"): grid_size = ( mpm_model.grid_dim_x, mpm_model.grid_dim_y, mpm_model.grid_dim_z, ) wp.launch( kernel=zero_grid, # gradient might gone dim=(grid_size), inputs=[mpm_state, mpm_model], device=device, ) # apply pre-p2g operations on particles # apply impulse force on particles.. for k in range(len(self.pre_p2g_operations)): wp.launch( kernel=self.pre_p2g_operations[k], dim=self.n_particles, inputs=[self.time, dt, mpm_state, self.impulse_params[k]], device=device, ) # apply dirichlet particle v modifier for k in range(len(self.particle_velocity_modifiers)): wp.launch( kernel=self.particle_velocity_modifiers[k], dim=self.n_particles, inputs=[ self.time, mpm_state, self.particle_velocity_modifier_params[k], ], device=device, ) # compute stress = stress(returnMap(F_trial)) # F_trail => F # TODO: this is overite.. # F, SVD(F), lam, mu => Stress. # TODO: this is overite.. 
with wp.ScopedTimer( "compute_stress_from_F_trial", synchronize=True, print=False, dict=self.time_profile, ): wp.launch( kernel=compute_stress_from_F_trial, dim=self.n_particles, inputs=[mpm_state, mpm_model, dt], device=device, ) # F and stress are updated # p2g with wp.ScopedTimer( "p2g", synchronize=True, print=False, dict=self.time_profile, ): wp.launch( kernel=p2g_apic_with_stress, dim=self.n_particles, inputs=[mpm_state, mpm_model, dt], device=device, ) # apply p2g' # grid update with wp.ScopedTimer( "grid_update", synchronize=True, print=False, dict=self.time_profile ): wp.launch( kernel=grid_normalization_and_gravity, dim=(grid_size), inputs=[mpm_state, mpm_model, dt], device=device, ) if mpm_model.grid_v_damping_scale < 1.0: wp.launch( kernel=add_damping_via_grid, dim=(grid_size), inputs=[mpm_state, mpm_model.grid_v_damping_scale], device=device, ) # apply BC on grid, collide with wp.ScopedTimer( "apply_BC_on_grid", synchronize=True, print=False, dict=self.time_profile ): for k in range(len(self.grid_postprocess)): wp.launch( kernel=self.grid_postprocess[k], dim=grid_size, inputs=[ self.time, dt, mpm_state, mpm_model, self.collider_params[k], ], device=device, ) if self.modify_bc[k] is not None: self.modify_bc[k](self.time, dt, self.collider_params[k]) # g2p with wp.ScopedTimer( "g2p", synchronize=True, print=False, dict=self.time_profile ): wp.launch( kernel=g2p, dim=self.n_particles, inputs=[mpm_state, mpm_model, dt], device=device, ) # x, v, C, F_trial are updated #### CFL check #### # particle_v = self.mpm_state.particle_v.numpy() # if np.max(np.abs(particle_v)) > self.mpm_model.dx / dt: # print("max particle v: ", np.max(np.abs(particle_v))) # print("max allowed v: ", self.mpm_model.dx / dt) # print("does not allow v*dt>dx") # input() #### CFL check #### with wp.ScopedTimer( "clip_particle_x", synchronize=True, print=False, dict=self.time_profile ): wp.launch( kernel=clip_particle_x, dim=self.n_particles, inputs=[mpm_state, mpm_model], device=device, 
) self.time = self.time + dt def print_time_profile(self): print("MPM Time profile:") for key, value in self.time_profile.items(): print(key, sum(value)) # a surface specified by a point and the normal vector def add_surface_collider( self, point, normal, surface="sticky", friction=0.0, start_time=0.0, end_time=999.0, ): point = list(point) # Normalize normal normal_scale = 1.0 / wp.sqrt(float(sum(x**2 for x in normal))) normal = list(normal_scale * x for x in normal) collider_param = Dirichlet_collider() collider_param.start_time = start_time collider_param.end_time = end_time collider_param.point = wp.vec3(point[0], point[1], point[2]) collider_param.normal = wp.vec3(normal[0], normal[1], normal[2]) if surface == "sticky" and friction != 0: raise ValueError("friction must be 0 on sticky surfaces.") if surface == "sticky": collider_param.surface_type = 0 elif surface == "slip": collider_param.surface_type = 1 elif surface == "cut": collider_param.surface_type = 11 else: collider_param.surface_type = 2 # frictional collider_param.friction = friction self.collider_params.append(collider_param) @wp.kernel def collide( time: float, dt: float, state: MPMStateStruct, model: MPMModelStruct, param: Dirichlet_collider, ): grid_x, grid_y, grid_z = wp.tid() if time >= param.start_time and time < param.end_time: offset = wp.vec3( float(grid_x) * model.dx - param.point[0], float(grid_y) * model.dx - param.point[1], float(grid_z) * model.dx - param.point[2], ) n = wp.vec3(param.normal[0], param.normal[1], param.normal[2]) dotproduct = wp.dot(offset, n) if dotproduct < 0.0: if param.surface_type == 0: state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( 0.0, 0.0, 0.0 ) elif param.surface_type == 11: if ( float(grid_z) * model.dx < 0.4 or float(grid_z) * model.dx > 0.53 ): state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( 0.0, 0.0, 0.0 ) else: v_in = state.grid_v_out[grid_x, grid_y, grid_z] state.grid_v_out[grid_x, grid_y, grid_z] = ( wp.vec3(v_in[0], 0.0, v_in[2]) * 0.3 ) else: 
v = state.grid_v_out[grid_x, grid_y, grid_z] normal_component = wp.dot(v, n) if param.surface_type == 1: v = ( v - normal_component * n ) # Project out all normal component else: v = ( v - wp.min(normal_component, 0.0) * n ) # Project out only inward normal component if normal_component < 0.0 and wp.length(v) > 1e-20: v = wp.max( 0.0, wp.length(v) + normal_component * param.friction ) * wp.normalize( v ) # apply friction here state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3( 0.0, 0.0, 0.0 ) self.grid_postprocess.append(collide) self.modify_bc.append(None) # a cubiod is a rectangular cube' # centered at `point` # dimension is x: point[0]±size[0] # y: point[1]±size[1] # z: point[2]±size[2] # all grid nodes lie within the cubiod will have their speed set to velocity # the cuboid itself is also moving with const speed = velocity # set the speed to zero to fix BC def set_velocity_on_cuboid( self, point, size, velocity, start_time=0.0, end_time=999.0, reset=0, ): point = list(point) collider_param = Dirichlet_collider() collider_param.start_time = start_time collider_param.end_time = end_time collider_param.point = wp.vec3(point[0], point[1], point[2]) collider_param.size = size collider_param.velocity = wp.vec3(velocity[0], velocity[1], velocity[2]) # collider_param.threshold = threshold collider_param.reset = reset self.collider_params.append(collider_param) @wp.kernel def collide( time: float, dt: float, state: MPMStateStruct, model: MPMModelStruct, param: Dirichlet_collider, ): grid_x, grid_y, grid_z = wp.tid() if time >= param.start_time and time < param.end_time: offset = wp.vec3( float(grid_x) * model.dx - param.point[0], float(grid_y) * model.dx - param.point[1], float(grid_z) * model.dx - param.point[2], ) if ( wp.abs(offset[0]) < param.size[0] and wp.abs(offset[1]) < param.size[1] and wp.abs(offset[2]) < param.size[2] ): state.grid_v_out[grid_x, grid_y, grid_z] = param.velocity elif param.reset == 1: if time < param.end_time + 15.0 * dt: 
                        # reset mode: briefly zero node velocities after the
                        # active window ends (15 extra substeps of damping)
                        state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0)

        def modify(time, dt, param: Dirichlet_collider):
            # Host-side per-step callback: advect the cuboid center with its
            # constant velocity while the collider is active.
            if time >= param.start_time and time < param.end_time:
                param.point = wp.vec3(
                    param.point[0] + dt * param.velocity[0],
                    param.point[1] + dt * param.velocity[1],
                    param.point[2] + dt * param.velocity[2],
                )  # param.point + dt * param.velocity

        self.grid_postprocess.append(collide)
        self.modify_bc.append(modify)

    def add_bounding_box(self, start_time=0.0, end_time=999.0):
        # Clamp outward velocity components on a `padding`-cell-thick shell of
        # the grid so particles cannot leave the simulation domain.
        collider_param = Dirichlet_collider()
        collider_param.start_time = start_time
        collider_param.end_time = end_time
        self.collider_params.append(collider_param)

        @wp.kernel
        def collide(
            time: float,
            dt: float,
            state: MPMStateStruct,
            model: MPMModelStruct,
            param: Dirichlet_collider,
        ):
            grid_x, grid_y, grid_z = wp.tid()
            padding = 3  # boundary shell thickness, in grid cells
            if time >= param.start_time and time < param.end_time:
                # x walls: zero only the component pointing out of the domain
                if grid_x < padding and state.grid_v_out[grid_x, grid_y, grid_z][0] < 0:
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                if (
                    grid_x >= model.grid_dim_x - padding
                    and state.grid_v_out[grid_x, grid_y, grid_z][0] > 0
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                # y walls
                if grid_y < padding and state.grid_v_out[grid_x, grid_y, grid_z][1] < 0:
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                if (
                    grid_y >= model.grid_dim_y - padding
                    and state.grid_v_out[grid_x, grid_y, grid_z][1] > 0
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        0.0,
                        state.grid_v_out[grid_x, grid_y, grid_z][2],
                    )
                # z walls
                if grid_z < padding and state.grid_v_out[grid_x, grid_y, grid_z][2] < 0:
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        0.0,
                    )
                if (
                    grid_z >= model.grid_dim_z - padding
                    and state.grid_v_out[grid_x, grid_y, grid_z][2] > 0
                ):
                    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(
                        state.grid_v_out[grid_x, grid_y, grid_z][0],
                        state.grid_v_out[grid_x, grid_y, grid_z][1],
                        0.0,
                    )

        self.grid_postprocess.append(collide)
        self.modify_bc.append(None)

    # particle_v += force/particle_mass * dt
    # this is applied from start_dt, ends after num_dt p2g2p's
    # particle velocity is changed before p2g at each timestep
    def add_impulse_on_particles(
        self,
        mpm_state,
        force,
        dt,
        point=[1, 1, 1],  # NOTE(review): mutable default args — shared across calls
        size=[1, 1, 1],
        num_dt=1,
        start_time=0.0,
        device="cuda:0",
    ):
        # Select particles in the box point +- size (selection kernel fills the
        # mask), then apply force/mass * dt to their velocity each substep in
        # [start_time, start_time + dt * num_dt).
        impulse_param = Impulse_modifier()
        impulse_param.start_time = start_time
        impulse_param.end_time = start_time + dt * num_dt
        impulse_param.point = wp.vec3(point[0], point[1], point[2])
        impulse_param.size = wp.vec3(size[0], size[1], size[2])
        impulse_param.mask = wp.zeros(shape=self.n_particles, dtype=int, device=device)
        impulse_param.force = wp.vec3(
            force[0],
            force[1],
            force[2],
        )
        wp.launch(
            kernel=selection_add_impulse_on_particles,
            dim=self.n_particles,
            inputs=[mpm_state, impulse_param],
            device=device,
        )
        self.impulse_params.append(impulse_param)

        @wp.kernel
        def apply_force(
            time: float, dt: float, state: MPMStateStruct, param: Impulse_modifier
        ):
            p = wp.tid()
            if time >= param.start_time and time < param.end_time:
                if param.mask[p] == 1:
                    # acceleration = force / mass, integrated explicitly
                    impulse = wp.vec3(
                        param.force[0] / state.particle_mass[p],
                        param.force[1] / state.particle_mass[p],
                        param.force[2] / state.particle_mass[p],
                    )
                    state.particle_v[p] = state.particle_v[p] + impulse * dt

        self.pre_p2g_operations.append(apply_force)

    def enforce_particle_velocity_translation(
        self, mpm_state, point, size, velocity, start_time, end_time, device="cuda:0"
    ):
        # Pin the velocity of all particles inside the box point +- size to a
        # constant `velocity` during [start_time, end_time).
        # first select certain particles based on position
        velocity_modifier_params = ParticleVelocityModifier()
        velocity_modifier_params.point = wp.vec3(point[0], point[1], point[2])
        velocity_modifier_params.size = wp.vec3(size[0], size[1], size[2])
velocity_modifier_params.velocity = wp.vec3( velocity[0], velocity[1], velocity[2] ) velocity_modifier_params.start_time = start_time velocity_modifier_params.end_time = end_time velocity_modifier_params.mask = wp.zeros( shape=self.n_particles, dtype=int, device=device ) wp.launch( kernel=selection_enforce_particle_velocity_translation, dim=self.n_particles, inputs=[mpm_state, velocity_modifier_params], device=device, ) self.particle_velocity_modifier_params.append(velocity_modifier_params) @wp.kernel def modify_particle_v_before_p2g( time: float, state: MPMStateStruct, velocity_modifier_params: ParticleVelocityModifier, ): p = wp.tid() if ( time >= velocity_modifier_params.start_time and time < velocity_modifier_params.end_time ): if velocity_modifier_params.mask[p] == 1: state.particle_v[p] = velocity_modifier_params.velocity self.particle_velocity_modifiers.append(modify_particle_v_before_p2g) # define a cylinder with center point, half_height, radius, normal # particles within the cylinder are rotating along the normal direction # may also have a translational velocity along the normal direction def enforce_particle_velocity_rotation( self, mpm_state, point, normal, half_height_and_radius, rotation_scale, translation_scale, start_time, end_time, device="cuda:0", ): normal_scale = 1.0 / wp.sqrt( float(normal[0] ** 2 + normal[1] ** 2 + normal[2] ** 2) ) normal = list(normal_scale * x for x in normal) velocity_modifier_params = ParticleVelocityModifier() velocity_modifier_params.point = wp.vec3(point[0], point[1], point[2]) velocity_modifier_params.half_height_and_radius = wp.vec2( half_height_and_radius[0], half_height_and_radius[1] ) velocity_modifier_params.normal = wp.vec3(normal[0], normal[1], normal[2]) horizontal_1 = wp.vec3(1.0, 1.0, 1.0) if wp.abs(wp.dot(velocity_modifier_params.normal, horizontal_1)) < 0.01: horizontal_1 = wp.vec3(0.72, 0.37, -0.67) horizontal_1 = ( horizontal_1 - wp.dot(horizontal_1, velocity_modifier_params.normal) * 
velocity_modifier_params.normal ) horizontal_1 = horizontal_1 * (1.0 / wp.length(horizontal_1)) horizontal_2 = wp.cross(horizontal_1, velocity_modifier_params.normal) velocity_modifier_params.horizontal_axis_1 = horizontal_1 velocity_modifier_params.horizontal_axis_2 = horizontal_2 velocity_modifier_params.rotation_scale = rotation_scale velocity_modifier_params.translation_scale = translation_scale velocity_modifier_params.start_time = start_time velocity_modifier_params.end_time = end_time velocity_modifier_params.mask = wp.zeros( shape=self.n_particles, dtype=int, device=device ) wp.launch( kernel=selection_enforce_particle_velocity_cylinder, dim=self.n_particles, inputs=[mpm_state, velocity_modifier_params], device=device, ) self.particle_velocity_modifier_params.append(velocity_modifier_params) @wp.kernel def modify_particle_v_before_p2g( time: float, state: MPMStateStruct, velocity_modifier_params: ParticleVelocityModifier, ): p = wp.tid() if ( time >= velocity_modifier_params.start_time and time < velocity_modifier_params.end_time ): if velocity_modifier_params.mask[p] == 1: offset = state.particle_x[p] - velocity_modifier_params.point horizontal_distance = wp.length( offset - wp.dot(offset, velocity_modifier_params.normal) * velocity_modifier_params.normal ) cosine = ( wp.dot(offset, velocity_modifier_params.horizontal_axis_1) / horizontal_distance ) theta = wp.acos(cosine) if wp.dot(offset, velocity_modifier_params.horizontal_axis_2) > 0: theta = theta else: theta = -theta axis1_scale = ( -horizontal_distance * wp.sin(theta) * velocity_modifier_params.rotation_scale ) axis2_scale = ( horizontal_distance * wp.cos(theta) * velocity_modifier_params.rotation_scale ) axis_vertical_scale = translation_scale state.particle_v[p] = ( axis1_scale * velocity_modifier_params.horizontal_axis_1 + axis2_scale * velocity_modifier_params.horizontal_axis_2 + axis_vertical_scale * velocity_modifier_params.normal ) 
self.particle_velocity_modifiers.append(modify_particle_v_before_p2g) # given normal direction, say [0,0,1] # gradually release grid velocities from start position to end position def release_particles_sequentially( self, normal, start_position, end_position, num_layers, start_time, end_time ): num_layers = 50 point = [0, 0, 0] size = [0, 0, 0] axis = -1 for i in range(3): if normal[i] == 0: point[i] = 1 size[i] = 1 else: axis = i point[i] = end_position half_length_portion = wp.abs(start_position - end_position) / num_layers end_time_portion = end_time / num_layers for i in range(num_layers): size[axis] = half_length_portion * (num_layers - i) self.enforce_particle_velocity_translation( point=point, size=size, velocity=[0, 0, 0], start_time=start_time, end_time=end_time_portion * (i + 1), ) def enforce_particle_velocity_by_mask( self, mpm_state, selection_mask: torch.Tensor, velocity, start_time, end_time, ): # first select certain particles based on position velocity_modifier_params = ParticleVelocityModifier() velocity_modifier_params.velocity = wp.vec3( velocity[0], velocity[1], velocity[2], ) velocity_modifier_params.start_time = start_time velocity_modifier_params.end_time = end_time velocity_modifier_params.mask = wp.from_torch(selection_mask) self.particle_velocity_modifier_params.append(velocity_modifier_params) @wp.kernel def modify_particle_v_before_p2g( time: float, state: MPMStateStruct, velocity_modifier_params: ParticleVelocityModifier, ): p = wp.tid() if ( time >= velocity_modifier_params.start_time and time < velocity_modifier_params.end_time ): if velocity_modifier_params.mask[p] == 1: state.particle_v[p] = velocity_modifier_params.velocity self.particle_velocity_modifiers.append(modify_particle_v_before_p2g) def restart_and_compute_F_C(self, mpm_model, mpm_state, target_pos, device): grid_size = ( mpm_model.grid_dim_x, mpm_model.grid_dim_y, mpm_model.grid_dim_z, ) wp.launch( kernel=zero_grid, # gradient might gone dim=(grid_size), 
inputs=[mpm_state, mpm_model], device=device, ) wp.launch( set_F_C_p2g, dim=self.n_particles, inputs=[mpm_state, mpm_model, target_pos], device=device, ) wp.launch( kernel=grid_normalization_and_gravity, dim=(grid_size), inputs=[mpm_state, mpm_model, 0], device=device, ) wp.launch( set_F_C_g2p, dim=self.n_particles, inputs=[mpm_state, mpm_model], device=device, ) wp.launch( kernel=zero_grid, # gradient might gone dim=(grid_size), inputs=[mpm_state, mpm_model], device=device, ) # set position to target_pos wp.launch( kernel=set_vec3_to_vec3, dim=self.n_particles, inputs=[mpm_state.particle_x, target_pos], device=device, ) def enforce_grid_velocity_by_mask( self, selection_mask: torch.Tensor, # should be int ): grid_modifier_params = GridCollider() grid_modifier_params.mask = wp.from_torch(selection_mask) self.collider_params.append(grid_modifier_params) @wp.kernel def modify_grid_v_before_g2p( time: float, dt: float, state: MPMStateStruct, model: MPMModelStruct, grid_modifier_params: GridCollider, ): grid_x, grid_y, grid_z = wp.tid() if grid_modifier_params.mask[grid_x, grid_y, grid_z] >= 1: state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0) self.grid_postprocess.append(modify_grid_v_before_g2p) self.modify_bc.append(None) # particle_v += force/particle_mass * dt # this is applied from start_dt, ends after num_dt p2g2p's # particle velocity is changed before p2g at each timestep def add_impulse_on_particles_with_mask( self, mpm_state, force, dt, particle_mask, # 1 for selected particles, 0 for others point=[1, 1, 1], size=[1, 1, 1], end_time=1, start_time=0.0, device="cuda:0", ): assert ( len(particle_mask) == self.n_particles ), "mask should have n_particles elements" impulse_param = Impulse_modifier() impulse_param.start_time = start_time impulse_param.end_time = end_time impulse_param.mask = wp.from_torch(particle_mask) impulse_param.point = wp.vec3(point[0], point[1], point[2]) impulse_param.size = wp.vec3(size[0], size[1], size[2]) 
        impulse_param.force = wp.vec3(
            force[0],
            force[1],
            force[2],
        )
        wp.launch(
            kernel=selection_add_impulse_on_particles,
            dim=self.n_particles,
            inputs=[mpm_state, impulse_param],
            device=device,
        )
        self.impulse_params.append(impulse_param)

        @wp.kernel
        def apply_force(
            time: float, dt: float, state: MPMStateStruct, param: Impulse_modifier
        ):
            p = wp.tid()
            if time >= param.start_time and time < param.end_time:
                if param.mask[p] >= 1:
                    # impulse = wp.vec3(
                    #     param.force[0] / state.particle_mass[p],
                    #     param.force[1] / state.particle_mass[p],
                    #     param.force[2] / state.particle_mass[p],
                    # )
                    # NOTE(review): unlike add_impulse_on_particles, this
                    # variant does NOT divide by particle mass — `force` is
                    # effectively an acceleration here; confirm at call sites.
                    impulse = wp.vec3(
                        param.force[0],
                        param.force[1],
                        param.force[2],
                    )
                    state.particle_v[p] = state.particle_v[p] + impulse * dt

        self.pre_p2g_operations.append(apply_force)


# ================================================
# FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/mpm_utils.py
# ================================================
import warp as wp
from mpm_data_structure import *
import numpy as np
import math


# compute stress from F
@wp.func
def kirchoff_stress_FCR(
    F: wp.mat33, U: wp.mat33, V: wp.mat33, J: float, mu: float, lam: float
):
    # compute kirchoff stress for FCR model (remember tau = P F^T)
    # Fixed-corotated: R = U V^T from the SVD of F.
    R = U * wp.transpose(V)
    id = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
    return 2.0 * mu * (F - R) * wp.transpose(F) + id * lam * J * (J - 1.0)


@wp.func
def kirchoff_stress_neoHookean(
    F: wp.mat33, U: wp.mat33, V: wp.mat33, J: float, sig: wp.vec3, mu: float, lam: float
):
    """
    B = F * wp.transpose(F)
    dev(B) = B - (1/3) * tr(B) * I
    For a compressible Rivlin neo-Hookean materia, the cauchy stress is given by:
    mu * J^(-2/3) * dev(B) + lam * J (J - 1) * I
    see: https://en.wikipedia.org/wiki/Neo-Hookean_solid

    NOTE(review): the volumetric term below uses lam/2 * (J^2 - 1), a common
    alternative to the lam * J * (J - 1) form quoted above — confirm which is
    intended.
    """
    # compute kirchoff stress for FCR model (remember tau = P F^T)
    # b holds the eigenvalues of B = F F^T, i.e. the squared singular values.
    b = wp.vec3(sig[0] * sig[0], sig[1] * sig[1], sig[2] * sig[2])
    b_hat = b - wp.vec3(
        (b[0] + b[1] + b[2]) / 3.0,
        (b[0] + b[1] + b[2]) / 3.0,
        (b[0] + b[1] + b[2]) / 3.0,
    )
    tau = mu * J ** (-2.0 / 3.0) * b_hat + lam / 2.0 * (J * J - 1.0) * wp.vec3(
        1.0, 1.0, 1.0
    )
    # Rotate the principal-space stress back: U diag(tau) V^T F^T.
    return (
        U
        * wp.mat33(tau[0], 0.0, 0.0, 0.0, tau[1], 0.0, 0.0, 0.0, tau[2])
        * wp.transpose(V)
        * wp.transpose(F)
    )


@wp.func
def kirchoff_stress_StVK(
    F: wp.mat33, U: wp.mat33, V: wp.mat33, sig: wp.vec3, mu: float, lam: float
):
    # St. Venant–Kirchhoff with Hencky (logarithmic) strain in principal space.
    sig = wp.vec3(
        wp.max(sig[0], 0.01), wp.max(sig[1], 0.01), wp.max(sig[2], 0.01)
    )  # add this to prevent NaN in extrem cases
    epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2]))
    log_sig_sum = wp.log(sig[0]) + wp.log(sig[1]) + wp.log(sig[2])
    ONE = wp.vec3(1.0, 1.0, 1.0)
    tau = 2.0 * mu * epsilon + lam * log_sig_sum * ONE
    return (
        U
        * wp.mat33(tau[0], 0.0, 0.0, 0.0, tau[1], 0.0, 0.0, 0.0, tau[2])
        * wp.transpose(V)
        * wp.transpose(F)
    )


@wp.func
def kirchoff_stress_drucker_prager(
    F: wp.mat33, U: wp.mat33, V: wp.mat33, sig: wp.vec3, mu: float, lam: float
):
    # Stress for the Drucker–Prager (sand) model in principal space.
    log_sig_sum = wp.log(sig[0]) + wp.log(sig[1]) + wp.log(sig[2])
    center00 = 2.0 * mu * wp.log(sig[0]) * (1.0 / sig[0]) + lam * log_sig_sum * (
        1.0 / sig[0]
    )
    center11 = 2.0 * mu * wp.log(sig[1]) * (1.0 / sig[1]) + lam * log_sig_sum * (
        1.0 / sig[1]
    )
    center22 = 2.0 * mu * wp.log(sig[2]) * (1.0 / sig[2]) + lam * log_sig_sum * (
        1.0 / sig[2]
    )
    center = wp.mat33(center00, 0.0, 0.0, 0.0, center11, 0.0, 0.0, 0.0, center22)
    return U * center * wp.transpose(V) * wp.transpose(F)


@wp.func
def von_mises_return_mapping(F_trial: wp.mat33, model: MPMModelStruct, p: int):
    # Von Mises plastic return mapping: project the trial deformation
    # gradient back onto the yield surface in Hencky-strain space.
    U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    sig_old = wp.vec3(0.0)
    wp.svd3(F_trial, U, sig_old, V)
    sig = wp.vec3(
        wp.max(sig_old[0], 0.01), wp.max(sig_old[1], 0.01), wp.max(sig_old[2], 0.01)
    )  # add this to prevent NaN in extrem cases
    epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2]))
    temp = (epsilon[0] + epsilon[1] + epsilon[2]) / 3.0  # mean (volumetric) strain
    tau = 2.0 * model.mu[p] * epsilon + model.lam[p] * (
        epsilon[0] + epsilon[1] + epsilon[2]
    ) * wp.vec3(1.0, 1.0, 1.0)
    sum_tau = tau[0] + tau[1] + tau[2]
    # deviatoric part of the principal Kirchhoff stress
    cond = wp.vec3(
        tau[0] - sum_tau / 3.0, tau[1] - sum_tau / 3.0, tau[2] - sum_tau / 3.0
    )
    if wp.length(cond) > model.yield_stress[p]:
        # Plastic flow: scale the deviatoric strain back to the yield surface.
        epsilon_hat = epsilon - wp.vec3(temp, temp, temp)
        epsilon_hat_norm = wp.length(epsilon_hat) + 1e-6
        delta_gamma = epsilon_hat_norm - model.yield_stress[p] / (2.0 * model.mu[p])
        epsilon = epsilon - (delta_gamma / epsilon_hat_norm) * epsilon_hat
        sig_elastic = wp.mat33(
            wp.exp(epsilon[0]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon[1]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon[2]),
        )
        # Reassemble the elastic part of F from the projected strains.
        F_elastic = U * sig_elastic * wp.transpose(V)
        if model.hardening == 1:
            # isotropic hardening: yield stress grows with plastic flow
            model.yield_stress[p] = (
                model.yield_stress[p] + 2.0 * model.mu[p] * model.xi * delta_gamma
            )
        return F_elastic
    else:
        return F_trial


@wp.func
def von_mises_return_mapping_with_damage(
    F_trial: wp.mat33, model: MPMModelStruct, p: int
):
    # Von Mises return mapping with softening-based damage: each plastic step
    # reduces the yield stress; once it hits zero the particle loses all
    # elastic stiffness (mu = lam = 0).
    U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    sig_old = wp.vec3(0.0)
    wp.svd3(F_trial, U, sig_old, V)
    sig = wp.vec3(
        wp.max(sig_old[0], 0.01), wp.max(sig_old[1], 0.01), wp.max(sig_old[2], 0.01)
    )  # add this to prevent NaN in extrem cases
    epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2]))
    temp = (epsilon[0] + epsilon[1] + epsilon[2]) / 3.0
    tau = 2.0 * model.mu[p] * epsilon + model.lam[p] * (
        epsilon[0] + epsilon[1] + epsilon[2]
    ) * wp.vec3(1.0, 1.0, 1.0)
    sum_tau = tau[0] + tau[1] + tau[2]
    cond = wp.vec3(
        tau[0] - sum_tau / 3.0, tau[1] - sum_tau / 3.0, tau[2] - sum_tau / 3.0
    )
    if wp.length(cond) > model.yield_stress[p]:
        if model.yield_stress[p] <= 0:
            # fully damaged: no further projection possible
            return F_trial
        epsilon_hat = epsilon - wp.vec3(temp, temp, temp)
        epsilon_hat_norm = wp.length(epsilon_hat) + 1e-6
        delta_gamma = epsilon_hat_norm - model.yield_stress[p] / (2.0 * model.mu[p])
        epsilon = epsilon - (delta_gamma / epsilon_hat_norm) * epsilon_hat
        # softening: shrink the yield surface by the plastic strain magnitude
        model.yield_stress[p] = model.yield_stress[p] - model.softening * wp.length(
            (delta_gamma / epsilon_hat_norm) * epsilon_hat
        )
        if model.yield_stress[p] <= 0:
            model.mu[p] = 0.0
            model.lam[p] = 0.0
        sig_elastic = wp.mat33(
            wp.exp(epsilon[0]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon[1]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon[2]),
        )
        F_elastic = U * sig_elastic * wp.transpose(V)
        if model.hardening == 1:
            model.yield_stress[p] = (
                model.yield_stress[p] + 2.0 * model.mu[p] * model.xi * delta_gamma
            )
        return F_elastic
    else:
        return F_trial


# for toothpaste
@wp.func
def viscoplasticity_return_mapping_with_StVK(
    F_trial: wp.mat33, model: MPMModelStruct, p: int, dt: float
):
    # Rate-dependent (viscoplastic) return mapping on top of an StVK/Hencky
    # elastic model; the overshoot `y` relaxes at a rate controlled by
    # model.plastic_viscosity and the timestep.
    U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    sig_old = wp.vec3(0.0)
    wp.svd3(F_trial, U, sig_old, V)
    sig = wp.vec3(
        wp.max(sig_old[0], 0.01), wp.max(sig_old[1], 0.01), wp.max(sig_old[2], 0.01)
    )  # add this to prevent NaN in extrem cases
    b_trial = wp.vec3(sig[0] * sig[0], sig[1] * sig[1], sig[2] * sig[2])
    epsilon = wp.vec3(wp.log(sig[0]), wp.log(sig[1]), wp.log(sig[2]))
    trace_epsilon = epsilon[0] + epsilon[1] + epsilon[2]
    epsilon_hat = epsilon - wp.vec3(
        trace_epsilon / 3.0, trace_epsilon / 3.0, trace_epsilon / 3.0
    )
    s_trial = 2.0 * model.mu[p] * epsilon_hat  # deviatoric trial stress
    s_trial_norm = wp.length(s_trial)
    y = s_trial_norm - wp.sqrt(2.0 / 3.0) * model.yield_stress[p]
    if y > 0:
        mu_hat = model.mu[p] * (b_trial[0] + b_trial[1] + b_trial[2]) / 3.0
        s_new_norm = s_trial_norm - y / (
            1.0 + model.plastic_viscosity / (2.0 * mu_hat * dt)
        )
        s_new = (s_new_norm / s_trial_norm) * s_trial
        # back to total Hencky strain: deviatoric part + preserved volumetric part
        epsilon_new = 1.0 / (2.0 * model.mu[p]) * s_new + wp.vec3(
            trace_epsilon / 3.0, trace_epsilon / 3.0, trace_epsilon / 3.0
        )
        sig_elastic = wp.mat33(
            wp.exp(epsilon_new[0]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon_new[1]),
            0.0,
            0.0,
            0.0,
            wp.exp(epsilon_new[2]),
        )
        F_elastic = U * sig_elastic * wp.transpose(V)
        return F_elastic
    else:
        return F_trial


@wp.func
def sand_return_mapping(
    F_trial: wp.mat33, state: MPMStateStruct, model: MPMModelStruct, p: int
):
    # Drucker–Prager return mapping for granular material.
    U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    sig = wp.vec3(0.0)
    wp.svd3(F_trial, U, sig, V)

    epsilon = wp.vec3(
        wp.log(wp.max(wp.abs(sig[0]), 1e-14)),
        wp.log(wp.max(wp.abs(sig[1]), 1e-14)),
        wp.log(wp.max(wp.abs(sig[2]), 1e-14)),
    )
    sigma_out = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)  # NOTE(review): unused
    tr = epsilon[0] + epsilon[1] + epsilon[2]  # + state.particle_Jp[p]
    epsilon_hat = epsilon - wp.vec3(tr / 3.0, tr / 3.0, tr / 3.0)
    epsilon_hat_norm = wp.length(epsilon_hat)
    delta_gamma = (
        epsilon_hat_norm
        + (3.0 * model.lam[p] + 2.0 * model.mu[p])
        / (2.0 * model.mu[p])
        * tr
        * model.alpha
    )

    # Three mutually exclusive cases of the Drucker-Prager projection:
    if delta_gamma <= 0:  # inside the yield cone: purely elastic
        F_elastic = F_trial
    if delta_gamma > 0 and tr > 0:  # expansion: project to the cone tip
        F_elastic = U * wp.transpose(V)
    if delta_gamma > 0 and tr <= 0:  # shearing: project onto the cone surface
        H = epsilon - epsilon_hat * (delta_gamma / epsilon_hat_norm)
        s_new = wp.vec3(wp.exp(H[0]), wp.exp(H[1]), wp.exp(H[2]))
        F_elastic = U * wp.diag(s_new) * wp.transpose(V)
    return F_elastic


@wp.kernel
def compute_mu_lam_from_E_nu(state: MPMStateStruct, model: MPMModelStruct):
    # Convert per-particle Young's modulus / Poisson ratio to Lamé parameters.
    p = wp.tid()
    model.mu[p] = model.E[p] / (2.0 * (1.0 + model.nu[p]))
    model.lam[p] = (
        model.E[p] * model.nu[p] / ((1.0 + model.nu[p]) * (1.0 - 2.0 * model.nu[p]))
    )


@wp.kernel
def zero_grid(state: MPMStateStruct, model: MPMModelStruct):
    # Reset grid mass and both velocity buffers before the next p2g pass.
    grid_x, grid_y, grid_z = wp.tid()
    state.grid_m[grid_x, grid_y, grid_z] = 0.0
    state.grid_v_in[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0)
    state.grid_v_out[grid_x, grid_y, grid_z] = wp.vec3(0.0, 0.0, 0.0)


@wp.func
def compute_dweight(
    model: MPMModelStruct, w: wp.mat33, dw: wp.mat33, i: int, j: int, k: int
):
    # Gradient of the separable B-spline weight for stencil offset (i, j, k);
    # w/dw hold the 1D weights and their derivatives per axis.
    dweight = wp.vec3(
        dw[0, i] * w[1, j] * w[2, k],
        w[0, i] * dw[1, j] * w[2, k],
        w[0, i] * w[1, j] * dw[2, k],
    )
    return dweight * model.inv_dx


@wp.func
def update_cov(state: MPMStateStruct, p: int, grad_v: wp.mat33, dt: float):
    # Advance the per-particle covariance (stored as 6 packed entries of a
    # symmetric 3x3) with the velocity gradient: C' = C + dt (∇v C + C ∇vᵀ).
    cov_n = wp.mat33(0.0)
    cov_n[0, 0] = state.particle_cov[p * 6]
    cov_n[0, 1] = state.particle_cov[p * 6 + 1]
    cov_n[0, 2] = state.particle_cov[p * 6 + 2]
    cov_n[1, 0] = state.particle_cov[p * 6 + 1]
    cov_n[1, 1] = state.particle_cov[p * 6 + 3]
    cov_n[1, 2] = state.particle_cov[p * 6 + 4]
    cov_n[2, 0] = state.particle_cov[p * 6 + 2]
    cov_n[2, 1] = state.particle_cov[p * 6 + 4]
    cov_n[2, 2] = state.particle_cov[p * 6 + 5]

    cov_np1 = cov_n + dt * (grad_v * cov_n + cov_n * wp.transpose(grad_v))

    state.particle_cov[p * 6] = cov_np1[0, 0]
    state.particle_cov[p * 6 + 1] = cov_np1[0, 1]
    state.particle_cov[p * 6 + 2] = cov_np1[0, 2]
    state.particle_cov[p * 6 + 3] = cov_np1[1, 1]
    state.particle_cov[p * 6 + 4] = cov_np1[1, 2]
    state.particle_cov[p * 6 + 5] = cov_np1[2, 2]


@wp.func
def update_cov_differentiable(
    state: MPMStateStruct,
    next_state: MPMStateStruct,
    p: int,
    grad_v: wp.mat33,
    dt: float,
):
    # Same update as update_cov, but written out-of-place (reads `state`,
    # writes `next_state`) so autodiff does not see an in-place overwrite.
    cov_n = wp.mat33(0.0)
    cov_n[0, 0] = state.particle_cov[p * 6]
    cov_n[0, 1] = state.particle_cov[p * 6 + 1]
    cov_n[0, 2] = state.particle_cov[p * 6 + 2]
    cov_n[1, 0] = state.particle_cov[p * 6 + 1]
    cov_n[1, 1] = state.particle_cov[p * 6 + 3]
    cov_n[1, 2] = state.particle_cov[p * 6 + 4]
    cov_n[2, 0] = state.particle_cov[p * 6 + 2]
    cov_n[2, 1] = state.particle_cov[p * 6 + 4]
    cov_n[2, 2] = state.particle_cov[p * 6 + 5]

    cov_np1 = cov_n + dt * (grad_v * cov_n + cov_n * wp.transpose(grad_v))

    next_state.particle_cov[p * 6] = cov_np1[0, 0]
    next_state.particle_cov[p * 6 + 1] = cov_np1[0, 1]
    next_state.particle_cov[p * 6 + 2] = cov_np1[0, 2]
    next_state.particle_cov[p * 6 + 3] = cov_np1[1, 1]
    next_state.particle_cov[p * 6 + 4] = cov_np1[1, 2]
    next_state.particle_cov[p * 6 + 5] = cov_np1[2, 2]


@wp.kernel
def p2g_apic_with_stress(state: MPMStateStruct, model: MPMModelStruct, dt: float):
    # input given to p2g: particle_stress
    #                     particle_x
    #                     particle_v
    #                     particle_C
    # output: grid_v_in, grid_m
    p = wp.tid()
    if state.particle_selection[p] == 0:
        stress = state.particle_stress[p]
        grid_pos = state.particle_x[p] * model.inv_dx
        # base corner of the 3x3x3 quadratic B-spline stencil
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        # rows of w are the three 1D quadratic B-spline weights per axis
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0,
            0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )
        # per-axis derivatives of the 1D weights
        dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5))
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    dpos = (
                        wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx
                    ) * model.dx
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    weight = w[0, i] * w[1, j] * w[2, k]  # tricubic interpolation
                    dweight = compute_dweight(model, w, dw, i, j, k)
                    C = state.particle_C[p]
                    # if model.rpic = 0, standard apic
                    # rpic_damping blends toward the skew-symmetric (rotational)
                    # part of C to damp the affine velocity field
                    C = (1.0 - model.rpic_damping) * C + model.rpic_damping / 2.0 * (
                        C - wp.transpose(C)
                    )
                    # C = (1.0 - model.rpic_damping) * state.particle_C[
                    #     p
                    # ] + model.rpic_damping / 2.0 * (
                    #     state.particle_C[p] - wp.transpose(state.particle_C[p])
                    # )
                    if model.rpic_damping < -0.001:
                        # standard pic
                        C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

                    elastic_force = -state.particle_vol[p] * stress * dweight
                    # momentum contribution (APIC) plus stress-based force impulse
                    v_in_add = (
                        weight * state.particle_mass[p] * (state.particle_v[p] + C * dpos)
                        + dt * elastic_force
                    )
                    wp.atomic_add(state.grid_v_in, ix, iy, iz, v_in_add)
                    wp.atomic_add(
                        state.grid_m, ix, iy, iz, weight * state.particle_mass[p]
                    )


# add gravity
@wp.kernel
def grid_normalization_and_gravity(
    state: MPMStateStruct, model: MPMModelStruct, dt: float
):
    # Convert accumulated grid momentum to velocity and add gravity; nodes
    # with (near-)zero mass are left untouched.
    grid_x, grid_y, grid_z = wp.tid()
    if state.grid_m[grid_x, grid_y, grid_z] > 1e-15:
        v_out = state.grid_v_in[grid_x, grid_y, grid_z] * (
            1.0 / state.grid_m[grid_x, grid_y, grid_z]
        )
        # add gravity
        v_out = v_out + dt * model.gravitational_accelaration
        state.grid_v_out[grid_x, grid_y, grid_z] = v_out


@wp.kernel
def g2p(state: MPMStateStruct, model: MPMModelStruct, dt: float):
    # Grid-to-particle transfer (in-place): gathers velocity, affine matrix C,
    # velocity gradient; advects positions and updates the trial F.
    p = wp.tid()
    if state.particle_selection[p] == 0:
        grid_pos = state.particle_x[p] * model.inv_dx
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )
        dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5))
        new_v = wp.vec3(0.0, 0.0, 0.0)
        new_C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        new_F = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    dpos = wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx
                    weight = w[0, i] * w[1, j] * w[2, k]  # tricubic interpolation
                    grid_v = state.grid_v_out[ix, iy, iz]
                    new_v = new_v + grid_v * weight
                    # APIC affine matrix (4/dx^2 factor for quadratic splines)
                    new_C = new_C + wp.outer(grid_v, dpos) * (
                        weight * model.inv_dx * 4.0
                    )
                    dweight = compute_dweight(model, w, dw, i, j, k)
                    new_F = new_F + wp.outer(grid_v, dweight)  # velocity gradient
        state.particle_v[p] = new_v
        # state.particle_x[p] = state.particle_x[p] + dt * new_v
        # state.particle_x[p] = state.particle_x[p] + dt * state.particle_v[p]
        # wp.atomic_add(state.particle_x, p, dt * state.particle_v[p])  # old one is this..
        wp.atomic_add(state.particle_x, p, dt * new_v)
        # debug
        # new_x = state.particle_x[p] + dt * state.particle_v[p]
        # state.particle_x[p] = new_x
        state.particle_C[p] = new_C
        I33 = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
        # explicit update of the deformation gradient: F' = (I + dt ∇v) F
        F_tmp = (I33 + new_F * dt) * state.particle_F[p]
        state.particle_F_trial[p] = F_tmp
        # debug for jelly
        # wp.atomic_add(state.particle_F_trial, p, new_F * dt * state.particle_F[p])
        if model.update_cov_with_F:
            update_cov(state, p, new_F, dt)


@wp.kernel
def g2p_differentiable(
    state: MPMStateStruct, next_state: MPMStateStruct, model: MPMModelStruct, dt: float
):
    """
    Grid-to-particle transfer written out-of-place for autodiff.

    Compute: next_state.particle_v, next_state.particle_x, next_state.particle_C, next_state.particle_F_trial
    """
    p = wp.tid()
    if state.particle_selection[p] == 0:
        grid_pos = state.particle_x[p] * model.inv_dx
        base_pos_x = wp.int(grid_pos[0] - 0.5)
        base_pos_y = wp.int(grid_pos[1] - 0.5)
        base_pos_z = wp.int(grid_pos[2] - 0.5)
        fx = grid_pos - wp.vec3(
            wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z)
        )
        wa = wp.vec3(1.5) - fx
        wb = fx - wp.vec3(1.0)
        wc = fx - wp.vec3(0.5)
        w = wp.mat33(
            wp.cw_mul(wa, wa) * 0.5,
            wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75),
            wp.cw_mul(wc, wc) * 0.5,
        )
        dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5))
        new_v = wp.vec3(0.0, 0.0, 0.0)
        # new_C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        new_C = wp.mat33(new_v, new_v, new_v)
        new_F = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        for i in range(0, 3):
            for j in range(0, 3):
                for k in range(0, 3):
                    ix = base_pos_x + i
                    iy = base_pos_y + j
                    iz = base_pos_z + k
                    dpos = wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx
                    weight = w[0, i] * w[1, j] * w[2, k]  # tricubic interpolation
                    grid_v = state.grid_v_out[ix, iy, iz]
                    new_v = (
                        new_v + grid_v * weight
                    )  # TODO, check gradient from static loop
                    new_C = new_C + wp.outer(grid_v, dpos) * (
                        weight * model.inv_dx * 4.0
                    )
                    dweight = compute_dweight(model, w, dw, i, j, k)
                    new_F = new_F + wp.outer(grid_v,
                        dweight)

        next_state.particle_v[p] = new_v
        # add clip here:
        # keep particles at least two cells away from the domain boundary
        new_x = state.particle_x[p] + dt * new_v
        dx = 1.0 / model.inv_dx
        a_min = dx * 2.0
        a_max = model.grid_lim - dx * 2.0
        new_x_clamped = wp.vec3(
            wp.clamp(new_x[0], a_min, a_max),
            wp.clamp(new_x[1], a_min, a_max),
            wp.clamp(new_x[2], a_min, a_max),
        )
        next_state.particle_x[p] = new_x_clamped
        # next_state.particle_x[p] = new_x
        next_state.particle_C[p] = new_C
        I33_1 = wp.vec3(1.0, 0.0, 0.0)
        I33_2 = wp.vec3(0.0, 1.0, 0.0)
        I33_3 = wp.vec3(0.0, 0.0, 1.0)
        I33 = wp.mat33(I33_1, I33_2, I33_3)
        # F' = (I + dt * grad_v) * F
        F_tmp = (I33 + new_F * dt) * state.particle_F[p]
        next_state.particle_F_trial[p] = F_tmp
        if 0:  # covariance update disabled in the differentiable path
            update_cov_differentiable(state, next_state, p, new_F, dt)


@wp.kernel
def clip_particle_x(state: MPMStateStruct, model: MPMModelStruct):
    # In-place clamp of particle positions to the interior of the grid
    # (two-cell margin on every side).
    p = wp.tid()
    posx = state.particle_x[p]
    if state.particle_selection[p] == 0:
        dx = 1.0 / model.inv_dx
        a_min = dx * 2.0
        a_max = model.grid_lim - dx * 2.0
        new_x = wp.vec3(
            wp.clamp(posx[0], a_min, a_max),
            wp.clamp(posx[1], a_min, a_max),
            wp.clamp(posx[2], a_min, a_max),
        )
        state.particle_x[
            p
        ] = new_x  # Warn: this gives wrong gradient, don't use this for backward


# compute (Kirchhoff) stress = stress(returnMap(F_trial))
@wp.kernel
def compute_stress_from_F_trial(
    state: MPMStateStruct, model: MPMModelStruct, dt: float
):
    """
    state.particle_F_trial => state.particle_F  (return mapping per material)
    state.particle_F => state.particle_stress   (stress-strain)
    TODO: check the gradient of SVD! is wp.svd3 differentiable? I guess so

    Material codes: 0 elastic/jelly (FCR), 1 metal (von Mises + StVK),
    2 sand (Drucker-Prager), 3 viscoplastic (StVK + VM), 5 von Mises with
    damage (FCR stress), 6 neo-Hookean.
    """
    p = wp.tid()
    if state.particle_selection[p] == 0:
        # apply return mapping
        if model.material == 1:  # metal
            state.particle_F[p] = von_mises_return_mapping(
                state.particle_F_trial[p], model, p
            )
        elif model.material == 2:  # sand
            state.particle_F[p] = sand_return_mapping(
                state.particle_F_trial[p], state, model, p
            )
        elif model.material == 3:  # visplas, with StVk+VM, no thickening
            state.particle_F[p] = viscoplasticity_return_mapping_with_StVK(
                state.particle_F_trial[p], model, p, dt
            )
        elif model.material == 5:
            state.particle_F[p] = von_mises_return_mapping_with_damage(
                state.particle_F_trial[p], model, p
            )
        else:  # elastic, jelly, or neo-hookean: no plasticity
            state.particle_F[p] = state.particle_F_trial[p]

        # also compute stress here
        J = wp.determinant(state.particle_F[p])
        U = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        V = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        sig = wp.vec3(0.0)
        stress = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
        wp.svd3(state.particle_F[p], U, sig, V)
        if model.material == 0 or model.material == 5:
            stress = kirchoff_stress_FCR(
                state.particle_F[p], U, V, J, model.mu[p], model.lam[p]
            )
        if model.material == 1:
            stress = kirchoff_stress_StVK(
                state.particle_F[p], U, V, sig, model.mu[p], model.lam[p]
            )
        if model.material == 2:
            stress = kirchoff_stress_drucker_prager(
                state.particle_F[p], U, V, sig, model.mu[p], model.lam[p]
            )
        if model.material == 3:
            # temporarily use stvk, subject to change
            stress = kirchoff_stress_StVK(
                state.particle_F[p], U, V, sig, model.mu[p], model.lam[p]
            )
        if model.material == 6:
            stress = kirchoff_stress_neoHookean(
                state.particle_F[p], U, V, J, sig, model.mu[p], model.lam[p]
            )
        # stress = (stress + wp.transpose(stress)) / 2.0  # enfore symmetry
        state.particle_stress[p] = (stress + wp.transpose(stress)) / 2.0


# @wp.kernel
# def compute_cov_from_F(state: MPMStateStruct, model: MPMModelStruct):
#     p = wp.tid()
#     F = state.particle_F_trial[p]
#     init_cov = wp.mat33(0.0)
#     init_cov[0, 0] =
state.particle_init_cov[p * 6] # init_cov[0, 1] = state.particle_init_cov[p * 6 + 1] # init_cov[0, 2] = state.particle_init_cov[p * 6 + 2] # init_cov[1, 0] = state.particle_init_cov[p * 6 + 1] # init_cov[1, 1] = state.particle_init_cov[p * 6 + 3] # init_cov[1, 2] = state.particle_init_cov[p * 6 + 4] # init_cov[2, 0] = state.particle_init_cov[p * 6 + 2] # init_cov[2, 1] = state.particle_init_cov[p * 6 + 4] # init_cov[2, 2] = state.particle_init_cov[p * 6 + 5] # cov = F * init_cov * wp.transpose(F) # state.particle_cov[p * 6] = cov[0, 0] # state.particle_cov[p * 6 + 1] = cov[0, 1] # state.particle_cov[p * 6 + 2] = cov[0, 2] # state.particle_cov[p * 6 + 3] = cov[1, 1] # state.particle_cov[p * 6 + 4] = cov[1, 2] # state.particle_cov[p * 6 + 5] = cov[2, 2] # @wp.kernel # def compute_R_from_F(state: MPMStateStruct, model: MPMModelStruct): # p = wp.tid() # F = state.particle_F_trial[p] # # polar svd decomposition # U = wp.mat33(0.0) # V = wp.mat33(0.0) # sig = wp.vec3(0.0) # wp.svd3(F, U, sig, V) # if wp.determinant(U) < 0.0: # U[0, 2] = -U[0, 2] # U[1, 2] = -U[1, 2] # U[2, 2] = -U[2, 2] # if wp.determinant(V) < 0.0: # V[0, 2] = -V[0, 2] # V[1, 2] = -V[1, 2] # V[2, 2] = -V[2, 2] # # compute rotation matrix # R = U * wp.transpose(V) # state.particle_R[p] = wp.transpose(R) # particle R is removed @wp.kernel def add_damping_via_grid(state: MPMStateStruct, scale: float): grid_x, grid_y, grid_z = wp.tid() # state.grid_v_out[grid_x, grid_y, grid_z] = ( # state.grid_v_out[grid_x, grid_y, grid_z] * scale # ) wp.atomic_sub( state.grid_v_out, grid_x, grid_y, grid_z, (1.0 - scale) * state.grid_v_out[grid_x, grid_y, grid_z], ) @wp.kernel def apply_additional_params( state: MPMStateStruct, model: MPMModelStruct, params_modifier: MaterialParamsModifier, ): p = wp.tid() pos = state.particle_x[p] if ( pos[0] > params_modifier.point[0] - params_modifier.size[0] and pos[0] < params_modifier.point[0] + params_modifier.size[0] and pos[1] > params_modifier.point[1] - params_modifier.size[1] 
and pos[1] < params_modifier.point[1] + params_modifier.size[1] and pos[2] > params_modifier.point[2] - params_modifier.size[2] and pos[2] < params_modifier.point[2] + params_modifier.size[2] ): model.E[p] = params_modifier.E model.nu[p] = params_modifier.nu state.particle_density[p] = params_modifier.density @wp.kernel def selection_add_impulse_on_particles( state: MPMStateStruct, impulse_modifier: Impulse_modifier ): p = wp.tid() offset = state.particle_x[p] - impulse_modifier.point if ( wp.abs(offset[0]) < impulse_modifier.size[0] and wp.abs(offset[1]) < impulse_modifier.size[1] and wp.abs(offset[2]) < impulse_modifier.size[2] ): impulse_modifier.mask[p] = 1 else: impulse_modifier.mask[p] = 0 @wp.kernel def selection_enforce_particle_velocity_translation( state: MPMStateStruct, velocity_modifier: ParticleVelocityModifier ): p = wp.tid() offset = state.particle_x[p] - velocity_modifier.point if ( wp.abs(offset[0]) < velocity_modifier.size[0] and wp.abs(offset[1]) < velocity_modifier.size[1] and wp.abs(offset[2]) < velocity_modifier.size[2] ): velocity_modifier.mask[p] = 1 else: velocity_modifier.mask[p] = 0 @wp.kernel def selection_enforce_particle_velocity_cylinder( state: MPMStateStruct, velocity_modifier: ParticleVelocityModifier ): p = wp.tid() offset = state.particle_x[p] - velocity_modifier.point vertical_distance = wp.abs(wp.dot(offset, velocity_modifier.normal)) horizontal_distance = wp.length( offset - wp.dot(offset, velocity_modifier.normal) * velocity_modifier.normal ) if ( vertical_distance < velocity_modifier.half_height_and_radius[0] and horizontal_distance < velocity_modifier.half_height_and_radius[1] ): velocity_modifier.mask[p] = 1 else: velocity_modifier.mask[p] = 0 @wp.kernel def compute_position_l2_loss( mpm_state: MPMStateStruct, gt_pos: wp.array(dtype=wp.vec3), loss: wp.array(dtype=float), ): tid = wp.tid() pos = mpm_state.particle_x[tid] pos_gt = gt_pos[tid] # l1_diff = wp.abs(pos - pos_gt) l2 = wp.length(pos - pos_gt) wp.atomic_add(loss, 
0, l2) @wp.kernel def aggregate_grad(x: wp.array(dtype=float), grad: wp.array(dtype=float)): tid = wp.tid() # gradient descent step wp.atomic_add(x, 0, grad[tid]) @wp.kernel def set_F_C_p2g( state: MPMStateStruct, model: MPMModelStruct, target_pos: wp.array(dtype=wp.vec3) ): p = wp.tid() if state.particle_selection[p] == 0: grid_pos = state.particle_x[p] * model.inv_dx base_pos_x = wp.int(grid_pos[0] - 0.5) base_pos_y = wp.int(grid_pos[1] - 0.5) base_pos_z = wp.int(grid_pos[2] - 0.5) fx = grid_pos - wp.vec3( wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z) ) wa = wp.vec3(1.5) - fx wb = fx - wp.vec3(1.0) wc = fx - wp.vec3(0.5) w = wp.mat33( wp.cw_mul(wa, wa) * 0.5, wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75), wp.cw_mul(wc, wc) * 0.5, ) # p2g for displacement particle_disp = target_pos[p] - state.particle_x[p] for i in range(0, 3): for j in range(0, 3): for k in range(0, 3): ix = base_pos_x + i iy = base_pos_y + j iz = base_pos_z + k weight = w[0, i] * w[1, j] * w[2, k] # tricubic interpolation v_in_add = weight * state.particle_mass[p] * particle_disp wp.atomic_add(state.grid_v_in, ix, iy, iz, v_in_add) wp.atomic_add( state.grid_m, ix, iy, iz, weight * state.particle_mass[p] ) @wp.kernel def set_F_C_g2p(state: MPMStateStruct, model: MPMModelStruct): p = wp.tid() if state.particle_selection[p] == 0: grid_pos = state.particle_x[p] * model.inv_dx base_pos_x = wp.int(grid_pos[0] - 0.5) base_pos_y = wp.int(grid_pos[1] - 0.5) base_pos_z = wp.int(grid_pos[2] - 0.5) fx = grid_pos - wp.vec3( wp.float(base_pos_x), wp.float(base_pos_y), wp.float(base_pos_z) ) wa = wp.vec3(1.5) - fx wb = fx - wp.vec3(1.0) wc = fx - wp.vec3(0.5) w = wp.mat33( wp.cw_mul(wa, wa) * 0.5, wp.vec3(0.0, 0.0, 0.0) - wp.cw_mul(wb, wb) + wp.vec3(0.75), wp.cw_mul(wc, wc) * 0.5, ) dw = wp.mat33(fx - wp.vec3(1.5), -2.0 * (fx - wp.vec3(1.0)), fx - wp.vec3(0.5)) new_C = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) new_F = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0) # g2p for C and F for i in range(0, 3): for j in range(0, 3): for k in range(0, 3): ix = base_pos_x + i iy = base_pos_y + j iz = base_pos_z + k dpos = wp.vec3(wp.float(i), wp.float(j), wp.float(k)) - fx weight = w[0, i] * w[1, j] * w[2, k] # tricubic interpolation grid_v = state.grid_v_out[ix, iy, iz] new_C = new_C + wp.outer(grid_v, dpos) * ( weight * model.inv_dx * 4.0 ) dweight = compute_dweight(model, w, dw, i, j, k) new_F = new_F + wp.outer(grid_v, dweight) # C should still be zero.. # state.particle_C[p] = new_C I33 = wp.mat33(1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) F_tmp = I33 + new_F state.particle_F_trial[p] = F_tmp if model.update_cov_with_F: update_cov(state, p, new_F, 1.0) @wp.kernel def compute_posloss_with_grad( mpm_state: MPMStateStruct, gt_pos: wp.array(dtype=wp.vec3), grad: wp.array(dtype=wp.vec3), dt: float, loss: wp.array(dtype=float), ): tid = wp.tid() pos = mpm_state.particle_x[tid] pos_gt = gt_pos[tid] # l1_diff = wp.abs(pos - pos_gt) # l2 = wp.length(pos - (pos_gt - grad[tid] * dt)) diff = pos - (pos_gt - grad[tid] * dt) l2 = wp.dot(diff, diff) wp.atomic_add(loss, 0, l2) @wp.kernel def compute_veloloss_with_grad( mpm_state: MPMStateStruct, gt_pos: wp.array(dtype=wp.vec3), grad: wp.array(dtype=wp.vec3), dt: float, loss: wp.array(dtype=float), ): tid = wp.tid() pos = mpm_state.particle_v[tid] pos_gt = gt_pos[tid] # l1_diff = wp.abs(pos - pos_gt) # l2 = wp.length(pos - (pos_gt - grad[tid] * dt)) diff = pos - (pos_gt - grad[tid] * dt) l2 = wp.dot(diff, diff) wp.atomic_add(loss, 0, l2) @wp.kernel def compute_Floss_with_grad( mpm_state: MPMStateStruct, gt_mat: wp.array(dtype=wp.mat33), grad: wp.array(dtype=wp.mat33), dt: float, loss: wp.array(dtype=float), ): tid = wp.tid() mat_ = mpm_state.particle_F_trial[tid] mat_gt = gt_mat[tid] mat_gt = mat_gt - grad[tid] * dt # l1_diff = wp.abs(pos - pos_gt) mat_diff = mat_ - mat_gt l2 = wp.ddot(mat_diff, mat_diff) # l2 = wp.sqrt( # mat_diff[0, 0] ** 2.0 # + mat_diff[0, 1] ** 2.0 # + mat_diff[0, 2] 
** 2.0 # + mat_diff[1, 0] ** 2.0 # + mat_diff[1, 1] ** 2.0 # + mat_diff[1, 2] ** 2.0 # + mat_diff[2, 0] ** 2.0 # + mat_diff[2, 1] ** 2.0 # + mat_diff[2, 2] ** 2.0 # ) wp.atomic_add(loss, 0, l2) @wp.kernel def compute_Closs_with_grad( mpm_state: MPMStateStruct, gt_mat: wp.array(dtype=wp.mat33), grad: wp.array(dtype=wp.mat33), dt: float, loss: wp.array(dtype=float), ): tid = wp.tid() mat_ = mpm_state.particle_C[tid] mat_gt = gt_mat[tid] mat_gt = mat_gt - grad[tid] * dt # l1_diff = wp.abs(pos - pos_gt) mat_diff = mat_ - mat_gt l2 = wp.ddot(mat_diff, mat_diff) wp.atomic_add(loss, 0, l2) ================================================ FILE: projects/uncleaned_train/thirdparty_code/warp_mpm/warp_utils.py ================================================ import warp as wp import ctypes from typing import Optional from warp.torch import ( dtype_from_torch, device_from_torch, dtype_is_compatible, from_torch, ) def from_torch_safe(t, dtype=None, requires_grad=None, grad=None): """Wrap a PyTorch tensor to a Warp array without copying the data. Args: t (torch.Tensor): The torch tensor to wrap. dtype (warp.dtype, optional): The target data type of the resulting Warp array. Defaults to the tensor value type mapped to a Warp array value type. requires_grad (bool, optional): Whether the resulting array should wrap the tensor's gradient, if it exists (the grad tensor will be allocated otherwise). Defaults to the tensor's `requires_grad` value. Returns: warp.array: The wrapped array. 
""" if dtype is None: dtype = dtype_from_torch(t.dtype) elif not dtype_is_compatible(t.dtype, dtype): raise RuntimeError(f"Incompatible data types: {t.dtype} and {dtype}") # get size of underlying data type to compute strides ctype_size = ctypes.sizeof(dtype._type_) shape = tuple(t.shape) strides = tuple(s * ctype_size for s in t.stride()) # if target is a vector or matrix type # then check if trailing dimensions match # the target type and update the shape if hasattr(dtype, "_shape_"): dtype_shape = dtype._shape_ dtype_dims = len(dtype._shape_) if dtype_dims > len(shape) or dtype_shape != shape[-dtype_dims:]: raise RuntimeError( f"Could not convert Torch tensor with shape {shape} to Warp array with dtype={dtype}, ensure that source inner shape is {dtype_shape}" ) # ensure the inner strides are contiguous stride = ctype_size for i in range(dtype_dims): if strides[-i - 1] != stride: raise RuntimeError( f"Could not convert Torch tensor with shape {shape} to Warp array with dtype={dtype}, because the source inner strides are not contiguous" ) stride *= dtype_shape[-i - 1] shape = tuple(shape[:-dtype_dims]) or (1,) strides = tuple(strides[:-dtype_dims]) or (ctype_size,) requires_grad = t.requires_grad if requires_grad is None else requires_grad if grad is not None: if not isinstance(grad, wp.array): import torch if isinstance(grad, torch.Tensor): grad = from_torch(grad, dtype=dtype) else: raise ValueError(f"Invalid gradient type: {type(grad)}") elif requires_grad: # wrap the tensor gradient, allocate if necessary if t.grad is None: # allocate a zero-filled gradient tensor if it doesn't exist import torch t.grad = torch.zeros_like(t, requires_grad=False) grad = from_torch(t.grad, dtype=dtype) a = wp.types.array( ptr=t.data_ptr(), dtype=dtype, shape=shape, strides=strides, device=device_from_torch(t.device), copy=False, owner=False, grad=grad, requires_grad=requires_grad, ) # save a reference to the source tensor, otherwise it will be deallocated a._tensor = t return a 
class MyTape(wp.Tape):
    """wp.Tape subclass with struct-aware adjoint lookup."""

    # returns the adjoint of a kernel parameter
    def get_adjoint(self, a):
        if not wp.types.is_array(a) and not isinstance(a, wp.codegen.StructInstance):
            # if input is a simple type (e.g.: float, vec3, etc) then
            # no gradient needed (we only return gradients through arrays and structs)
            return a
        elif wp.types.is_array(a) and a.grad:
            # keep track of all gradients used by the tape (for zeroing)
            # ignore the scalar loss since we don't want to clear its grad
            self.gradients[a] = a.grad
            return a.grad
        elif isinstance(a, wp.codegen.StructInstance):
            # build an adjoint struct: array fields get their grad arrays
            # (allocating zeros where missing), non-array fields are copied.
            adj = a._cls()
            for name, _ in a._cls.ctype._fields_:
                if name.startswith("_"):
                    continue
                if isinstance(a._cls.vars[name].type, wp.array):
                    arr = getattr(a, name)
                    if arr is None:
                        # NOTE(review): None array fields are silently skipped,
                        # leaving the adjoint field at its default.
                        continue
                    if arr.grad:
                        grad = self.gradients[arr] = arr.grad
                    else:
                        grad = wp.zeros_like(arr)
                    setattr(adj, name, grad)
                else:
                    setattr(adj, name, getattr(a, name))
            self.gradients[a] = adj
            return adj

        return None


# from https://github.com/PingchuanMa/NCLaw/blob/main/nclaw/warp/tape.py
class CondTape(object):
    """Context manager that enters `tape` only when it is not None and
    `cond` is truthy; otherwise a no-op."""

    def __init__(self, tape: Optional[MyTape], cond: bool = True) -> None:
        self.tape = tape
        self.cond = cond

    def __enter__(self):
        if self.tape is not None and self.cond:
            self.tape.__enter__()

    def __exit__(self, exc_type, exc_value, traceback):
        if self.tape is not None and self.cond:
            self.tape.__exit__(exc_type, exc_value, traceback)



================================================
FILE: requirements.txt
================================================
accelerate==0.25.0
decord==0.6.0
einops==0.7.0
fire==0.5.0
imageio==2.34.0
# NOTE(review): ipython is pinned twice with conflicting versions
# (8.12.3 and 8.18.1) — pip will fail or pick one arbitrarily; drop one pin.
ipython==8.12.3
ipython==8.18.1
jaxtyping==0.2.28
kmeans_gpu==0.0.5
matplotlib==3.7.2
mediapy==1.2.0
numpy==1.24.2
omegaconf==2.1.1
open3d==0.18.0
opencv_python==4.6.0.66
opencv_python_headless==4.9.0.80
# NOTE(review): Pillow is pinned twice with conflicting versions
# (9.5.0 and 10.3.0) — drop one pin.
Pillow==9.5.0
Pillow==10.3.0
plyfile==1.0.3
point_cloud_utils==0.30.2
pyfqmr==0.2.0
pygltflib==1.16.2
PyMCubes==0.1.4
pymeshlab==2023.12
safetensors==0.3.3
scikit_learn==1.3.2
scipy==1.13.0
simple_knn==0.0.0
# NOTE(review): +cu121 local version requires the PyTorch CUDA index URL.
torch==2.2.2+cu121
torchvision==0.17.2
tqdm==4.65.0
trimesh==4.0.8
warp_lang==0.10.1
xatlas==0.0.9


================================================
FILE: setup.py
================================================
# Minimal packaging script: installs every package found under the repo root
# (i.e. the `physdreamer` package tree) as `physdreamer` v0.0.1.
from setuptools import setup, find_packages

setup(
    name="physdreamer",
    version="0.0.1",
    packages=find_packages(),
)