Repository: HLinChen/VCR-GauS Branch: main Commit: aa715d19bfac Files: 99 Total size: 474.8 KB Directory structure: gitextract_oa0m8dnw/ ├── .gitmodules ├── LICENSE.md ├── README.md ├── arguments/ │ └── __init__.py ├── bash_scripts/ │ ├── 0_train.sh │ ├── 1_preprocess_tnt.sh │ ├── 2_extract_normal_dsine.sh │ ├── 3_extract_mask.sh │ ├── 4_extract_normal_geow.sh │ ├── convert.sh │ └── install.sh ├── configs/ │ ├── 360_v2/ │ │ └── base.yaml │ ├── config.py │ ├── config_base.yaml │ ├── dtu/ │ │ ├── base.yaml │ │ └── dtu_scan24.yaml │ ├── reconstruct.yaml │ ├── scannetpp/ │ │ └── base.yaml │ └── tnt/ │ ├── Barn.yaml │ ├── Caterpillar.yaml │ ├── Courthouse.yaml │ ├── Ignatius.yaml │ ├── Meetingroom.yaml │ ├── Truck.yaml │ └── base.yaml ├── environment.yml ├── evaluation/ │ ├── crop_mesh.py │ ├── eval_dtu/ │ │ ├── eval.py │ │ ├── evaluate_single_scene.py │ │ └── render_utils.py │ ├── eval_tnt.py │ ├── full_eval.py │ ├── lpipsPyTorch/ │ │ ├── __init__.py │ │ └── modules/ │ │ ├── lpips.py │ │ ├── networks.py │ │ └── utils.py │ ├── metrics.py │ ├── render.py │ └── tnt_eval/ │ ├── README.md │ ├── config.py │ ├── evaluation.py │ ├── plot.py │ ├── registration.py │ ├── requirements.txt │ ├── run.py │ ├── trajectory_io.py │ └── util.py ├── gaussian_renderer/ │ ├── __init__.py │ └── network_gui.py ├── process_data/ │ ├── convert.py │ ├── convert_360_to_json.py │ ├── convert_data_to_json.py │ ├── convert_dtu_to_json.py │ ├── convert_tnt_to_json.py │ ├── extract_mask.py │ ├── extract_normal.py │ ├── extract_normal_geo.py │ ├── visualize_colmap.ipynb │ └── visualize_transforms.ipynb ├── pyproject.toml ├── python_scripts/ │ ├── run_base.py │ ├── run_dtu.py │ ├── run_mipnerf360.py │ ├── run_tnt.py │ ├── show_360.py │ ├── show_dtu.py │ └── show_tnt.py ├── requirements.txt ├── scene/ │ ├── __init__.py │ ├── appearance_network.py │ ├── cameras.py │ ├── colmap_loader.py │ ├── dataset_readers.py │ └── gaussian_model.py ├── tools/ │ ├── __init__.py │ ├── camera.py │ ├── camera_utils.py │ 
├── crop_mesh.py │ ├── denoise_pcd.py │ ├── depth2mesh.py │ ├── distributed.py │ ├── general_utils.py │ ├── graphics_utils.py │ ├── image_utils.py │ ├── loss_utils.py │ ├── math_utils.py │ ├── mcube_utils.py │ ├── mesh_utils.py │ ├── normal_utils.py │ ├── prune.py │ ├── render_utils.py │ ├── semantic_id.py │ ├── sh_utils.py │ ├── system_utils.py │ ├── termcolor.py │ ├── visualization.py │ └── visualize.py ├── train.py └── trainer.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitmodules ================================================ [submodule "submodules/simple-knn"] path = submodules/simple-knn url = https://gitlab.inria.fr/bkerbl/simple-knn.git [submodule "submodules/diff-gaussian-rasterization"] path = submodules/diff-gaussian-rasterization url = https://github.com/HLinChen/diff-gaussian-rasterization [submodule "SIBR_viewers"] path = SIBR_viewers url = https://gitlab.inria.fr/sibr/sibr_core.git [submodule "submodules/colmap"] path = submodules/colmap url = https://github.com/colmap/colmap.git ================================================ FILE: LICENSE.md ================================================ Gaussian-Splatting License =========================== **Inria** and **the Max Planck Institut for Informatik (MPII)** hold all the ownership rights on the *Software* named **gaussian-splatting**. The *Software* is in the process of being registered with the Agence pour la Protection des Programmes (APP). The *Software* is still being developed by the *Licensor*. *Licensor*'s goal is to allow the research community to use, test and evaluate the *Software*. ## 1. Definitions *Licensee* means any person or entity that uses the *Software* and distributes its *Work*. *Licensor* means the owners of the *Software*, i.e Inria and MPII *Software* means the original work of authorship made available under this License ie gaussian-splatting. 
*Work* means the *Software* and any additions to or derivative works of the *Software* that are made available under this License. ## 2. Purpose This license is intended to define the rights granted to the *Licensee* by Licensors under the *Software*. ## 3. Rights granted For the above reasons Licensors have decided to distribute the *Software*. Licensors grant non-exclusive rights to use the *Software* for research purposes to research users (both academic and industrial), free of charge, without right to sublicense.. The *Software* may be used "non-commercially", i.e., for research and/or evaluation purposes only. Subject to the terms and conditions of this License, you are granted a non-exclusive, royalty-free, license to reproduce, prepare derivative works of, publicly display, publicly perform and distribute its *Work* and any resulting derivative works in any form. ## 4. Limitations **4.1 Redistribution.** You may reproduce or distribute the *Work* only if (a) you do so under this License, (b) you include a complete copy of this License with your distribution, and (c) you retain without modification any copyright, patent, trademark, or attribution notices that are present in the *Work*. **4.2 Derivative Works.** You may specify that additional or different terms apply to the use, reproduction, and distribution of your derivative works of the *Work* ("Your Terms") only if (a) Your Terms provide that the use limitation in Section 2 applies to your derivative works, and (b) you identify the specific derivative works that are subject to Your Terms. Notwithstanding Your Terms, this License (including the redistribution requirements in Section 3.1) will continue to apply to the *Work* itself. **4.3** Any other use without of prior consent of Licensors is prohibited. 
Research users explicitly acknowledge having received from Licensors all information allowing to appreciate the adequacy between of the *Software* and their needs and to undertake all necessary precautions for its execution and use. **4.4** The *Software* is provided both as a compiled library file and as source code. In case of using the *Software* for a publication or other results obtained through the use of the *Software*, users are strongly encouraged to cite the corresponding publications as explained in the documentation of the *Software*. ## 5. Disclaimer THE USER CANNOT USE, EXPLOIT OR DISTRIBUTE THE *SOFTWARE* FOR COMMERCIAL PURPOSES WITHOUT PRIOR AND EXPLICIT CONSENT OF LICENSORS. YOU MUST CONTACT INRIA FOR ANY UNAUTHORIZED USE: stip-sophia.transfert@inria.fr . ANY SUCH ACTION WILL CONSTITUTE A FORGERY. THIS *SOFTWARE* IS PROVIDED "AS IS" WITHOUT ANY WARRANTIES OF ANY NATURE AND ANY EXPRESS OR IMPLIED WARRANTIES, WITH REGARDS TO COMMERCIAL USE, PROFESSIONNAL USE, LEGAL OR NOT, OR OTHER, OR COMMERCIALISATION OR ADAPTATION. UNLESS EXPLICITLY PROVIDED BY LAW, IN NO EVENT, SHALL INRIA OR THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES, LOSS OF USE, DATA, OR PROFITS OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING FROM, OUT OF OR IN CONNECTION WITH THE *SOFTWARE* OR THE USE OR OTHER DEALINGS IN THE *SOFTWARE*. ================================================ FILE: README.md ================================================

VCR-GauS: View Consistent Depth-Normal Regularizer for Gaussian Surface Reconstruction

Hanlin Chen, Fangyin Wei, Chen Li, Tianxin Huang, Yunsong Wang, Gim Hee Lee

NeurIPS 2024

arXiv | Project Page

Logo

VCR-GauS formulates a novel multi-view D-Normal regularizer that enables full optimization of the Gaussian geometric parameters to achieve better surface reconstruction. We further design a confidence term to weigh our D-Normal regularizer to mitigate inconsistencies of normal predictions across multiple views.


# Updates * **[2024.10.31]**: We uploaded a new version to arXiv, adding theoretical proofs and visualization results for the D-Normal Regularizer. * **[2024.09.24]**: VCR-GauS is accepted to NeurIPS 2024. # Installation Clone the repository and create an anaconda environment using ``` git clone https://github.com/HLinChen/VCR-GauS.git --recursive cd VCR-GauS git pull --recurse-submodules env=vcr conda create -n $env -y python=3.10 conda activate $env pip install -e ".[train]" # you can specify your own cuda path export CUDA_HOME=/usr/local/cuda-11.8 pip install -r requirements.txt ``` We also uploaded a prebuilt anaconda environment [here](https://huggingface.co/hanlin-chen/VCR-GauS/resolve/main/vcr.zip?download=true); you can download it, unzip it, and put it in your_anaconda_path/envs/. To evaluate TNT with the official scripts, you need to build a new environment with open3d==0.10: ``` env=f1eval conda create -n $env -y python=3.8 conda activate $env pip install -e ".[f1eval]" ``` To extract normal maps with [DSINE](https://baegwangbin.github.io/DSINE/), you need to build a new environment: ``` conda create --name dsine python=3.10 conda activate dsine conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia python -m pip install geffnet ``` Similar to Gaussian Splatting, we also use colmap to process data and you can follow [COLMAP website](https://colmap.github.io/) to install it. # Dataset ## Tanks and Temples dataset You can download the preprocessed Tanks and Temples dataset from [here](https://huggingface.co/hanlin-chen/VCR-GauS/resolve/main/tnt.zip?download=true). Or preprocess it yourself: Download the data from the [Tanks and Temples](https://tanksandtemples.org/download/) website. You will also need to download additional [COLMAP/camera/alignment](https://drive.google.com/file/d/1jAr3IDvhVmmYeDWi0D_JfgiHcl70rzVE/view?resourcekey=) and the images of each scene. 
The file structure should look like this (you need to move the downloaded images to folder `images_raw`): ``` tanks_and_temples ├─ Barn │ ├─ Barn_COLMAP_SfM.log (camera poses) │ ├─ Barn.json (cropfiles) │ ├─ Barn.ply (ground-truth point cloud) │ ├─ Barn_trans.txt (colmap-to-ground-truth transformation) │ └─ images_raw (raw input images downloaded from Tanks and Temples website) │ ├─ 000001.png │ ├─ 000002.png │ ... ├─ Caterpillar │ ├─ ... ... ``` #### 1. Colmap and bounding box json Run the following command to generate json and colmap files: ```bash # Modify --tnt_path to be the Tanks and Temples root directory. sh bash_scripts/1_preprocess_tnt.sh ``` #### 2. Normal maps You need to download the [code](https://github.com/baegwangbin/DSINE) and [model weight](https://drive.google.com/drive/folders/1t3LMJIIrSnCGwOEf53Cyg0lkSXd3M4Hm) of DSINE first. Then, modify **CODE_PATH** to be the DSINE root directory, **CKPT** to be the DSINE model path, and **DATADIR** to be the TNT root directory in the bash script. Run the following command to generate normal maps: ```bash sh bash_scripts/2_extract_normal_dsine.sh ``` #### 3. Semantic masks (optional) If you don't want to use the semantic masks, you can set **optim.loss_weight.semantic=0** and skip the mask generation. You need to download the [code](https://github.com/IDEA-Research/Grounded-Segment-Anything) and model of Grounded-SAM first. Then, install the environment based on 'Install without Docker' in the [website](https://github.com/IDEA-Research/Grounded-Segment-Anything). Next, modify **GSAM_PATH** to be the GSAM root directory and **DATADIR** to be the TNT root directory in the bash script. Run the following command to generate semantic masks: ```bash sh bash_scripts/3_extract_mask.sh ``` ## Other datasets Please download the Mip-NeRF 360 dataset from the official [website](https://jonbarron.info/mipnerf360/) and the preprocessed DTU dataset from [2DGS](https://drive.google.com/drive/folders/1SJFgt8qhQomHX55Q4xSvYE2C6-8tFll9). 
Then extract normal maps with DSINE following the above scripts. You can also use [GeoWizard](https://github.com/fuxiao0719/GeoWizard) to extract normal maps by following the script 'bash_scripts/4_extract_normal_geow.sh'; please install the corresponding environment and download the code as well as model weights first. # Training and Evaluation ## From scratch: ``` # you might need to update the data path in the script accordingly # Tanks and Temples dataset python python_scripts/run_tnt.py # Mip-NeRF 360 dataset python python_scripts/run_mipnerf360.py ``` ## Only eval the metrics We have uploaded the extracted meshes; you can download and evaluate them yourself ([TNT](https://huggingface.co/hanlin-chen/VCR-GauS/resolve/main/tnt_mesh.zip?download=true) and [DTU](https://huggingface.co/Chiller3/VCR-GauS/resolve/main/dtu_mesh.zip?download=true)). You might need to update the **mesh and data path** in the script accordingly. Also set **do_train** and **do_extract_mesh** to False. ``` # Tanks and Temples dataset python python_scripts/run_tnt.py # DTU dataset python python_scripts/run_dtu.py ``` ## Additional regularizations: We also incorporate some regularizations, like depth distortion loss and normal consistency loss, following [2DGS](https://surfsplatting.github.io/) and [GOF](https://niujinshuchong.github.io/gaussian-opacity-fields/). You can enable them as follows: - normal consistency loss: set optim.loss_weight.consistent_normal > 0; - depth distortion loss: 1. set optim.loss_weight.depth_var > 0 2. set NUM_DIST = 1 in submodules/diff-gaussian-rasterization/cuda_rasterizer/config.h, and reinstall diff-gaussian-rasterization # Custom Dataset We use the same data format as 3DGS; please follow [here](https://github.com/graphdeco-inria/gaussian-splatting?tab=readme-ov-file#processing-your-own-scenes) to prepare your dataset. Then you can train your model and extract a mesh. 
``` # Generate bounding box python process_data/convert_data_to_json.py \ --scene_type outdoor \ --data_dir /your/data/path # Extract normal maps # Use DSINE: python -W ignore process_data/extract_normal.py \ --dsine_path /your/dsine/code/path \ --ckpt /your/ckpt/path \ --img_path /your/data/path/images \ --intrins_path /your/data/path/ \ --output_path /your/data/path/normals # Or use GeoWizard python process_data/extract_normal_geo.py \ --code_path ${CODE_PATH} \ --input_dir /your/data/path/images/ \ --output_dir /your/data/path/ \ --ensemble_size 3 \ --denoise_steps 10 \ --seed 0 \ --domain ${DOMAIN_TYPE} # outdoor indoor object # training # --model.resolution=2 for using downsampled images with factor 2 # --model.use_decoupled_appearance=True to enable decoupled appearance modeling if your images have changing lighting conditions python train.py \ --config=configs/reconstruct.yaml \ --logdir=/your/log/path/ \ --model.source_path=/your/data/path/ \ --model.data_device=cpu \ --model.resolution=2 \ --wandb \ --wandb_name vcr-gaus # extract the mesh after training python tools/depth2mesh.py \ --voxel_size 5e-3 \ --max_depth 8 \ --clean \ --cfg_path /your/gaussian/path/config.yaml ``` # Acknowledgements This project is built upon [3DGS](https://github.com/graphdeco-inria/gaussian-splatting). Evaluation scripts for the DTU and Tanks and Temples datasets are taken from [DTUeval-python](https://github.com/jzhangbs/DTUeval-python) and [TanksAndTemples](https://github.com/isl-org/TanksAndTemples/tree/master/python_toolbox/evaluation) respectively. We also utilize the normal estimation [DSINE](https://github.com/baegwangbin/DSINE) as well as [GeoWizard](https://fuxiao0719.github.io/projects/geowizard/), and semantic segmentation [SAM](https://github.com/facebookresearch/segment-anything) and [Grounded-SAM](https://github.com/IDEA-Research/Grounded-Segment-Anything?tab=readme-ov-file#install-without-docker). 
In addition, we use the pruning method in [LightGaussian](https://lightgaussian.github.io/). We thank all the authors for their great work and repos. # Citation If you find our code or paper useful, please cite ```bibtex @article{chen2024vcr, author = {Chen, Hanlin and Wei, Fangyin and Li, Chen and Huang, Tianxin and Wang, Yunsong and Lee, Gim Hee}, title = {VCR-GauS: View Consistent Depth-Normal Regularizer for Gaussian Surface Reconstruction}, journal = {arXiv preprint arXiv:2406.05774}, year = {2024}, } ``` If you find the flattened 3D Gaussians useful, please kindly cite ```bibtex @article{chen2023neusg, title={Neusg: Neural implicit surface reconstruction with 3d gaussian splatting guidance}, author={Chen, Hanlin and Li, Chen and Lee, Gim Hee}, journal={arXiv preprint arXiv:2312.00846}, year={2023} } ``` ================================================ FILE: arguments/__init__.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. 
#
# For inquiries contact george.drettakis@inria.fr
#

from argparse import ArgumentParser, Namespace
import sys
import os


class GroupParams:
    """Plain attribute container returned by ``ParamGroup.extract``."""
    pass


class ParamGroup:
    """Register every instance attribute as a command-line option.

    Subclasses assign their defaults as attributes *before* calling
    ``super().__init__``. Each attribute becomes a ``--name`` option; an
    attribute whose name starts with ``_`` additionally gets a one-letter
    shorthand taken from the first character after the underscore.
    """

    def __init__(self, parser: ArgumentParser, name: str, fill_none=False):
        """Add one argument per attribute of ``self`` to a new group.

        Args:
            parser: Parser that receives the argument group.
            name: Title of the argument group.
            fill_none: When true, every option defaults to ``None`` instead
                of the attribute's value.
        """
        group = parser.add_argument_group(name)
        for key, value in vars(self).items():
            shorthand = key.startswith("_")
            if shorthand:
                key = key[1:]
            value_type = type(value)
            default = None if fill_none else value

            flags = ["--" + key]
            if shorthand:
                flags.append("-" + key[0:1])

            if value_type is bool:
                # Booleans are flags: present -> True.
                group.add_argument(*flags, default=default, action="store_true")
            else:
                group.add_argument(*flags, default=default, type=value_type)

    def extract(self, args):
        """Copy the attributes of ``args`` that belong to this group.

        An attribute belongs to the group when its name, or its name prefixed
        with ``_``, is an instance attribute of ``self``.

        Returns:
            A ``GroupParams`` carrying the selected values.
        """
        group = GroupParams()
        own = vars(self)
        for arg_name, arg_value in vars(args).items():
            if arg_name in own or ("_" + arg_name) in own:
                setattr(group, arg_name, arg_value)
        return group


class ModelParams(ParamGroup):
    """Options registered under the "Loading Parameters" group."""

    def __init__(self, parser, sentinel=False):
        self.sh_degree = 3
        self._source_path = ""
        self._model_path = ""
        self._images = "images"
        self._resolution = -1
        self._white_background = False
        self.data_device = "cuda"
        self.eval = False
        super().__init__(parser, "Loading Parameters", sentinel)

    def extract(self, args):
        """Extract the group and make ``source_path`` absolute."""
        g = super().extract(args)
        g.source_path = os.path.abspath(g.source_path)
        return g


class PipelineParams(ParamGroup):
    """Options registered under the "Pipeline Parameters" group."""

    def __init__(self, parser):
        self.convert_SHs_python = False
        self.compute_cov3D_python = False
        self.debug = False
        super().__init__(parser, "Pipeline Parameters")


class OptimizationParams(ParamGroup):
    """Options registered under the "Optimization Parameters" group."""

    def __init__(self, parser):
        self.iterations = 30_000
        self.position_lr_init = 0.00016
        self.position_lr_final = 0.0000016
        self.position_lr_delay_mult = 0.01
        self.position_lr_max_steps = 30_000
        self.feature_lr = 0.0025
        self.opacity_lr = 0.05
        self.scaling_lr = 0.005
        self.rotation_lr = 0.001
        self.percent_dense = 0.01
        self.lambda_dssim = 0.2
        self.densification_interval = 100
        self.opacity_reset_interval = 3000
        self.densify_from_iter = 500
        self.densify_until_iter = 15_000
        self.densify_grad_threshold = 0.0002
        self.random_background = False
        super().__init__(parser, "Optimization Parameters")


def get_combined_args(parser: ArgumentParser):
    """Merge command-line arguments with a saved ``cfg_args`` file.

    The command line (``sys.argv[1:]``) is parsed with ``parser``. If
    ``<model_path>/cfg_args`` exists, its content is evaluated as a
    ``Namespace(...)`` expression and used as the base; every command-line
    value that is not ``None`` overrides it.

    A missing ``model_path`` (``None``) or a missing ``cfg_args`` file is not
    an error: the result then contains only the command-line values.

    Returns:
        A ``Namespace`` holding the merged values.
    """
    cmdlne_string = sys.argv[1:]
    cfgfile_string = "Namespace()"
    args_cmdline = parser.parse_args(cmdlne_string)

    try:
        cfgfilepath = os.path.join(args_cmdline.model_path, "cfg_args")
        print("Looking for config file in", cfgfilepath)
        with open(cfgfilepath) as cfg_file:
            print("Config file found: {}".format(cfgfilepath))
            cfgfile_string = cfg_file.read()
    except TypeError:
        # model_path is None, so no path could be built.
        print("Config file not found at")
    except FileNotFoundError:
        # The path was built but no file is there; fall back to the
        # command line only instead of crashing.
        print("Config file not found at", cfgfilepath)

    # SECURITY NOTE(review): eval() executes whatever the cfg_args file
    # contains. Only point model_path at directories you trust.
    args_cfgfile = eval(cfgfile_string)

    merged_dict = vars(args_cfgfile).copy()
    for k, v in vars(args_cmdline).items():
        if v is not None:
            merged_dict[k] = v
    return Namespace(**merged_dict)


# ---------------------------------------------------------------------------
# NOTE: everything below is NOT part of arguments/__init__.py. It shared the
# same physical lines in this repository dump and is kept verbatim, as
# comments, so that no dump content is lost.
# ---------------------------------------------------------------------------
# ================================================
# FILE: bash_scripts/0_train.sh
# ================================================
# GPU=0 export CUDA_VISIBLE_DEVICES=${GPU} ls DATASET=tnt SCENE=Barn NAME=${SCENE} PROJECT=vcr_gaus TRIAL_NAME=vcr_gaus CFG=configs/${DATASET}/${SCENE}.yaml DIR=/your/log/path/${PROJECT}/${DATASET}/${NAME}/${TRIAL_NAME} python train.py \ --config=${CFG} \ --port=-1 \ --logdir=${DIR} \ --model.source_path=/your/data/path/${DATASET}/${SCENE}/ \ --model.resolution=1 \ --model.data_device=cpu \ --wandb \ --wandb_name ${PROJECT}
# ================================================
# FILE: bash_scripts/1_preprocess_tnt.sh
# ================================================
# echo "Compute intrinsics, undistort images and generate json files.
This may take a while" python process_data/convert_tnt_to_json.py \ --tnt_path /your/data/path \ --run_colmap \ --export_json ================================================ FILE: bash_scripts/2_extract_normal_dsine.sh ================================================ export CUDA_VISIBLE_DEVICES=0 DOMAIN_TYPE=indoor DATADIR=/your/data/path CODE_PATH=/your/dsine/code/path CKPT=/your/dsine/code/path/checkpoints/dsine.pt for SCENE in Barn Caterpillar Courthouse Ignatius Meetingroom Truck; do SCENE_PATH=${DATADIR}/${SCENE} # dsine python -W ignore process_data/extract_normal.py \ --dsine_path ${CODE_PATH} \ --ckpt ${CKPT} \ --img_path ${SCENE_PATH}/images \ --intrins_path ${SCENE_PATH}/ \ --output_path ${SCENE_PATH}/normals done ================================================ FILE: bash_scripts/3_extract_mask.sh ================================================ export CUDA_VISIBLE_DEVICES=0 DATADIR=/your/data/path GSAM_PATH=~/code/gsam CKPT_PATH=${GSAM_PATH} for SCENE in Barn Caterpillar Courthouse Ignatius Meetingroom Truck; do SCENE_PATH=${DATADIR}/${SCENE} # meething room scene_tye: indoor, others: outdoor if [ ${SCENE} = "Meetingroom" ]; then SCENE_TYPE="indoor" else SCENE_TYPE="outdoor" fi python -W ignore process_data/extract_mask.py \ --config ${GSAM_PATH}/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py \ --grounded_checkpoint ${CKPT_PATH}/groundingdino_swint_ogc.pth \ --sam_hq_checkpoint ${CKPT_PATH}/sam_hq_vit_h.pth \ --gsam_path ${GSAM_PATH} \ --use_sam_hq \ --input_image ${SCENE_PATH}/images/ \ --output_dir ${SCENE_PATH}/masks \ --box_threshold 0.5 \ --text_threshold 0.2 \ --scene ${SCENE} \ --scene_type ${SCENE_TYPE} \ --device "cuda" done ================================================ FILE: bash_scripts/4_extract_normal_geow.sh ================================================ export CUDA_VISIBLE_DEVICES=0 # DOMAIN_TYPE=outdoor # DOMAIN_TYPE=indoor DOMAIN_TYPE=object DATADIR=/your/data/path/DTU_mask CODE_PATH=/your/geowizard/path for SCENE 
in scan106 scan114 scan122 scan37 scan55 scan65 scan83 scan105 scan110 scan118 scan24 scan40 scan63 scan69 scan97; do SCENE_PATH=${DATADIR}/${SCENE} python process_data/extract_normal_geo.py \ --code_path ${CODE_PATH} \ --input_dir ${SCENE_PATH}/images/ \ --output_dir ${SCENE_PATH}/ \ --ensemble_size 3 \ --denoise_steps 10 \ --seed 0 \ --domain ${DOMAIN_TYPE} done ================================================ FILE: bash_scripts/convert.sh ================================================ SCENE=Truck DATA_ROOT=/your/data/path/${SCENE} python convert.py -s $DATA_ROOT # [--resize] #If not resizing, ImageMagick is not needed ================================================ FILE: bash_scripts/install.sh ================================================ env=vcr conda create -n $env -y python=3.10 conda activate $env pip install -e ".[train]" export CUDA_HOME=/usr/local/cuda-11.2 pip install -r requirements.txt ================================================ FILE: configs/360_v2/base.yaml ================================================ _parent_: configs/reconstruct.yaml model: eval: True llffhold: 8 split: False optim: mask_depth_thr: 1 densify_large: percent_dense: 5e-2 sample_cams: random: False num: 100 loss_weight: semantic: 0 l1_scale: 1 ================================================ FILE: configs/config.py ================================================ ''' ----------------------------------------------------------------------------- Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. NVIDIA CORPORATION and its licensors retain all intellectual property and proprietary rights in and to this software, related documentation and any modifications thereto. Any use, reproduction, disclosure or distribution of this software and related documentation without an express license agreement from NVIDIA CORPORATION is strictly prohibited. 
# -----------------------------------------------------------------------------
# '''
# NOTE: the two comment lines above preserve the tail of the license docstring
# that opens on the previous line of this repository dump.

import collections
import collections.abc
import functools
import os
import re

import yaml

from tools.distributed import master_only_print as print
from tools.termcolor import cyan, green, yellow

DEBUG = False
USE_JIT = False

# Float pattern that, unlike PyYAML's default, also accepts exponent notation
# without a decimal point (the second alternative, e.g. "1e-2").
_YAML_FLOAT_PATTERN = re.compile(u'''^(?:
     [-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)?
    |[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+)
    |\\.[0-9_]+(?:[eE][-+][0-9]+)?
    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*
    |[-+]?\\.(?:inf|Inf|INF)
    |\\.(?:nan|NaN|NAN))$''', re.X)

_float_resolver_registered = False


def _register_float_resolver():
    """Install the float resolver on ``yaml.SafeLoader`` exactly once.

    The resolver is registered on the shared ``yaml.SafeLoader`` class (as the
    original code did on every ``load_config`` call), so later
    ``yaml.safe_load`` calls are affected too. The guard avoids appending a
    duplicate resolver each time a config file is loaded.
    """
    global _float_resolver_registered
    if _float_resolver_registered:
        return
    yaml.SafeLoader.add_implicit_resolver(
        u'tag:yaml.org,2002:float',
        _YAML_FLOAT_PATTERN,
        list(u'-+0123456789.'))
    _float_resolver_registered = True


class AttrDict(dict):
    """Dict as attribute trick."""

    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        # Items and attributes share one storage.
        self.__dict__ = self
        for key, value in self.__dict__.items():
            if isinstance(value, dict):
                self.__dict__[key] = AttrDict(value)
            elif isinstance(value, (list, tuple)):
                if value and isinstance(value[0], dict):
                    self.__dict__[key] = [AttrDict(item) for item in value]
                else:
                    self.__dict__[key] = value

    def yaml(self):
        """Convert object to yaml dict and return."""
        yaml_dict = {}
        for key, value in self.__dict__.items():
            if isinstance(value, AttrDict):
                yaml_dict[key] = value.yaml()
            elif isinstance(value, list):
                if value and isinstance(value[0], AttrDict):
                    yaml_dict[key] = [item.yaml() for item in value]
                else:
                    yaml_dict[key] = value
            else:
                yaml_dict[key] = value
        return yaml_dict

    def __repr__(self):
        """Print all variables."""
        ret_str = []
        for key, value in self.__dict__.items():
            if isinstance(value, AttrDict):
                ret_str.append('{}:'.format(key))
                for line in value.__repr__().split('\n'):
                    ret_str.append(' ' + line)
            elif isinstance(value, list):
                if value and isinstance(value[0], AttrDict):
                    ret_str.append('{}:'.format(key))
                    for item in value:
                        # Treat as AttrDict above.
                        for line in item.__repr__().split('\n'):
                            ret_str.append(' ' + line)
                else:
                    ret_str.append('{}: {}'.format(key, value))
            else:
                ret_str.append('{}: {}'.format(key, value))
        return '\n'.join(ret_str)


class Config(AttrDict):
    r"""Configuration class. This should include every human specifiable
    hyperparameter values for your training."""

    def __init__(self, filename=None, verbose=False):
        """Load ``config_base.yaml`` and overlay ``filename`` on top of it.

        Args:
            filename: Path of the YAML config to load. With ``None`` (the
                default) only the base configuration is loaded.
            verbose: Print the resulting configuration when true.
        """
        super(Config, self).__init__()
        self.source_filename = filename
        # Load the base configuration file.
        base_filename = os.path.join(
            os.path.dirname(__file__), './config_base.yaml'
        )
        cfg_base = self.load_config(base_filename)
        recursive_update(self, cfg_base)

        # Update with given configurations. The default filename=None used to
        # crash inside os.path.exists(None); it now means "base config only".
        if filename is not None:
            cfg_dict = self.load_config(filename)
            recursive_update(self, cfg_dict)

        if verbose:
            print(' imaginaire config '.center(80, '-'))
            print(self.__repr__())
            print(''.center(80, '-'))

    def load_config(self, filename):
        """Read a YAML file and resolve its ``_parent_`` chain.

        If the file has a ``_parent_`` key, the parent file is loaded first
        (recursively) and this file's values are applied on top of it.

        Returns:
            An ``AttrDict`` with the merged configuration.

        Raises:
            AssertionError: ``filename`` does not exist.
            EnvironmentError: the file exists but cannot be read.
        """
        assert os.path.exists(filename), f'File {filename} not exist.'
        _register_float_resolver()
        try:
            with open(filename) as file:
                loaded = yaml.load(file, Loader=yaml.SafeLoader)
        except EnvironmentError:
            print(f'Please check the file with name of "{filename}"')
            # Re-raise: continuing would only fail later with an unrelated
            # UnboundLocalError and hide the real I/O problem.
            raise
        # An empty YAML document loads as None; treat it as an empty config.
        cfg_dict = AttrDict(loaded if loaded is not None else {})

        # Inherit configurations from parent
        parent_key = "_parent_"
        if parent_key in cfg_dict:
            parent_filename = cfg_dict.pop(parent_key)
            cfg_parent = self.load_config(parent_filename)
            recursive_update(cfg_parent, cfg_dict)
            cfg_dict = cfg_parent
        return cfg_dict

    def print_config(self, level=0):
        """Recursively print the configuration (with termcolor)."""
        for key, value in sorted(self.items()):
            if isinstance(value, (dict, Config)):
                print(" " * level + cyan("* ") + green(key) + ":")
                Config.print_config(value, level + 1)
            else:
                print(" " * level + cyan("* ") + green(key) + ":", yellow(value))

    def save_config(self, logdir):
        """Save the final configuration to a yaml file."""
        cfg_fname = f"{logdir}/config.yaml"
        with open(cfg_fname, "w") as file:
            yaml.safe_dump(self.yaml(), file, default_flow_style=False, indent=4)


def rsetattr(obj, attr, val):
    """Recursively find object and set value"""
    pre, _, post = attr.rpartition('.')
    return setattr(rgetattr(obj, pre) if pre else obj, post, val)


def rgetattr(obj, attr, *args):
    """Recursively find object and return value"""

    def _getattr(obj, attr):
        r"""Get attribute."""
        return getattr(obj, attr, *args)

    return functools.reduce(_getattr, [obj] + attr.split('.'))


def recursive_update(d, u):
    """Recursively update AttrDict d with AttrDict u"""
    for key, value in u.items():
        if isinstance(value, collections.abc.Mapping):
            d.__dict__[key] = recursive_update(d.get(key, AttrDict({})), value)
        elif isinstance(value, (list, tuple)):
            if value and isinstance(value[0], dict):
                d.__dict__[key] = [AttrDict(item) for item in value]
            else:
                d.__dict__[key] = value
        else:
            d.__dict__[key] = value
    return d


def recursive_update_strict(d, u, stack=None):
    """Recursively update AttrDict d with AttrDict u with strict matching

    Every key of ``u`` must already exist in ``d``.

    Args:
        d: Configuration to update in place.
        u: Values to apply.
        stack: Keys leading to ``d``; only used to build the dotted key name
            in the error message.

    Raises:
        KeyError: a key of ``u`` is missing from ``d``.
    """
    stack = [] if stack is None else stack
    for key, value in u.items():
        if key not in d:
            key_full = ".".join(stack + [key])
            raise KeyError(f"The input key '{key_full}' does not exist in the config files.")
        if isinstance(value, collections.abc.Mapping):
            d.__dict__[key] = recursive_update_strict(d.get(key, AttrDict({})), value, stack + [key])
        elif isinstance(value, (list, tuple)):
            if value and isinstance(value[0], dict):
                d.__dict__[key] = [AttrDict(item) for item in value]
            else:
                d.__dict__[key] = value
        else:
            d.__dict__[key] = value
    return d


def parse_cmdline_arguments(args):
    """
    Parse arguments from command line.
    Syntax: --key1.key2.key3=value --> value
            --key1.key2.key3=      --> None
            --key1.key2.key3       --> True
            --key1.key2.key3!      --> False

    Returns a nested dict keyed by the dot-separated parts of each key; values
    are parsed with ``yaml.safe_load``.
    """
    cfg_cmd = {}
    for arg in args:
        assert arg.startswith("--")
        if "=" not in arg[2:]:
            key_str, value = (arg[2:-1], "false") if arg[-1] == "!" else (arg[2:], "true")
        else:
            # Split on the first "=" only, so values that themselves contain
            # "=" no longer fail to unpack.
            key_str, value = arg[2:].split("=", 1)
        keys_sub = key_str.split(".")
        cfg_sub = cfg_cmd
        for k in keys_sub[:-1]:
            cfg_sub.setdefault(k, {})
            cfg_sub = cfg_sub[k]
        assert keys_sub[-1] not in cfg_sub, keys_sub[-1]
        cfg_sub[keys_sub[-1]] = yaml.safe_load(value)
    return cfg_cmd


# ---------------------------------------------------------------------------
# NOTE: everything below is NOT part of configs/config.py. It shared the same
# physical line in this repository dump and is kept verbatim, as comments, so
# that no dump content is lost.
# ---------------------------------------------------------------------------
# ================================================
# FILE: configs/config_base.yaml
# ================================================
# logdir: "/your/log/path/debug/" ip: 127.0.0.1 port: -1 detect_anomaly: False silent: 0 seed: 0 model: sh_degree: 3 source_path: "/your/data/path/tnt/Barn/" model_path: "/your/log/path/" images: "images" resolution: -1 white_background: False data_device: "cuda" eval: False llffhold: 1 init_ply: "sparse/points3D.ply" max_init_points: split: False sphere: False load_depth: False load_normal: False load_mask: False normal_folder: 'normals' depth_folder: 'depths' use_decoupled_appearance: False ch_sem_feat: 0 num_cls: 0 max_mem: 22 load_mask: False use_decoupled_appearance: False use_decoupled_dnormal: False ratio: 0 mesh: voxel_size: 3e-3 depth_type: 'traditional' optim: iterations: 30000 position_lr_init: 0.00016 position_lr_final: 0.0000016 position_lr_delay_mult: 0.01 position_lr_max_steps: 30000 feature_lr: 0.0025 sdf_lr: 0.001 weight_decay: 1e-2 opacity_lr: 0.05 scaling_lr: 0.005 rotation_lr: 0.001 appearance_embeddings_lr: 0.001 appearance_network_lr: 0.001 cls_lr: 5e-4 percent_dense: 0.01 densification_interval: 100 opacity_reset_interval: 3000 densify_from_iter: 500 densify_until_iter: 15000 densify_grad_threshold: 0.0005 random_background: False rand_pts: 20000 edge_thr: 0 mask_depth_thr: 0 loss_weight: l1: 0.8 ssim: 0.2 distortion: 0.
semantic: 0 mono_depth: 0 mono_normal: 0 depth_normal: 0 prune: iterations: [] percent: 0.5 decay: 0.6 v_pow: 0.1 pipline: convert_SHs_python: False compute_cov3D_python: False debug: False data: name: dummy train: test_iterations: [7000, 30000] save_iterations: [7000, 30000] checkpoint_iterations: [30000] save_splat: False start_checkpoint: debug_from: -1 ================================================ FILE: configs/dtu/base.yaml ================================================ _parent_: configs/reconstruct.yaml model: use_decoupled_appearance: False use_decoupled_dnormal: False normal_folder: 'normal_npz_indoor' eval: False optim: exp_t: 0.01 mask_depth_thr: 0 loss_weight: l1_scale: 0.5 consistent_normal_from_iter: 15000 close_depth_from_iter: 15000 densify_large: percent_dense: 1e-2 sample_cams: random: False num: 30 loss_weight: semantic: 0 depth_normal: 0 mono_normal: 0.01 consistent_normal: 0.05 distortion: 1000 depth_var: 0 random_background: False ================================================ FILE: configs/dtu/dtu_scan24.yaml ================================================ _parent_: configs/dtu/base.yaml ================================================ FILE: configs/reconstruct.yaml ================================================ _parent_: configs/config_base.yaml model: load_mask: False use_decoupled_appearance: False use_decoupled_dnormal: False ch_sem_feat: 2 num_cls: 2 depth_type: 'intersection' optim: mask_depth_thr: 0.8 edge_thr: 0 exp_t: 0.01 cos_thr: -1 close_depth_from_iter: 0 normal_from_iter: 0 dnormal_from_iter: 0 consistent_normal_from_iter: 0 curv_from_iter: 0 loss_weight: l1: 0.8 ssim: 0.2 l1_scale: 1 entropy: 0 depth_var: 0. 
mono_depth: 0 mono_normal: 0.01 depth_normal: 0.01 consistent_normal: 0 prune: iterations: [15000, 25000] percent: 0.5 decay: 0.6 v_pow: 0.1 densify_large: percent_dense: 2e-3 interval: 1 sample_cams: random: True num: 200 up: True around: True look_mode: 'target' random_background: True train: checkpoint_iterations: [] save_mesh: False save_iterations: [30000] ================================================ FILE: configs/scannetpp/base.yaml ================================================ _parent_: configs/reconstruct.yaml model: split: True eval: True use_decoupled_appearance: False use_decoupled_dnormal: False mesh: voxel_size: 1.5e-2 optim: mask_depth_thr: 0 curv_from_iter: 15000 densify_large: percent_dense: 1e-2 sample_cams: random: False loss_weight: semantic: 0 curv: 0.05 ================================================ FILE: configs/tnt/Barn.yaml ================================================ _parent_: configs/tnt/base.yaml ================================================ FILE: configs/tnt/Caterpillar.yaml ================================================ _parent_: configs/tnt/base.yaml ================================================ FILE: configs/tnt/Courthouse.yaml ================================================ _parent_: configs/tnt/base.yaml ================================================ FILE: configs/tnt/Ignatius.yaml ================================================ _parent_: configs/tnt/base.yaml ================================================ FILE: configs/tnt/Meetingroom.yaml ================================================ _parent_: configs/tnt/base.yaml optim: exp_t: 1e-3 mask_depth_thr: 0 densify_large: percent_dense: 5e-3 sample_cams: random: False loss_weight: semantic: 0 model: num_cls: 3 use_decoupled_appearance: False ================================================ FILE: configs/tnt/Truck.yaml ================================================ _parent_: configs/tnt/base.yaml ================================================ FILE: 
def align_gt_with_cam(pts, trans):
    """Apply the inverse of ``trans`` to a set of row-vector points.

    Args:
        pts: array of points, one per row (last dimension 3).
        trans: square transform matrix; only its inverse's upper-left 3x3
            block and the first three entries of its last column are used
            (presumably a 4x4 homogeneous matrix - confirm with callers).

    Returns:
        The points mapped through ``inv(trans)``, same shape as ``pts``.
    """
    inverse = np.linalg.inv(trans)
    linear_part = inverse[:3, :3]
    offset = inverse[:3, -1]
    # Points are rows, so multiply by the transposed linear part.
    return pts @ linear_part.transpose(-1, -2) + offset
gt_trans = np.loadtxt(args.align_path) mesh_rec = trimesh.load(args.ply_path, process=False) mesh_gt = trimesh.load(args.gt_path, process=False) mesh_gt.vertices = align_gt_with_cam(mesh_gt.vertices, gt_trans) to_align, _ = trimesh.bounds.oriented_bounds(mesh_gt) mesh_gt.vertices = (to_align[:3, :3] @ mesh_gt.vertices.T + to_align[:3, 3:]).T mesh_rec.vertices = (to_align[:3, :3] @ mesh_rec.vertices.T + to_align[:3, 3:]).T min_points = mesh_gt.vertices.min(axis=0) max_points = mesh_gt.vertices.max(axis=0) mask_min = (mesh_rec.vertices - min_points[None]) > 0 mask_max = (mesh_rec.vertices - max_points[None]) < 0 mask = np.concatenate((mask_min, mask_max), axis=1).all(axis=1) face_mask = mask[mesh_rec.faces].all(axis=1) mesh_rec.update_vertices(mask) mesh_rec.update_faces(face_mask) mesh_rec.vertices = (to_align[:3, :3].T @ mesh_rec.vertices.T - to_align[:3, :3].T @ to_align[:3, 3:]).T mesh_gt.vertices = (to_align[:3, :3].T @ mesh_gt.vertices.T - to_align[:3, :3].T @ to_align[:3, 3:]).T # save mesh_rec and mesh_rec in args.out_path mesh_rec.export(args.out_path) # downsample mesh_gt idx = np.random.choice(np.arange(len(mesh_gt.vertices)), 5000000) mesh_gt.vertices = mesh_gt.vertices[idx] mesh_gt.colors = mesh_gt.colors[idx] mesh_gt.export(args.gt_path.replace('.ply', '_trans.ply')) return if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( "--gt_path", type=str, default='/your/path//Barn_GT.ply', help="path to a dataset/scene directory containing X.json, X.ply, ...", ) parser.add_argument( "--align_path", type=str, default='/your/path//Barn_trans.txt', help="path to a dataset/scene directory containing X.json, X.ply, ...", ) parser.add_argument( "--ply_path", type=str, default='/your/path//Barn_lowres.ply', help="path to reconstruction ply file", ) parser.add_argument( "--scene", type=str, default='Barn', help="path to reconstruction ply file", ) parser.add_argument( "--out_path", type=str, default='/your/path//Barn_lowres_crop.ply', 
def sample_single_tri(input_):
    """Sample points inside one triangle on a regular barycentric-style grid.

    Args:
        input_: tuple ``(n1, n2, v1, v2, tri_vert)`` where ``n1``/``n2`` are
            the subdivision counts along the two edges, ``v1``/``v2`` are the
            edge vectors and ``tri_vert`` is the triangle's base vertex.
            Packed into one argument so the function can be used with
            ``Pool.map``.

    Returns:
        Array of sampled points ``v1 * a + v2 * b + tri_vert`` for every grid
        cell centre ``(a, b)`` with ``a + b < 1``; may be empty.
    """
    n1, n2, v1, v2, tri_vert = input_
    # np.mgrid yields an integer grid when the counts are ints, and the
    # in-place float updates below would then fail with a casting error.
    # Casting up front keeps float inputs unchanged and makes ints work too.
    c = np.mgrid[:n1 + 1, :n2 + 1].astype(np.float64)
    c += 0.5  # move to cell centres
    # max(..., 1e-7) avoids dividing by zero when an edge gets no subdivision;
    # the resulting huge coordinates are rejected by the `< 1` filter below.
    c[0] /= max(n1, 1e-7)
    c[1] /= max(n2, 1e-7)
    c = np.transpose(c, (1, 2, 0))
    k = c[c.sum(axis=-1) < 1]  # (m, 2) coefficients that fall inside the triangle
    q = v1 * k[:, :1] + v2 * k[:, 1:] + tri_vert
    return q
tri_vert[:,1] - tri_vert[:,0] v2 = tri_vert[:,2] - tri_vert[:,0] l1 = np.linalg.norm(v1, axis=-1, keepdims=True) l2 = np.linalg.norm(v2, axis=-1, keepdims=True) area2 = np.linalg.norm(np.cross(v1, v2), axis=-1, keepdims=True) non_zero_area = (area2 > 0)[:,0] l1, l2, area2, v1, v2, tri_vert = [ arr[non_zero_area] for arr in [l1, l2, area2, v1, v2, tri_vert] ] thr = thresh * np.sqrt(l1 * l2 / area2) n1 = np.floor(l1 / thr) n2 = np.floor(l2 / thr) with mp.Pool() as mp_pool: new_pts = mp_pool.map(sample_single_tri, ((n1[i,0], n2[i,0], v1[i:i+1], v2[i:i+1], tri_vert[i:i+1,0]) for i in range(len(n1))), chunksize=1024) new_pts = np.concatenate(new_pts, axis=0) data_pcd = np.concatenate([vertices, new_pts], axis=0) elif args.mode == 'pcd': pbar = tqdm(total=8) pbar.set_description('read data pcd') data_pcd_o3d = o3d.io.read_point_cloud(args.data) data_pcd = np.asarray(data_pcd_o3d.points) pbar.update(1) pbar.set_description('random shuffle pcd index') shuffle_rng = np.random.default_rng() shuffle_rng.shuffle(data_pcd, axis=0) pbar.update(1) pbar.set_description('downsample pcd') nn_engine = skln.NearestNeighbors(n_neighbors=1, radius=thresh, algorithm='kd_tree', n_jobs=-1) nn_engine.fit(data_pcd) rnn_idxs = nn_engine.radius_neighbors(data_pcd, radius=thresh, return_distance=False) mask = np.ones(data_pcd.shape[0], dtype=np.bool_) for curr, idxs in enumerate(rnn_idxs): if mask[curr]: mask[idxs] = 0 mask[curr] = 1 data_down = data_pcd[mask] pbar.update(1) pbar.set_description('masking data pcd') obs_mask_file = loadmat(f'{args.dataset_dir}/ObsMask/ObsMask{args.scan}_10.mat') ObsMask, BB, Res = [obs_mask_file[attr] for attr in ['ObsMask', 'BB', 'Res']] BB = BB.astype(np.float32) patch = args.patch_size inbound = ((data_down >= BB[:1]-patch) & (data_down < BB[1:]+patch*2)).sum(axis=-1) ==3 data_in = data_down[inbound] data_grid = np.around((data_in - BB[:1]) / Res).astype(np.int32) grid_inbound = ((data_grid >= 0) & (data_grid < np.expand_dims(ObsMask.shape, 0))).sum(axis=-1) 
==3 data_grid_in = data_grid[grid_inbound] in_obs = ObsMask[data_grid_in[:,0], data_grid_in[:,1], data_grid_in[:,2]].astype(np.bool_) data_in_obs = data_in[grid_inbound][in_obs] pbar.update(1) pbar.set_description('read STL pcd') stl_pcd = o3d.io.read_point_cloud(f'{args.dataset_dir}/Points/stl/stl{args.scan:03}_total.ply') stl = np.asarray(stl_pcd.points) pbar.update(1) pbar.set_description('compute data2stl') nn_engine.fit(stl) dist_d2s, idx_d2s = nn_engine.kneighbors(data_in_obs, n_neighbors=1, return_distance=True) max_dist = args.max_dist mean_d2s = dist_d2s[dist_d2s < max_dist].mean() pbar.update(1) pbar.set_description('compute stl2data') ground_plane = loadmat(f'{args.dataset_dir}/ObsMask/Plane{args.scan}.mat')['P'] stl_hom = np.concatenate([stl, np.ones_like(stl[:,:1])], -1) above = (ground_plane.reshape((1,4)) * stl_hom).sum(-1) > 0 stl_above = stl[above] nn_engine.fit(data_in) dist_s2d, idx_s2d = nn_engine.kneighbors(stl_above, n_neighbors=1, return_distance=True) mean_s2d = dist_s2d[dist_s2d < max_dist].mean() pbar.update(1) pbar.set_description('visualize error') vis_dist = args.visualize_threshold R = np.array([[1,0,0]], dtype=np.float64) G = np.array([[0,1,0]], dtype=np.float64) B = np.array([[0,0,1]], dtype=np.float64) W = np.array([[1,1,1]], dtype=np.float64) data_color = np.tile(B, (data_down.shape[0], 1)) data_alpha = dist_d2s.clip(max=vis_dist) / vis_dist data_color[ np.where(inbound)[0][grid_inbound][in_obs] ] = R * data_alpha + W * (1-data_alpha) data_color[ np.where(inbound)[0][grid_inbound][in_obs][dist_d2s[:,0] >= max_dist] ] = G write_vis_pcd(f'{args.vis_out_dir}/vis_{args.scan:03}_d2s.ply', data_down, data_color) stl_color = np.tile(B, (stl.shape[0], 1)) stl_alpha = dist_s2d.clip(max=vis_dist) / vis_dist stl_color[ np.where(above)[0] ] = R * stl_alpha + W * (1-stl_alpha) stl_color[ np.where(above)[0][dist_s2d[:,0] >= max_dist] ] = G write_vis_pcd(f'{args.vis_out_dir}/vis_{args.scan:03}_s2d.ply', stl, stl_color) pbar.update(1) 
def cull_scan(scan, mesh_path, result_mesh_file, instance_dir):
    """Cull a mesh against per-view masks and export its largest component.

    Every mesh vertex is projected into each camera found in
    ``{instance_dir}/cameras.npz``. A vertex is kept only if, in every view,
    it either lands on the (dilated) mask or projects outside the image.
    The surviving mesh is rescaled with the first ``scale_mat`` and the
    connected component with the largest area is written out.

    Args:
        scan: not used inside this function.
        mesh_path: path of the mesh to load with trimesh.
        result_mesh_file: path the culled mesh is exported to.
        instance_dir: directory containing ``images/``, ``mask/`` and
            ``cameras.npz``.
    """

    # load poses; the images are only globbed to learn how many views exist
    image_dir = '{0}/images'.format(instance_dir)
    image_paths = sorted(glob.glob(os.path.join(image_dir, "*.png")))
    n_images = len(image_paths)
    cam_file = '{0}/cameras.npz'.format(instance_dir)
    camera_dict = np.load(cam_file)
    scale_mats = [camera_dict['scale_mat_%d' % idx].astype(np.float32) for idx in range(n_images)]
    world_mats = [camera_dict['world_mat_%d' % idx].astype(np.float32) for idx in range(n_images)]

    intrinsics_all = []
    pose_all = []
    for scale_mat, world_mat in zip(scale_mats, world_mats):
        # compose world and scale matrices, keep the 3x4 projection part
        P = world_mat @ scale_mat
        P = P[:3, :4]
        intrinsics, pose = rend_util.load_K_Rt_from_P(None, P)
        intrinsics_all.append(torch.from_numpy(intrinsics).float())
        pose_all.append(torch.from_numpy(pose).float())

    # load mask
    mask_dir = '{0}/mask'.format(instance_dir)
    mask_paths = sorted(glob.glob(os.path.join(mask_dir, "*.png")))
    masks = []
    for p in mask_paths:
        mask = cv2.imread(p)
        masks.append(mask)

    # hard-coded image shape
    W, H = 1600, 1200

    # load mesh
    mesh = trimesh.load(mesh_path)

    # take the mesh vertices that will be projected into every view
    vertices = mesh.vertices

    # project and filter: build homogeneous coordinates, shape (4, N), on the GPU
    vertices = torch.from_numpy(vertices).cuda()
    vertices = torch.cat((vertices, torch.ones_like(vertices[:, :1])), dim=-1)
    vertices = vertices.permute(1, 0)
    vertices = vertices.float()

    sampled_masks = []
    for i in tqdm(range(n_images), desc="Culling mesh given masks"):
        pose = pose_all[i]
        w2c = torch.inverse(pose).cuda()
        intrinsic = intrinsics_all[i].cuda()

        with torch.no_grad():
            # transform and project
            cam_points = intrinsic @ w2c @ vertices
            # perspective divide; the epsilon guards against a zero depth
            pix_coords = cam_points[:2, :] / (cam_points[2, :].unsqueeze(0) + 1e-6)
            pix_coords = pix_coords.permute(1, 0)
            # normalise pixel coordinates to [-1, 1] as grid_sample expects
            pix_coords[..., 0] /= W - 1
            pix_coords[..., 1] /= H - 1
            pix_coords = (pix_coords - 0.5) * 2
            # 1.0 where the vertex projects strictly inside the image, else 0.0
            valid = ((pix_coords > -1. ) & (pix_coords < 1.)).all(dim=-1).float()

            # dilate mask similar to unisurf
            maski = masks[i][:, :, 0].astype(np.float32) / 256.
            maski = torch.from_numpy(binary_dilation(maski, disk(24))).float()[None, None].cuda()

            # nearest-neighbour lookup of the dilated mask at each projected vertex
            sampled_mask = F.grid_sample(maski, pix_coords[None, None], mode='nearest', padding_mode='zeros', align_corners=True)[0, -1, 0]

            # vertices that fall outside the image get +1 so this view never culls them
            sampled_mask = sampled_mask + (1. - valid)
            sampled_masks.append(sampled_mask)

    sampled_masks = torch.stack(sampled_masks, -1)
    # filter: keep a vertex only if every view voted to keep it
    mask = (sampled_masks > 0.).all(dim=-1).cpu().numpy()
    # a face survives only when all of its vertices survive
    face_mask = mask[mesh.faces].all(axis=1)

    mesh.update_vertices(mask)
    mesh.update_faces(face_mask)

    # transform vertices to world using the first view's scale matrix
    scale_mat = scale_mats[0]
    mesh.vertices = mesh.vertices * scale_mat[0, 0] + scale_mat[:3, 3][None]

    # Taking the biggest connected component
    print("Taking the biggest connected component")
    components = mesh.split(only_watertight=False)
    areas = np.array([c.area for c in components], dtype=np.float32)
    mesh = components[areas.argmax()]

    mesh.export(result_mesh_file)
    del mesh
def get_psnr(img1, img2, normalize_rgb=False):
    """Compute the PSNR (in dB) between two image tensors.

    Args:
        img1, img2: tensors of identical shape.
        normalize_rgb: when True the inputs are taken to be in [-1, 1] and
            are mapped to [0, 1] before the error is computed.

    Returns:
        A 1-element tensor holding ``-10 * log10(mse)``, located on the same
        device as the inputs.
    """
    if normalize_rgb:  # [-1,1] --> [0,1]
        img1 = (img1 + 1.) / 2.
        img2 = (img2 + 1.) / 2.

    mse = torch.mean((img1 - img2) ** 2)
    # Build log(10) on the inputs' device instead of unconditionally calling
    # .cuda(), which crashed on CPU-only machines. It stays a 1-element
    # float32 tensor so the result's shape and dtype are unchanged.
    ln10 = torch.log(torch.tensor([10.], device=mse.device))
    psnr = -10. * torch.log(mse) / ln10

    return psnr
def load_K_Rt_from_P(filename, P=None):
    """Decompose a projection matrix into intrinsics and a camera pose.

    Args:
        filename: text file holding the projection matrix; only read when
            ``P`` is None.
        P: optional projection matrix passed directly to
            ``cv2.decomposeProjectionMatrix``.

    Returns:
        ``(intrinsics, pose)``: a 4x4 matrix whose upper-left 3x3 block is K
        normalised so that ``K[2, 2] == 1``, and a 4x4 float32 matrix holding
        the transposed rotation and the de-homogenised translation returned
        by the decomposition.
    """
    if P is None:
        # Use a context manager so the file handle is closed deterministically
        # (the previous bare open().read() leaked it).
        with open(filename) as f:
            lines = f.read().splitlines()
        if len(lines) == 4:
            # a 4-line file carries one leading line that is not matrix data
            lines = lines[1:]
        lines = [[x[0], x[1], x[2], x[3]] for x in (x.split(" ") for x in lines)]
        P = np.asarray(lines).astype(np.float32).squeeze()

    out = cv2.decomposeProjectionMatrix(P)
    K = out[0]
    R = out[1]
    t = out[2]

    K = K / K[2, 2]
    intrinsics = np.eye(4)
    intrinsics[:3, :3] = K

    pose = np.eye(4, dtype=np.float32)
    pose[:3, :3] = R.transpose()
    # t is homogeneous (4x1): divide by the last component
    pose[:3, 3] = (t[:3] / t[3])[:, 0]

    return intrinsics, pose
def nn_correspondance(verts1, verts2):
    """Return, for each point in ``verts2``, the distance to its nearest
    neighbour in ``verts1``.

    Args:
        verts1: reference points used to build the KD-tree.
        verts2: query points.

    Returns:
        1-D array of distances with one entry per query point. When either
        input is empty an empty array is returned, so the return type is the
        same on every path (the previous code returned a tuple of two lists
        in that case, which callers comparing against a threshold cannot use).
    """
    if len(verts1) == 0 or len(verts2) == 0:
        return np.empty(0, dtype=np.float64)

    kdtree = KDTree(verts1)
    distances, _ = kdtree.query(verts2)
    return distances.reshape(-1)
if __name__ == '__main__':
    # Command-line entry point: parse the paths and run the evaluation.
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--gt_path",
        type=str,
        default='/your/path//Barn_GT.ply',
        # loaded directly with trimesh.load in main(), so this is a file path
        help="path to the ground-truth ply file",
    )
    parser.add_argument(
        "--ply_path",
        type=str,
        default='/your/path//Barn_lowres.ply',
        help="path to reconstruction ply file",
    )
    parser.add_argument(
        "--scene",
        type=str,
        default='Barn',
        # only used to label the F-score line printed by main()
        help="scene name used when printing the F-score",
    )
    args = parser.parse_args()

    main(args)
# # For inquiries contact george.drettakis@inria.fr # import os from argparse import ArgumentParser mipnerf360_outdoor_scenes = ["bicycle", "flowers", "garden", "stump", "treehill"] mipnerf360_indoor_scenes = ["room", "counter", "kitchen", "bonsai"] tanks_and_temples_scenes = ["truck", "train"] deep_blending_scenes = ["drjohnson", "playroom"] parser = ArgumentParser(description="Full evaluation script parameters") parser.add_argument("--skip_training", action="store_true") parser.add_argument("--skip_rendering", action="store_true") parser.add_argument("--skip_metrics", action="store_true") parser.add_argument("--output_path", default="./eval") args, _ = parser.parse_known_args() all_scenes = [] all_scenes.extend(mipnerf360_outdoor_scenes) all_scenes.extend(mipnerf360_indoor_scenes) all_scenes.extend(tanks_and_temples_scenes) all_scenes.extend(deep_blending_scenes) if not args.skip_training or not args.skip_rendering: parser.add_argument('--mipnerf360', "-m360", required=True, type=str) parser.add_argument("--tanksandtemples", "-tat", required=True, type=str) parser.add_argument("--deepblending", "-db", required=True, type=str) args = parser.parse_args() if not args.skip_training: common_args = " --quiet --eval --test_iterations -1 " for scene in mipnerf360_outdoor_scenes: source = args.mipnerf360 + "/" + scene os.system("python train.py -s " + source + " -i images_4 -m " + args.output_path + "/" + scene + common_args) for scene in mipnerf360_indoor_scenes: source = args.mipnerf360 + "/" + scene os.system("python train.py -s " + source + " -i images_2 -m " + args.output_path + "/" + scene + common_args) for scene in tanks_and_temples_scenes: source = args.tanksandtemples + "/" + scene os.system("python train.py -s " + source + " -m " + args.output_path + "/" + scene + common_args) for scene in deep_blending_scenes: source = args.deepblending + "/" + scene os.system("python train.py -s " + source + " -m " + args.output_path + "/" + scene + common_args) if not 
# Cache of already-built LPIPS criteria, keyed by (net_type, version, device).
_LPIPS_CRITERIA = {}


def lpips(x: torch.Tensor,
          y: torch.Tensor,
          net_type: str = 'alex',
          version: str = '0.1'):
    r"""Function that measures
    Learned Perceptual Image Patch Similarity (LPIPS).

    The criterion (backbone plus linear layers) is built once per
    ``(net_type, version, device)`` and reused on later calls, instead of
    re-instantiating the network and reloading its weights for every image.

    Arguments:
        x, y (torch.Tensor): the input tensors to compare.
        net_type (str): the network type to compare the features:
                        'alex' | 'squeeze' | 'vgg'. Default: 'alex'.
        version (str): the version of LPIPS. Default: 0.1.
    """
    device = x.device
    cache_key = (net_type, version, device)
    criterion = _LPIPS_CRITERIA.get(cache_key)
    if criterion is None:
        criterion = LPIPS(net_type, version).to(device)
        _LPIPS_CRITERIA[cache_key] = criterion
    return criterion(x, y)
def get_network(net_type: str):
    """Instantiate the feature backbone selected by ``net_type``.

    Accepts 'alex', 'squeeze' or 'vgg'; any other value raises
    ``NotImplementedError``.
    """
    if net_type == 'alex':
        return AlexNet()
    if net_type == 'squeeze':
        return SqueezeNet()
    if net_type == 'vgg':
        return VGG16()
    raise NotImplementedError('choose net_type from [alex, squeeze, vgg].')
class SqueezeNet(BaseNet):
    """SqueezeNet 1.1 feature extractor used as an LPIPS backbone."""

    def __init__(self):
        super(SqueezeNet, self).__init__()

        # Request the ImageNet weights through the explicit `weights=` enum,
        # matching VGG16 in this file; the positional boolean `pretrained`
        # form is deprecated in torchvision.
        self.layers = models.squeezenet1_1(
            weights=models.SqueezeNet1_1_Weights.IMAGENET1K_V1).features
        # 1-based indices of the layers whose activations are collected
        self.target_layers = [2, 5, 8, 10, 11, 12, 13]
        # channel count of each collected activation
        self.n_channels_list = [64, 128, 256, 384, 384, 512, 512]

        self.set_requires_grad(False)


class AlexNet(BaseNet):
    """AlexNet feature extractor used as an LPIPS backbone."""

    def __init__(self):
        super(AlexNet, self).__init__()

        # Same explicit `weights=` form as VGG16 (see SqueezeNet above).
        self.layers = models.alexnet(
            weights=models.AlexNet_Weights.IMAGENET1K_V1).features
        # 1-based indices of the layers whose activations are collected
        self.target_layers = [2, 5, 8, 10, 12]
        # channel count of each collected activation
        self.n_channels_list = [64, 192, 384, 256, 256]

        self.set_requires_grad(False)
def evaluate(model_paths):
    """Compute SSIM / PSNR / LPIPS for every rendered test set of each scene.

    For each ``scene_dir`` in ``model_paths`` this walks ``scene_dir/test/*``,
    loads the ``renders`` and ``gt`` images of every method directory,
    evaluates the three metrics per image and writes

    * ``scene_dir/results.json``  - mean of each metric per method
    * ``scene_dir/per_view.json`` - per-image values per method

    A failure in one scene is reported and the remaining scenes are still
    processed; KeyboardInterrupt / SystemExit are no longer swallowed.
    """
    full_dict = {}
    per_view_dict = {}
    print("")

    for scene_dir in model_paths:
        try:
            print("Scene:", scene_dir)
            full_dict[scene_dir] = {}
            per_view_dict[scene_dir] = {}

            test_dir = Path(scene_dir) / "test"

            for method in os.listdir(test_dir):
                print("Method:", method)

                method_dir = test_dir / method
                gt_dir = method_dir / "gt"
                renders_dir = method_dir / "renders"
                renders, gts, image_names = readImages(renders_dir, gt_dir)

                ssims = []
                psnrs = []
                lpipss = []

                for render_img, gt_img in tqdm(zip(renders, gts), total=len(renders),
                                               desc="Metric evaluation progress"):
                    ssims.append(ssim(render_img, gt_img))
                    psnrs.append(psnr(render_img, gt_img))
                    lpipss.append(lpips(render_img, gt_img, net_type='vgg'))

                full_dict[scene_dir][method] = {
                    "SSIM": torch.tensor(ssims).mean().item(),
                    "PSNR": torch.tensor(psnrs).mean().item(),
                    "LPIPS": torch.tensor(lpipss).mean().item(),
                }
                per_view_dict[scene_dir][method] = {
                    "SSIM": {name: value for value, name in zip(torch.tensor(ssims).tolist(), image_names)},
                    "PSNR": {name: value for value, name in zip(torch.tensor(psnrs).tolist(), image_names)},
                    "LPIPS": {name: value for value, name in zip(torch.tensor(lpipss).tolist(), image_names)},
                }

            with open(scene_dir + "/results.json", 'w') as fp:
                json.dump(full_dict[scene_dir], fp, indent=True)
            with open(scene_dir + "/per_view.json", 'w') as fp:
                json.dump(per_view_dict[scene_dir], fp, indent=True)
        except Exception as err:
            # Best-effort per scene, but say why it failed and let
            # KeyboardInterrupt / SystemExit propagate.
            print("Unable to compute metrics for model", scene_dir)
            print("Reason:", repr(err))
def render_set(model_path, name, iteration, views, gaussians, cfg, background, save_cosine=False):
    """Render every view and save the rendering next to its ground-truth image.

    Images are written as zero-padded ``00000.png``, ``00001.png``, ... into
    ``<model_path>/<name>/ours_<iteration>/renders`` and
    ``<model_path>/<name>/ours_<iteration>/gt`` (both created if missing).

    Args:
        model_path: root output directory of the trained model.
        name: sub-folder name for this camera set (callers in this file pass
            ``"train"`` or ``"test"``).
        iteration: iteration number used in the ``ours_<iteration>`` folder.
        views: iterable of camera objects; each must expose
            ``original_image``.
        gaussians: model forwarded to ``render``.
        cfg: configuration forwarded to ``render``.
        background: background colour tensor forwarded to ``render``.
        save_cosine: when true, additionally save ``<idx>_cosine.png`` with
            the output of ``cos_weight`` between ``view.normal`` and the
            rendered normal map. Off by default (this branch was previously
            disabled). NOTE(review): needs ``view.normal`` and
            ``outs["normal"]`` to be available - confirm normals are loaded
            before enabling.
    """
    out_root = os.path.join(model_path, name, "ours_{}".format(iteration))
    render_path = os.path.join(out_root, "renders")
    gts_path = os.path.join(out_root, "gt")

    os.makedirs(render_path, exist_ok=True)
    os.makedirs(gts_path, exist_ok=True)

    for idx, view in enumerate(tqdm(views, desc="Rendering progress")):
        outs = render(view, gaussians, cfg, background)
        rendering = outs["render"]
        gt = view.original_image[0:3, :, :]  # keep the first three channels only
        stem = '{0:05d}'.format(idx)
        torchvision.utils.save_image(rendering, os.path.join(render_path, stem + ".png"))
        torchvision.utils.save_image(gt, os.path.join(gts_path, stem + ".png"))

        # Per-view alpha maps are intentionally not accumulated any more:
        # they were never consumed and only grew memory with the view count.

        if save_cosine:
            normal_map = outs["normal"].detach().clone()
            normal_gt = view.normal.cuda()
            cos = cos_weight(normal_gt, normal_map, cfg.optim.exp_t, cfg.optim.cos_thr)
            torchvision.utils.save_image(cos, os.path.join(render_path, '{0:05d}_cosine'.format(idx) + ".png"))
bg_color = [1,1,1] if cfg.model.white_background else [0, 0, 0] background = torch.tensor(bg_color, dtype=torch.float32, device="cuda") if not skip_train: render_set(cfg.model.model_path, "train", scene.loaded_iter, scene.getTrainCameras(), gaussians, cfg, background) if not skip_test: render_set(cfg.model.model_path, "test", scene.loaded_iter, scene.getTestCameras(), gaussians, cfg, background) if __name__ == "__main__": # Set up command line argument parser parser = ArgumentParser() parser.add_argument('--cfg_path', type=str, default='configs/config_base.yaml') parser.add_argument("--iteration", default=-1, type=int) parser.add_argument("--skip_train", action="store_true") parser.add_argument("--skip_test", action="store_true") args = parser.parse_args() cfg = Config(args.cfg_path) cfg.model.data_device = 'cuda' cfg.model.load_normal = False cfg.model.load_mask = False set_random_seed(cfg.seed) # Initialize system state (RNG) # safe_state(args.quiet) render_sets(cfg, args.iteration, args.skip_train, args.skip_test) ================================================ FILE: evaluation/tnt_eval/README.md ================================================ # Python Toolbox for Evaluation This Python script evaluates **training** dataset of TanksAndTemples benchmark. The script requires ``Open3D`` and a few Python packages such as ``matplotlib``, ``json``, and ``numpy``. ## How to use: **Step 0**. Reconstruct 3D models and recover camera poses from the training dataset. The raw videos of the training dataset can be found from: https://tanksandtemples.org/download/ **Step 1**. Download evaluation data (ground truth geometry + reference reconstruction) using [this link](https://drive.google.com/open?id=1UoKPiUUsKa0AVHFOrnMRhc5hFngjkE-t). In this example, we regard ``TanksAndTemples/evaluation/data/`` as a dataset folder. **Step 2**. Install Open3D. Follow instructions in http://open3d.org/docs/getting_started.html **Step 3**. Run the evaluation script and grab some coffee. 
``` python run.py --dataset-dir path/to/TanksAndTemples/evaluation/data/Ignatius --traj-path path/to/TanksAndTemples/evaluation/data/Ignatius/Ignatius_COLMAP_SfM.log --ply-path path/to/TanksAndTemples/evaluation/data/Ignatius/Ignatius_COLMAP.ply ``` Output (evaluation of Ignatius): ``` =========================== Evaluating Ignatius =========================== path/to/TanksAndTemples/evaluation/data/Ignatius/Ignatius_COLMAP.ply Reading PLY: [========================================] 100% Read PointCloud: 6929586 vertices. path/to/TanksAndTemples/evaluation/data/Ignatius/Ignatius.ply Reading PLY: [========================================] 100% : ICP Iteration #0: Fitness 0.9980, RMSE 0.0044 ICP Iteration #1: Fitness 0.9980, RMSE 0.0043 ICP Iteration #2: Fitness 0.9980, RMSE 0.0043 ICP Iteration #3: Fitness 0.9980, RMSE 0.0043 ICP Iteration #4: Fitness 0.9980, RMSE 0.0042 ICP Iteration #5: Fitness 0.9980, RMSE 0.0042 ICP Iteration #6: Fitness 0.9979, RMSE 0.0042 ICP Iteration #7: Fitness 0.9979, RMSE 0.0042 ICP Iteration #8: Fitness 0.9979, RMSE 0.0042 ICP Iteration #9: Fitness 0.9979, RMSE 0.0042 ICP Iteration #10: Fitness 0.9979, RMSE 0.0042 [EvaluateHisto] Cropping geometry: [========================================] 100% Pointcloud down sampled from 6929586 points to 1449840 points. Pointcloud down sampled from 1449840 points to 1365628 points. path/to/TanksAndTemples/evaluation/data/Ignatius/evaluation//Ignatius.precision.ply Cropping geometry: [========================================] 100% Pointcloud down sampled from 5016769 points to 4957123 points. Pointcloud down sampled from 4957123 points to 4181506 points. 
[compute_point_cloud_to_point_cloud_distance] [compute_point_cloud_to_point_cloud_distance] : [ViewDistances] Add color coding to visualize error [ViewDistances] Add color coding to visualize error : [get_f1_score_histo2] ============================== evaluation result : Ignatius ============================== distance tau : 0.003 precision : 0.7679 recall : 0.7937 f-score : 0.7806 ============================== ``` **Step 5**. Go to the evaluation folder. ``TanksAndTemples/evaluation/data/Ignatius/evaluation/`` will have the following outputs. ``PR_Ignatius_@d_th_0_0030.pdf`` (Precision and recall curves with a F-score) | | | |--|--| | ``Ignatius.precision.ply`` | ``Ignatius.recall.ply`` | (3D visualization of precision and recall. Each mesh is color coded using hot colormap) # Requirements - Python 3 - open3d v0.9.0 - matplotlib ================================================ FILE: evaluation/tnt_eval/config.py ================================================ # ---------------------------------------------------------------------------- # - TanksAndTemples Website Toolbox - # - http://www.tanksandtemples.org - # ---------------------------------------------------------------------------- # The MIT License (MIT) # # Copyright (c) 2017 # Arno Knapitsch # Jaesik Park # Qian-Yi Zhou # Vladlen Koltun # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # ---------------------------------------------------------------------------- # some global parameters - do not modify scenes_tau_dict = { "Barn": 0.01, "Caterpillar": 0.005, "Church": 0.025, "Courthouse": 0.025, "Ignatius": 0.003, "Meetingroom": 0.01, "Truck": 0.005, } ================================================ FILE: evaluation/tnt_eval/evaluation.py ================================================ # ---------------------------------------------------------------------------- # - TanksAndTemples Website Toolbox - # - http://www.tanksandtemples.org - # ---------------------------------------------------------------------------- # The MIT License (MIT) # # Copyright (c) 2017 # Arno Knapitsch # Jaesik Park # Qian-Yi Zhou # Vladlen Koltun # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. 
def read_alignment_transformation(filename):
    """Load the 4x4 alignment matrix stored in a JSON file.

    The file must contain a ``"transformation"`` entry with 16 values. They
    are reshaped row by row into a 4x4 array and then transposed, so the
    values are effectively read in column-major order.

    Args:
        filename: path of the JSON file.

    Returns:
        A 4x4 ``numpy.ndarray``.
    """
    with open(filename) as data_file:
        payload = json.load(data_file)
    values = np.asarray(payload["transformation"])
    matrix = values.reshape((4, 4))
    return matrix.T
t.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamKNN(knn=20)) print("[compute_point_cloud_to_point_cloud_distance]") distance1 = s.compute_point_cloud_distance(t) print("[compute_point_cloud_to_point_cloud_distance]") distance2 = t.compute_point_cloud_distance(s) # write the distances to bin files # np.array(distance1).astype("float64").tofile( # filename_mvs + "/" + scene_name + ".precision.bin" # ) # np.array(distance2).astype("float64").tofile( # filename_mvs + "/" + scene_name + ".recall.bin" # ) # Colorize the poincloud files prith the precision and recall values # o3d.io.write_point_cloud( # filename_mvs + "/" + scene_name + ".precision.ply", s # ) # o3d.io.write_point_cloud( # filename_mvs + "/" + scene_name + ".precision.ncb.ply", s # ) # o3d.io.write_point_cloud(filename_mvs + "/" + scene_name + ".recall.ply", t) source_n_fn = filename_mvs + "/" + scene_name + ".precision.ply" target_n_fn = filename_mvs + "/" + scene_name + ".recall.ply" print("[ViewDistances] Add color coding to visualize error") # eval_str_viewDT = ( # OPEN3D_EXPERIMENTAL_BIN_PATH # + "ViewDistances " # + source_n_fn # + " --max_distance " # + str(threshold * 3) # + " --write_color_back --without_gui" # ) # os.system(eval_str_viewDT) write_color_distances(source_n_fn, s, distance1, 3 * threshold) print("[ViewDistances] Add color coding to visualize error") # eval_str_viewDT = ( # OPEN3D_EXPERIMENTAL_BIN_PATH # + "ViewDistances " # + target_n_fn # + " --max_distance " # + str(threshold * 3) # + " --write_color_back --without_gui" # ) # os.system(eval_str_viewDT) write_color_distances(target_n_fn, t, distance2, 3 * threshold) # get histogram and f-score [ precision, recall, fscore, edges_source, cum_source, edges_target, cum_target, ] = get_f1_score_histo2(threshold, filename_mvs, plot_stretch, distance1, distance2) np.savetxt(filename_mvs + "/" + scene_name + ".recall.txt", cum_target) np.savetxt(filename_mvs + "/" + scene_name + ".precision.txt", cum_source) np.savetxt( 
def get_f1_score_histo2(threshold,
                        filename_mvs,
                        plot_stretch,
                        distance1,
                        distance2,
                        verbose=True):
    """Compute precision, recall, F-score and cumulative distance histograms.

    ``precision`` is the fraction of ``distance1`` entries strictly below
    ``threshold``; ``recall`` is the same fraction for ``distance2``. The
    F-score is their harmonic mean and is defined as 0 when both are 0
    (previously this case raised ``ZeroDivisionError``).

    The histograms use bins of width ``threshold / 100`` covering
    ``[0, threshold * plot_stretch)``; the cumulative counts are normalised
    by the total number of distances, so values outside the binned range
    keep the curve below 1.

    Args:
        threshold: distance threshold (tau).
        filename_mvs: unused; kept for interface compatibility.
        plot_stretch: multiple of ``threshold`` covered by the histogram.
        distance1: sequence of distances used for precision.
        distance2: sequence of distances used for recall.
        verbose: unused; kept for interface compatibility.

    Returns:
        ``[precision, recall, fscore, edges_source, cum_source,
        edges_target, cum_target]``. If either distance sequence is empty,
        the three scores are 0 and every array is ``np.array([0])``.
    """
    print("[get_f1_score_histo2]")
    dist_threshold = threshold
    if len(distance1) and len(distance2):
        # One conversion up front: counting is then done in native code
        # instead of a Python-level loop over (possibly millions of) points.
        d_source = np.asarray(distance1, dtype=float)
        d_target = np.asarray(distance2, dtype=float)

        recall = float(np.count_nonzero(d_target < threshold)) / float(
            len(d_target))
        precision = float(np.count_nonzero(d_source < threshold)) / float(
            len(d_source))

        score_sum = recall + precision
        # No point within the threshold on either side -> F-score is 0.
        fscore = 2 * recall * precision / score_sum if score_sum > 0 else 0.0

        bins = np.arange(0, dist_threshold * plot_stretch, dist_threshold / 100)

        hist, edges_source = np.histogram(d_source, bins)
        cum_source = np.cumsum(hist).astype(float) / len(d_source)

        hist, edges_target = np.histogram(d_target, bins)
        cum_target = np.cumsum(hist).astype(float) / len(d_target)

    else:
        precision = 0
        recall = 0
        fscore = 0
        edges_source = np.array([0])
        cum_source = np.array([0])
        edges_target = np.array([0])
        cum_target = np.array([0])

    return [
        precision,
        recall,
        fscore,
        edges_source,
        cum_source,
        edges_target,
        cum_target,
    ]
def plot_graph(
    scene,
    fscore,
    dist_threshold,
    edges_source,
    cum_source,
    edges_target,
    cum_target,
    plot_stretch,
    mvs_outpath,
    show_figure=False,
):
    """Plot cumulative precision / recall curves and save them as PNG and PDF.

    The curves are drawn against ``edges_*[1:]`` (upper bin edges) with the
    cumulative values scaled to percent; a dashed vertical line marks
    ``dist_threshold``. The files are written to
    ``<mvs_outpath>/PR_<scene>_@d_th_0_<dddd>.png`` / ``.pdf`` where
    ``<dddd>`` is ``dist_threshold * 10000`` zero-padded to four digits.

    Args:
        scene: scene name used in the title and file names.
        fscore: F-score in ``[0, 1]``; shown in the title as a percentage.
        dist_threshold: distance threshold marked on the x axis.
        edges_source: histogram bin edges for the precision curve.
        cum_source: cumulative precision values (fractions).
        edges_target: histogram bin edges for the recall curve.
        cum_target: cumulative recall values (fractions).
        plot_stretch: the x axis spans ``[0, dist_threshold * plot_stretch]``.
        mvs_outpath: output directory (must already exist).
        show_figure: when true, also display the figure with ``plt.show()``.
    """
    f = plt.figure()
    plt_size = [14, 7]
    pfontsize = "medium"

    ax = plt.subplot(111)
    ax.plot(
        edges_source[1::],
        cum_source * 100,
        c="red",
        label="precision",
        linewidth=2.0,
    )
    ax.plot(
        edges_target[1::],
        cum_target * 100,
        c="blue",
        label="recall",
        linewidth=2.0,
    )

    ax.grid(True)
    # These two calls change global matplotlib defaults (kept as in the
    # original); they do not resize the figure that was already created.
    plt.rcParams["figure.figsize"] = plt_size
    plt.rc("axes", prop_cycle=cycler("color", ["r", "g", "b", "y"]))
    plt.title("Precision and Recall: " + scene + ", " + "%02.2f f-score" %
              (fscore * 100))
    plt.axvline(x=dist_threshold, c="black", ls="dashed", linewidth=2.0)

    plt.ylabel("# of points (%)", fontsize=15)
    plt.xlabel("Meters", fontsize=15)
    plt.axis([0, dist_threshold * plot_stretch, 0, 100])

    # Shrink the axes to 80% width to leave room for the legend on the right.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

    # Put a legend to the right of the current axis. (Earlier legend calls
    # were each replaced by this one, so only this one is kept.)
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.setp(ax.get_legend().get_texts(), fontsize=pfontsize)

    png_name = mvs_outpath + "/PR_{0}_@d_th_0_{1}.png".format(
        scene, "%04d" % (dist_threshold * 10000))
    pdf_name = mvs_outpath + "/PR_{0}_@d_th_0_{1}.pdf".format(
        scene, "%04d" % (dist_threshold * 10000))

    try:
        # save figure and display
        f.savefig(png_name, format="png", bbox_inches="tight")
        f.savefig(pdf_name, format="pdf", bbox_inches="tight")
        if show_figure:
            plt.show()
    finally:
        # Release the figure; pyplot otherwise keeps every figure alive,
        # which leaks memory when many scenes are plotted in one process.
        plt.close(f)
def read_mapping(filename):
    """Read a frame-mapping text file.

    Expected layout: the first line holds the number of sampled frames, the
    second line the total number of frames, followed by one line per sampled
    frame containing two whitespace-separated integers.
    (``gen_sparse_trajectory`` in this file treats the second integer of a
    row as a 1-based index into the full trajectory.)

    Args:
        filename: path of the mapping file.

    Returns:
        ``[n_sampled_frames, n_total_frames, mapping]`` where ``mapping`` is
        a float ``numpy.ndarray`` of shape ``(n_sampled_frames, 2)``.

    Raises:
        ValueError: if a header line is not an integer, or a row is missing
            or does not contain exactly two integers.
    """
    with open(filename, "r") as f:
        n_sampled_frames = int(f.readline())
        n_total_frames = int(f.readline())
        mapping = np.zeros(shape=(n_sampled_frames, 2))
        for row in range(n_sampled_frames):
            mapping[row, :] = [int(tok) for tok in f.readline().split()]
    return [n_sampled_frames, n_total_frames, mapping]
def crop_and_downsample(
        pcd,
        crop_volume,
        down_sample_method="voxel",
        voxel_size=0.01,
        trans=np.identity(4),
):
    """Transform, crop and optionally down-sample a copy of a point cloud.

    The input ``pcd`` is deep-copied first, so it is never modified.

    Args:
        pcd: point cloud to process.
        crop_volume: object whose ``crop_point_cloud`` performs the crop.
        down_sample_method: ``"voxel"`` for voxel down-sampling with
            ``voxel_size``; ``"uniform"`` to keep every k-th point, applied
            only when the cropped cloud has more than ``MAX_POINT_NUMBER``
            points; any other value returns the cropped cloud unchanged.
        voxel_size: voxel edge length for the ``"voxel"`` method.
        trans: 4x4 transformation applied before cropping.

    Returns:
        The cropped (and possibly down-sampled) point cloud.
    """
    working = copy.deepcopy(pcd)
    working.transform(trans)
    cropped = crop_volume.crop_point_cloud(working)

    if down_sample_method == "voxel":
        return cropped.voxel_down_sample(voxel_size)

    if down_sample_method != "uniform":
        return cropped

    point_count = len(cropped.points)
    if point_count <= MAX_POINT_NUMBER:
        # Already small enough: nothing to thin out.
        return cropped

    stride = int(round(point_count / float(MAX_POINT_NUMBER)))
    return cropped.uniform_down_sample(stride)
def crop_based_target(s, t):
    """Keep only the points of ``s`` inside the axis-aligned bounding box of ``t``.

    Points lying exactly on the box boundary are kept. The returned cloud is
    a new ``o3d.geometry.PointCloud`` that carries point coordinates only;
    any other per-point attributes of ``s`` are not copied.

    Args:
        s: source point cloud to filter.
        t: target point cloud whose bounding box defines the region to keep.

    Returns:
        A new point cloud with the retained points of ``s``.
    """
    bbox_t = t.get_axis_aligned_bounding_box()
    min_bound = bbox_t.get_min_bound()
    max_bound = bbox_t.get_max_bound()

    # Convert to an ndarray once and compare that explicitly, rather than
    # relying on implicit coercion of the Open3D vector in each comparison.
    points = np.asarray(s.points)
    valid = np.logical_and(np.all(points >= min_bound, axis=1),
                           np.all(points <= max_bound, axis=1))

    s_filtered = o3d.geometry.PointCloud()
    s_filtered.points = o3d.utility.Vector3dVector(points[valid])
    return s_filtered
person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # ---------------------------------------------------------------------------- # # This python script is for downloading dataset from www.tanksandtemples.org # The dataset has a different license, please refer to # https://tanksandtemples.org/license/ # this script requires Open3D python binding # please follow the instructions in setup.py before running this script.
def run_evaluation(dataset_dir, traj_path, ply_path, out_dir):
    """Evaluate one reconstructed mesh against a Tanks and Temples scene.

    The scene name is the last path component of ``dataset_dir`` and must be a
    key of ``scenes_tau_dict``; its value is used as the distance threshold
    ``dTau``. The reconstruction at ``ply_path`` is read as a triangle mesh and
    sampled into a point cloud with as many points as the ground truth, aligned
    to the ground truth in three registration passes, and then scored with
    ``EvaluateHisto``. Precision, recall and f-score are printed, written to
    ``<out_dir>/evaluation.txt`` and passed to ``plot_graph``.

    Args:
        dataset_dir: scene directory holding ``<scene>_COLMAP_SfM.log``,
            ``<scene>_trans.txt``, ``<scene>.ply``, ``<scene>.json`` and
            ``<scene>_mapping_reference.txt``.
        traj_path: trajectory log of the reconstruction to register.
        ply_path: reconstruction mesh file; must exist.
        out_dir: output directory, created if missing.

    Raises:
        Exception: if the scene name is not in ``scenes_tau_dict``.
        AssertionError: if ``ply_path`` does not exist.
    """
    scene = os.path.basename(os.path.normpath(dataset_dir))

    if scene not in scenes_tau_dict:
        print(dataset_dir, scene)
        raise Exception("invalid dataset-dir, not in scenes_tau_dict")

    print("")
    print("===========================")
    print("Evaluating %s" % scene)
    print("===========================")

    # Per-scene distance threshold.
    dTau = scenes_tau_dict[scene]
    # put the crop-file, the GT file, the COLMAP SfM log file and
    # the alignment of the according scene in a folder of
    # the same scene name in the dataset_dir
    colmap_ref_logfile = os.path.join(dataset_dir, scene + "_COLMAP_SfM.log")
    alignment = os.path.join(dataset_dir, scene + "_trans.txt")
    gt_filen = os.path.join(dataset_dir, scene + ".ply")
    # gt_filen = os.path.join(dataset_dir, scene + "_GT.ply")
    cropfile = os.path.join(dataset_dir, scene + ".json")
    map_file = os.path.join(dataset_dir, scene + "_mapping_reference.txt")

    make_dir(out_dir)

    assert os.path.exists(ply_path), f"ply_path {ply_path} does not exist"

    # Load reconstruction and according GT
    print(gt_filen)
    gt_pcd = o3d.io.read_point_cloud(gt_filen)
    print(ply_path)
    # pcd = o3d.io.read_point_cloud(ply_path)
    # The reconstruction is a mesh: sample as many surface points as the
    # ground-truth cloud contains.
    mesh = o3d.io.read_triangle_mesh(ply_path)
    pcd = mesh.sample_points_uniformly(len(gt_pcd.points))

    gt_trans = np.loadtxt(alignment)
    traj_to_register = read_trajectory(traj_path)
    gt_traj_col = read_trajectory(colmap_ref_logfile)

    # Initial alignment computed from the two camera trajectories.
    trajectory_transform = trajectory_alignment(map_file, traj_to_register, gt_traj_col, gt_trans, scene)

    # Refine alignment by using the actual GT and MVS pointclouds
    vol = o3d.visualization.read_selection_polygon_volume(cropfile)
    # big pointclouds will be downsampled to this number to speed up alignment
    dist_threshold = dTau

    # Registration refinement in 3 iterations
    # Pass 1: voxel size dTau, ICP threshold 80 * dTau, 20 iterations.
    r2 = registration_vol_ds(pcd, gt_pcd, trajectory_transform, vol, dTau, dTau * 80, 20)
    # Pass 2: voxel size dTau / 2, ICP threshold 20 * dTau, 20 iterations.
    r3 = registration_vol_ds(pcd, gt_pcd, r2.transformation, vol, dTau / 2.0, dTau * 20, 20)
    # Pass 3: NOTE(review): presumably threshold 2 * dTau and 20 iterations;
    # confirm against registration_unif's signature.
    r = registration_unif(pcd, gt_pcd, r3.transformation, vol, 2 * dTau, 20)

    # Histograms and P/R/F1
    plot_stretch = 5
    [
        precision,
        recall,
        fscore,
        edges_source,
        cum_source,
        edges_target,
        cum_target,
    ] = EvaluateHisto(
        pcd,
        gt_pcd,
        r.transformation,
        vol,
        dTau / 2.0,
        dTau,
        out_dir,
        plot_stretch,
        scene,
    )
    eva = [precision, recall, fscore]
    # eva = [i*100 for i in eva]
    print("==============================")
    print("evaluation result : %s" % scene)
    print("==============================")
    print("distance tau : %.3f" % dTau)
    print("precision : %.4f" % eva[0])
    print("recall : %.4f" % eva[1])
    print("f-score : %.4f" % eva[2])
    print("==============================")

    # Persist the same summary next to the other evaluation outputs.
    with open(os.path.join(out_dir, "evaluation.txt"), "w") as f:
        f.write("evaluation result : %s\n" % scene)
        f.write("distance tau : %.3f\n" % dTau)
        f.write("precision : %.4f\n" % eva[0])
        f.write("recall : %.4f\n" % eva[1])
        f.write("f-score : %.4f\n" % eva[2])

    # Plotting
    plot_graph(
        scene,
        fscore,
        dist_threshold,
        edges_source,
        cum_source,
        edges_target,
        cum_target,
        plot_stretch,
        out_dir,
    )
def read_trajectory(filename):
    """Read a trajectory log file into a list of ``CameraPose`` objects.

    Each record is one metadata line of whitespace-separated integers followed
    by four lines that form the rows of a 4x4 pose matrix. Blank lines between
    records or at the end of the file are ignored.

    Args:
        filename: path of the trajectory log file.

    Returns:
        A list of ``CameraPose``, one per record, in file order. ``metadata``
        is a list of ints and ``pose`` is a ``(4, 4)`` float array.

    Raises:
        ValueError: if a metadata token is not an integer or a matrix row does
            not hold exactly four values (for example a truncated file).
    """
    traj = []
    with open(filename, "r") as f:
        metastr = f.readline()
        while metastr:
            if not metastr.strip():
                # Tolerate blank separator/trailing lines instead of trying to
                # parse a pose matrix that is not there.
                metastr = f.readline()
                continue
            # Materialise the metadata: a bare ``map`` object is a one-shot
            # iterator in Python 3 and would be empty after its first use.
            metadata = list(map(int, metastr.split()))
            mat = np.zeros(shape=(4, 4))
            for i in range(4):
                matstr = f.readline()
                mat[i, :] = np.fromstring(matstr, dtype=float, sep=" \t")
            traj.append(CameraPose(metadata, mat))
            metastr = f.readline()
    return traj
def render(viewpoint_camera, pc : GaussianModel, cfg, bg_color : torch.Tensor, scaling_modifier = 1.0, override_color = None, return_normal = True, is_all = True, dirs=None, mask_depth_thr=0.8):
    """
    Render the scene from one camera, returning colour, depth, normal and alpha maps.

    Background tensor (bg_color) must be on GPU!

    Args:
        viewpoint_camera: camera providing FoVx/FoVy, image size, view and
            projection transforms, camera_center, R, intr and optionally mask.
        pc: Gaussian model to rasterize.
        cfg: configuration; ``cfg.pipline``, ``cfg.optim`` and ``cfg.model``
            are read here.
        bg_color: background colour tensor passed to the rasterizer.
        scaling_modifier: scale modifier passed to the rasterizer and, when
            covariances are precomputed, to ``pc.get_covariance``.
        override_color: if not None, used as per-Gaussian colours instead of
            the spherical harmonics.
        return_normal: if True, per-Gaussian normals are passed to the
            rasterizer.
        is_all: forwarded to ``pc.get_normal``.
        dirs: forwarded unchanged to the rasterizer.
        mask_depth_thr: NOTE(review): not read in this function; the threshold
            actually used is ``cfg.optim.mask_depth_thr``.

    Returns:
        dict with keys "render", "depth", "normal", "est_normal", "alpha",
        "viewspace_points", "viewspace_points_densify", "visibility_filter",
        "mask" and "radii"; plus "render_sem", "depth_var" and "distortion"
        when the corresponding loss weights in ``cfg.optim.loss_weight`` are
        positive.
    """

    # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means
    screenspace_points = torch.zeros_like(pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda") + 0
    screenspace_points_densify = torch.zeros_like(pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda") + 0
    try:
        screenspace_points.retain_grad()
        screenspace_points_densify.retain_grad()
    except:
        # Best effort: any failure of retain_grad() is ignored.
        pass

    # Set up rasterization configuration
    tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
    tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)

    raster_settings = GaussianRasterizationSettings(
        image_height=int(viewpoint_camera.image_height),
        image_width=int(viewpoint_camera.image_width),
        tanfovx=tanfovx,
        tanfovy=tanfovy,
        bg=bg_color,
        scale_modifier=scaling_modifier,
        viewmatrix=viewpoint_camera.world_view_transform,
        projmatrix=viewpoint_camera.full_proj_transform,
        sh_degree=pc.active_sh_degree,
        campos=viewpoint_camera.camera_center,
        prefiltered=False,
        debug=cfg.pipline.debug,
        f_count=0,
    )

    rasterizer = GaussianRasterizer(raster_settings=raster_settings)

    means3D = pc.get_xyz
    means2D = screenspace_points
    means2D_densify = screenspace_points_densify
    opacity = pc.get_opacity

    # If precomputed 3d covariance is provided, use it. If not, then it will be computed from
    # scaling / rotation by the rasterizer.
    scales = None
    rotations = None
    cov3D_precomp = None
    if cfg.pipline.compute_cov3D_python:
        cov3D_precomp = pc.get_covariance(scaling_modifier)
    else:
        scales = pc.get_scaling
        rotations = pc.get_rotation

    # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors
    # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer.
    shs = None
    colors_precomp = None
    if override_color is None:
        if cfg.pipline.convert_SHs_python:
            shs_view = pc.get_features.transpose(1, 2).view(-1, 3, (pc.max_sh_degree+1)**2)
            # Unit direction from the camera centre to every Gaussian.
            dir_pp = (pc.get_xyz - viewpoint_camera.camera_center.repeat(pc.get_features.shape[0], 1))
            dir_pp_normalized = dir_pp/dir_pp.norm(dim=1, keepdim=True)
            sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized)
            colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
        else:
            shs = pc.get_features
    else:
        colors_precomp = override_color

    normals_precomp = None
    # inside, _ = pc.get_inside_gaus_normalized()
    if return_normal:
        normal = pc.get_normal(is_all=is_all)
        # convert normal direction to the camera; calculate the normal in the camera coordinate
        view_dir = means3D - viewpoint_camera.camera_center
        # Sign factor is +1 where dot(view_dir, normal) > 0 and -1 otherwise,
        # so every normal ends up with a non-negative dot product with view_dir.
        normal = normal * ((((view_dir * normal).sum(dim=-1) > 0) * 1 - 0.5) * 2)[..., None]
        R_w2c = torch.tensor(viewpoint_camera.R.T).cuda().to(torch.float32)
        normals_precomp = normal @ R_w2c.transpose(0, 1) # camera coordinate

    # Semantic features are only rasterized when the semantic loss is enabled.
    sem_feats = pc.get_objects.squeeze(1) if cfg.optim.loss_weight.semantic > 0 else None
    inside = None

    # Rasterize visible Gaussians to image, obtain their radii (on screen).
    rendered_out, radii = rasterizer(
        means3D = means3D,
        means2D = means2D,
        means2D_densify = means2D_densify,
        shs = shs,
        colors_precomp = colors_precomp,
        normals_precomp = normals_precomp,
        semantics_precomp = sem_feats,
        opacities = opacity,
        scales = scales,
        rotations = rotations,
        cov3D_precomp = cov3D_precomp,
        dirs = dirs,
        inside = inside)
    # The first 8 channels along dim 0 are split as image (3), depth (1),
    # normal (3) and alpha (1).
    chs = [3, 1, 3, 1]
    rendered_image, rendered_depth, rendered_normal, rendered_alpha = rendered_out[:sum(chs)].split(chs, dim=0)

    with torch.no_grad():
        # Use the camera's mask when it has one, otherwise an all-True mask.
        mask = viewpoint_camera.mask.bool() if hasattr(viewpoint_camera, 'mask') else \
            torch.ones_like(rendered_depth, dtype=torch.bool).squeeze(0)
        if cfg.optim.mask_depth_thr > 0:
            # Additionally keep only pixels whose rendered depth is below
            # pc.extent * cfg.optim.mask_depth_thr.
            mask1 = rendered_depth < (pc.extent * cfg.optim.mask_depth_thr)
            mask1 = mask1.squeeze(0)
            mask = mask & mask1

    # Move the channel axis last and normalize each vector to unit length.
    rendered_normal = rendered_normal.permute(1, 2, 0)
    rendered_normal = F.normalize(rendered_normal, dim = -1)
    # Second normal map, computed from the rendered depth and camera intrinsics.
    est_normal = compute_normals(rendered_depth, viewpoint_camera.intr)

    out = {"render": rendered_image,
           "depth": rendered_depth,
           "normal": rendered_normal,
           "est_normal": est_normal,
           "alpha": rendered_alpha,
           "viewspace_points": screenspace_points,
           "viewspace_points_densify": screenspace_points_densify,
           "visibility_filter" : radii > 0,
           "mask": mask,
           "radii": radii,}

    if cfg.optim.loss_weight.semantic > 0:
        # Semantic feature channels directly follow the first 8 channels.
        rendered_sem = rendered_out[sum(chs):sum(chs)+cfg.model.ch_sem_feat]
        rendered_sem = pc.classifier(rendered_sem[None])[0].permute(1, 2, 0) # [H, W, cls]
        out.update({"render_sem": rendered_sem})

    if hasattr(cfg.optim.loss_weight, 'depth_var') and cfg.optim.loss_weight.depth_var > 0:
        # Computed from the last two channels as d2 / alpha - (d1 / alpha) ** 2.
        # NOTE(review): no guard against rendered_alpha == 0 here; confirm the
        # consumer handles non-finite values.
        d1 = rendered_out[-2:-1]
        d2 = rendered_out[-1:]
        depth_var = d2 / rendered_alpha - (d1 / rendered_alpha) ** 2
        out.update({"depth_var": depth_var})

    if hasattr(cfg.optim.loss_weight, 'distortion') and cfg.optim.loss_weight.distortion > 0:
        # NOTE(review): reads the same last channel as d2 above; confirm
        # depth_var and distortion are never enabled together.
        rendered_dist = rendered_out[-1:]
        out.update({"distortion": rendered_dist})

    return out
override_color = None): """ use the original Gaussian Splatting cuda code!!!! """ # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means screenspace_points = torch.zeros_like(pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda") + 0 try: screenspace_points.retain_grad() except: pass # Set up rasterization configuration tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) raster_settings = GaussianRasterizationSettings( image_height=int(viewpoint_camera.image_height), image_width=int(viewpoint_camera.image_width), tanfovx=tanfovx, tanfovy=tanfovy, bg=bg_color, scale_modifier=scaling_modifier, viewmatrix=viewpoint_camera.world_view_transform, projmatrix=viewpoint_camera.full_proj_transform, sh_degree=pc.active_sh_degree, campos=viewpoint_camera.camera_center, prefiltered=False, debug=cfg.pipline.debug ) rasterizer = GaussianRasterizer(raster_settings=raster_settings) means3D = pc.get_xyz means2D = screenspace_points opacity = pc.get_opacity # If precomputed 3d covariance is provided, use it. If not, then it will be computed from # scaling / rotation by the rasterizer. scales = None rotations = None cov3D_precomp = None if cfg.pipline.compute_cov3D_python: cov3D_precomp = pc.get_covariance(scaling_modifier) else: scales = pc.get_scaling rotations = pc.get_rotation # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer. 
def count_render(
    viewpoint_camera,
    pc: GaussianModel,
    pipe,
    bg_color: torch.Tensor,
    scaling_modifier=1.0,
    override_color=None,
):
    """
    Render the scene with the rasterizer's ``f_count=1`` setting.

    Background tensor (bg_color) must be on GPU!

    With this setting the rasterizer call is unpacked into four values:
    ``gaussians_count``, ``important_score``, the rendered image and the radii.
    NOTE(review): the exact meaning of the two statistics is defined by the
    rasterizer extension and is not visible here.

    Args:
        viewpoint_camera: camera providing FoVx/FoVy, image size, view and
            projection transforms and camera_center.
        pc: Gaussian model to rasterize.
        pipe: pipeline options; ``debug``, ``compute_cov3D_python`` and
            ``convert_SHs_python`` are read.
        bg_color: background colour tensor passed to the rasterizer.
        scaling_modifier: scale modifier passed to the rasterizer.
        override_color: if not None, used as per-Gaussian colours instead of
            the spherical harmonics.

    Returns:
        dict with keys "render", "viewspace_points", "visibility_filter",
        "radii", "gaussians_count" and "important_score".
    """

    # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means
    screenspace_points = (
        torch.zeros_like(
            pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda"
        )
        + 0
    )
    try:
        screenspace_points.retain_grad()
    except:
        # Best effort: any failure of retain_grad() is ignored.
        pass

    # Set up rasterization configuration
    tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
    tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)

    raster_settings = GaussianRasterizationSettings(
        image_height=int(viewpoint_camera.image_height),
        image_width=int(viewpoint_camera.image_width),
        tanfovx=tanfovx,
        tanfovy=tanfovy,
        bg=bg_color,
        scale_modifier=scaling_modifier,
        viewmatrix=viewpoint_camera.world_view_transform,
        projmatrix=viewpoint_camera.full_proj_transform,
        sh_degree=pc.active_sh_degree,
        campos=viewpoint_camera.camera_center,
        prefiltered=False,
        debug=pipe.debug,
        f_count=1,
    )

    rasterizer = GaussianRasterizer(raster_settings=raster_settings)

    means3D = pc.get_xyz
    means2D = screenspace_points
    opacity = pc.get_opacity

    # If precomputed 3d covariance is provided, use it. If not, then it will be computed from
    # scaling / rotation by the rasterizer.
    scales = None
    rotations = None
    cov3D_precomp = None
    if pipe.compute_cov3D_python:
        cov3D_precomp = pc.get_covariance(scaling_modifier)
    else:
        scales = pc.get_scaling
        rotations = pc.get_rotation

    # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors
    # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer.
    shs = None
    colors_precomp = None
    if override_color is None:
        if pipe.convert_SHs_python:
            shs_view = pc.get_features.transpose(1, 2).view(
                -1, 3, (pc.max_sh_degree + 1) ** 2
            )
            # Unit direction from the camera centre to every Gaussian.
            dir_pp = pc.get_xyz - viewpoint_camera.camera_center.repeat(
                pc.get_features.shape[0], 1
            )
            dir_pp_normalized = dir_pp / dir_pp.norm(dim=1, keepdim=True)
            sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized)
            colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
        else:
            shs = pc.get_features
    else:
        colors_precomp = override_color

    # Rasterize visible Gaussians to image, obtain their radii (on screen).
    # No densification means, normals or semantic features are supplied.
    gaussians_count, important_score, rendered_image, radii = rasterizer(
        means3D=means3D,
        means2D=means2D,
        means2D_densify=None,
        shs=shs,
        colors_precomp=colors_precomp,
        normals_precomp = None,
        semantics_precomp = None,
        opacities=opacity,
        scales=scales,
        rotations=rotations,
        cov3D_precomp=cov3D_precomp,
    )

    # Those Gaussians that were frustum culled or had a radius of 0 were not visible.
    # They will be excluded from value updates used in the splitting criteria.
    return {
        "render": rendered_image,
        "viewspace_points": screenspace_points,
        "visibility_filter": radii > 0,
        "radii": radii,
        "gaussians_count": gaussians_count,
        "important_score": important_score,
    }
We will use it to make pytorch return gradients of the 2D (screen-space) means screenspace_points = ( torch.zeros_like( pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda" ) + 0 ) try: screenspace_points.retain_grad() except: pass # Set up rasterization configuration tanfovx = math.tan(viewpoint_camera.FoVx * 0.5) tanfovy = math.tan(viewpoint_camera.FoVy * 0.5) raster_settings = GaussianRasterizationSettings( image_height=int(viewpoint_camera.image_height), image_width=int(viewpoint_camera.image_width), tanfovx=tanfovx, tanfovy=tanfovy, bg=bg_color, scale_modifier=scaling_modifier, viewmatrix=viewpoint_camera.world_view_transform, projmatrix=viewpoint_camera.full_proj_transform, sh_degree=pc.active_sh_degree, campos=viewpoint_camera.camera_center, prefiltered=False, debug=pipe.debug, f_count=2, ) rasterizer = GaussianRasterizer(raster_settings=raster_settings) means3D = pc.get_xyz means2D = screenspace_points opacity = pc.get_opacity # If precomputed 3d covariance is provided, use it. If not, then it will be computed from # scaling / rotation by the rasterizer. scales = None rotations = None cov3D_precomp = None if pipe.compute_cov3D_python: cov3D_precomp = pc.get_covariance(scaling_modifier) else: scales = pc.get_scaling rotations = pc.get_rotation # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer. 
def visi_acc_render(
    viewpoint_camera,
    pc: GaussianModel,
    pipe,
    bg_color: torch.Tensor,
    scaling_modifier=1.0,
    override_color=None,
):
    """
    Run the rasterizer with its ``f_count=3`` setting.

    Background tensor (bg_color) must be on GPU!

    With this setting the rasterizer call is unpacked into two values,
    ``countlist`` and the radii; no rendered image is returned.
    NOTE(review): the exact meaning of ``countlist`` is defined by the
    rasterizer extension and is not visible here.

    Args:
        viewpoint_camera: camera providing FoVx/FoVy, image size, view and
            projection transforms and camera_center.
        pc: Gaussian model to rasterize.
        pipe: pipeline options; ``debug``, ``compute_cov3D_python`` and
            ``convert_SHs_python`` are read.
        bg_color: background colour tensor passed to the rasterizer.
        scaling_modifier: scale modifier passed to the rasterizer.
        override_color: if not None, used as per-Gaussian colours instead of
            the spherical harmonics.

    Returns:
        dict with keys "viewspace_points", "visibility_filter", "radii" and
        "countlist".
    """

    # Create zero tensor. We will use it to make pytorch return gradients of the 2D (screen-space) means
    screenspace_points = (
        torch.zeros_like(
            pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda"
        )
        + 0
    )
    try:
        screenspace_points.retain_grad()
    except:
        # Best effort: any failure of retain_grad() is ignored.
        pass

    # Set up rasterization configuration
    tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)
    tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)

    raster_settings = GaussianRasterizationSettings(
        image_height=int(viewpoint_camera.image_height),
        image_width=int(viewpoint_camera.image_width),
        tanfovx=tanfovx,
        tanfovy=tanfovy,
        bg=bg_color,
        scale_modifier=scaling_modifier,
        viewmatrix=viewpoint_camera.world_view_transform,
        projmatrix=viewpoint_camera.full_proj_transform,
        sh_degree=pc.active_sh_degree,
        campos=viewpoint_camera.camera_center,
        prefiltered=False,
        debug=pipe.debug,
        f_count=3,
    )

    rasterizer = GaussianRasterizer(raster_settings=raster_settings)

    means3D = pc.get_xyz
    means2D = screenspace_points
    opacity = pc.get_opacity

    # If precomputed 3d covariance is provided, use it. If not, then it will be computed from
    # scaling / rotation by the rasterizer.
    scales = None
    rotations = None
    cov3D_precomp = None
    if pipe.compute_cov3D_python:
        cov3D_precomp = pc.get_covariance(scaling_modifier)
    else:
        scales = pc.get_scaling
        rotations = pc.get_rotation

    # If precomputed colors are provided, use them. Otherwise, if it is desired to precompute colors
    # from SHs in Python, do it. If not, then SH -> RGB conversion will be done by rasterizer.
    shs = None
    colors_precomp = None
    if override_color is None:
        if pipe.convert_SHs_python:
            shs_view = pc.get_features.transpose(1, 2).view(
                -1, 3, (pc.max_sh_degree + 1) ** 2
            )
            # Unit direction from the camera centre to every Gaussian.
            dir_pp = pc.get_xyz - viewpoint_camera.camera_center.repeat(
                pc.get_features.shape[0], 1
            )
            dir_pp_normalized = dir_pp / dir_pp.norm(dim=1, keepdim=True)
            sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized)
            colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
        else:
            shs = pc.get_features
    else:
        colors_precomp = override_color

    # Rasterize visible Gaussians to image, obtain their radii (on screen).
    # No densification means, normals or semantic features are supplied.
    countlist, radii = rasterizer(
        means3D=means3D,
        means2D=means2D,
        means2D_densify=None,
        shs=shs,
        colors_precomp=colors_precomp,
        normals_precomp = None,
        semantics_precomp = None,
        opacities=opacity,
        scales=scales,
        rotations=rotations,
        cov3D_precomp=cov3D_precomp,
    )

    # Those Gaussians that were frustum culled or had a radius of 0 were not visible.
    # They will be excluded from value updates used in the splitting criteria.
    return {
        "viewspace_points": screenspace_points,
        "visibility_filter": radii > 0,
        "radii": radii,
        "countlist": countlist,
    }
def read():
    """Receive one length-prefixed JSON message from the viewer connection.

    The frame is a 4-byte little-endian length followed by that many bytes of
    UTF-8 encoded JSON, read from the module-level ``conn`` socket.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: if the payload is not valid JSON, which includes
            the peer closing the connection before a full frame arrived.
    """
    global conn

    def _recv_exact(num_bytes):
        # ``recv(n)`` may return fewer than ``n`` bytes, so keep reading until
        # the requested amount arrived or the peer closed the connection.
        chunks = []
        remaining = num_bytes
        while remaining > 0:
            chunk = conn.recv(remaining)
            if not chunk:
                # Connection closed early: return what we have so the caller
                # fails in json.loads exactly as it did before.
                break
            chunks.append(chunk)
            remaining -= len(chunk)
        return b"".join(chunks)

    message_length = int.from_bytes(_recv_exact(4), 'little')
    message = _recv_exact(message_length)
    return json.loads(message.decode("utf-8"))
def create_init_files(pinhole_dict_file, db_file, out_dir):
    """Write COLMAP text-model files seeded with known camera poses.

    Reads per-image parameters from ``pinhole_dict_file`` and image ids from
    the ``images`` table of the COLMAP database, then writes ``cameras.txt``,
    ``images.txt`` and an empty ``points3D.txt`` into ``out_dir``.

    Args:
        pinhole_dict_file: JSON file mapping image name to
            ``[w, h, fx, fy, cx, cy, qw, qx, qy, qz, tx, ty, tz]``.
        db_file: path of the COLMAP database.
        out_dir: output directory; created (with parents) if missing.

    Raises:
        ValueError: if the pinhole dictionary is empty.
        KeyError: if the database lists an image missing from the dictionary.
    """
    # Partially adapted from https://github.com/Kai-46/nerfplusplus/blob/master/colmap_runner/run_colmap_posed.py
    from submodules.colmap.scripts.python.database import COLMAPDatabase  # NOQA

    os.makedirs(out_dir, exist_ok=True)

    with open(pinhole_dict_file) as fp:
        pinhole_dict = json.load(fp)
    if not pinhole_dict:
        raise ValueError(f"no camera entries found in {pinhole_dict_file}")

    cameras_line_template = '{camera_id} RADIAL {width} {height} {f} {cx} {cy} {k1} {k2}\n'
    images_line_template = '{image_id} {qw} {qx} {qy} {qz} {tx} {ty} {tz} {camera_id} {image_name}\n\n'

    # Pre-format everything except the ids, which are only known once the
    # database has been read.
    template = {}
    for img_name, params in pinhole_dict.items():
        # params: w, h, fx, fy, cx, cy, qvec, t (fy is unused: one focal length)
        w, h, fx = params[0], params[1], params[2]
        cx, cy = params[4], params[5]
        qvec = params[6:10]
        tvec = params[10:13]

        cam_line = cameras_line_template.format(
            camera_id="{camera_id}", width=w, height=h, f=fx, cx=cx, cy=cy, k1=0, k2=0)
        img_line = images_line_template.format(image_id="{image_id}", qw=qvec[0], qx=qvec[1], qy=qvec[2], qz=qvec[3],
                                               tx=tvec[0], ty=tvec[1], tz=tvec[2], camera_id="{camera_id}",
                                               image_name=img_name)
        template[img_name] = (cam_line, img_line)

    # Map image name -> image id as stored in the database.
    db = COLMAPDatabase.connect(db_file)
    try:
        img_name2id_dict = {row[1]: row[0] for row in db.execute("SELECT * FROM images")}
    finally:
        db.close()

    # All images share camera 1. Its intrinsics come from the last dictionary
    # entry, which is what the previous implicit loop-variable reuse selected.
    shared_cam_name = list(pinhole_dict)[-1]
    cameras_txt_lines = [template[shared_cam_name][0].format(camera_id=1)]

    images_txt_lines = [
        template[img_name][1].format(image_id=img_id, camera_id=1)
        for img_name, img_id in img_name2id_dict.items()
    ]

    with open(os.path.join(out_dir, 'cameras.txt'), 'w') as fp:
        fp.writelines(cameras_txt_lines)

    with open(os.path.join(out_dir, 'images.txt'), 'w') as fp:
        fp.writelines(images_txt_lines)
        fp.write('\n')

    # create an empty points3D.txt
    with open(os.path.join(out_dir, 'points3D.txt'), 'w'):
        pass
Exiting.") exit(exit_code) if args.existing_pose: db_file = os.path.join(args.source_path, 'distorted/database.db') sfm_dir = os.path.join(args.source_path, 'distorted/sparse/0') pinhole_dict_file = os.path.join(args.source_path, 'pinhole_dict.json') create_init_files(pinhole_dict_file, db_file, sfm_dir) ### Bundle adjustment # The default Mapper tolerance is unnecessarily large, # decreasing it speeds up bundle adjustment steps. mapper_cmd = (colmap_command + " mapper \ --database_path " + args.source_path + "/distorted/database.db \ --image_path " + args.source_path + "/input \ --output_path " + args.source_path + "/distorted/sparse \ --Mapper.ba_global_function_tolerance=0.000001") exit_code = os.system(mapper_cmd) if exit_code != 0: logging.error(f"Mapper failed with code {exit_code}. Exiting.") exit(exit_code) if not args.skip_distorting: ### Image undistortion ## We need to undistort our images into ideal pinhole intrinsics. img_undist_cmd = (colmap_command + " image_undistorter \ --image_path " + args.source_path + "/input \ --input_path " + args.source_path + "/distorted/sparse/0 \ --output_path " + args.source_path + "\ --output_type COLMAP") exit_code = os.system(img_undist_cmd) if exit_code != 0: logging.error(f"Mapper failed with code {exit_code}. Exiting.") exit(exit_code) files = os.listdir(args.source_path + "/distorted/sparse/0") os.makedirs(args.source_path + "/sparse/0", exist_ok=True) # Copy each file from the source directory to the destination directory for file in files: source_file = os.path.join(args.source_path, "distorted/sparse/0", file) destination_file = os.path.join(args.source_path, "sparse", "0", file) shutil.move(source_file, destination_file) if(args.resize): print("Copying and resizing...") # Resize images. 
if __name__ == '__main__':
    # This Python script is based on the shell converter script provided in the MipNerF 360 repository.
    parser = ArgumentParser("Colmap converter")
    # Disable GPU use for SIFT extraction and matching.
    parser.add_argument("--no_gpu", action='store_true')
    # Skip feature extraction, matching and mapping.
    parser.add_argument("--skip_matching", action='store_true')
    # Skip the image undistortion step.
    parser.add_argument("--skip_distorting", action='store_true')
    # Scene root; images are read from <source_path>/input.
    parser.add_argument("--source_path", "-s", required=True, type=str)
    # COLMAP camera model passed to the feature extractor.
    parser.add_argument("--camera", default="OPENCV", type=str)
    # Explicit COLMAP executable; an empty string falls back to "colmap".
    parser.add_argument("--colmap_executable", default="", type=str)
    # Also write 50%, 25% and 12.5% copies into images_2/_4/_8.
    parser.add_argument("--resize", action="store_true")
    # Explicit ImageMagick executable; an empty string falls back to "magick".
    parser.add_argument("--magick_executable", default="", type=str)
    # Seed the sparse model from <source_path>/pinhole_dict.json.
    parser.add_argument("--existing_pose", action='store_true')
    # NOTE(review): only referenced by a commented-out import in
    # create_init_files; appears to be unused at the moment.
    parser.add_argument("--colmap_path", default="submodules.colmap", type=str)
    args = parser.parse_args()

    main(args)
def load_COLMAP_poses(cam_file, img_dir, tf='w2c'):
    """Read camera poses from a trajectory log with five lines per record.

    Every record is a header line ``<img_idx> <valid> <unused>`` followed by
    the four rows of a 4x4 matrix (treated as camera-to-world).  Records whose
    ``valid`` field is the string ``-1`` are ignored.

    Args:
        cam_file: path of the pose log.
        img_dir: directory whose *sorted* listing maps an image index to a
            file name.
        tf: ``'c2w'`` returns the matrices as stored, keyed by integer image
            index; any other value returns their inverses (world-to-camera)
            keyed by image file name.

    Returns:
        dict mapping index or file name to a 4x4 ``numpy`` array.

    Raises:
        ValueError: a header does not have exactly three fields, or a matrix
            entry is not a number.
    """
    # load img_dir names
    names = sorted(os.listdir(img_dir))

    with open(cam_file) as f:
        # Blank lines (typically a trailing newline at end of file) would
        # otherwise be parsed as a header / matrix row and crash the loader.
        lines = [line for line in f if line.strip()]

    # C2W
    poses = {}
    img_idx = None
    for idx, line in enumerate(lines):
        row = idx % 5
        # split() without argument tolerates repeated spaces, tabs and the
        # trailing newline.
        fields = line.split()
        if row == 0:  # header
            raw_idx, valid, _ = fields
            img_idx = int(raw_idx)
            if valid != '-1':
                poses[img_idx] = np.eye(4)
        elif img_idx in poses:
            poses[img_idx][row - 1, :] = np.array([float(n) for n in fields])

    if tf == 'c2w':
        return poses

    # convert to W2C (follow nerf convention)
    return {names[k]: np.linalg.inv(v) for k, v in poses.items()}
def find_closest_point(p1, d1, p2, d2):
    """Return the midpoint of the shortest segment joining two 3D lines.

    Line ``i`` passes through ``p_i`` with direction ``d_i``.  The parameters
    along both (unit-length) directions are obtained from a least-squares
    solve of ``u * t1 - v * t2 = p2 - p1``.
    """
    # Unit directions of the two lines
    u = d1 / np.linalg.norm(d1)
    v = d2 / np.linalg.norm(d2)

    # Columns of the system matrix are u and -v; right-hand side is the
    # offset between the two anchor points.
    system = np.column_stack((u, -v))
    offset = p2 - p1
    solution = np.linalg.lstsq(system, offset, rcond=None)[0]
    t1, t2 = solution

    # Point reached on each line, then their average
    on_first = p1 + u * t1
    on_second = p2 + v * t2
    return 0.5 * (on_first + on_second)
def split_data(names, split=10):
    """Partition image names into train / test lists.

    The names are sorted first; every name whose position in that order is a
    multiple of ``split`` goes to ``'test'``, all others go to ``'train'``.
    Both returned lists are sorted.
    """
    ordered = sorted(names)
    held_out = [name for pos, name in enumerate(ordered) if pos % split == 0]
    kept = [name for pos, name in enumerate(ordered) if pos % split != 0]
    return {'train': sorted(kept), 'test': sorted(held_out)}
def _json_default(obj):
    """Turn NumPy arrays / scalars into plain Python values for ``json``."""
    if isinstance(obj, (np.ndarray, np.generic)):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def export_to_json(trans, scale, scene_path, file_name, split_dict=None, do_split=False):
    """Write the scene normalisation (and optionally a data split) to JSON.

    Args:
        trans: translation / alignment transform; list or NumPy array.
        scale: scalar, list or NumPy array.
        scene_path: directory that receives the file (and that holds the
            ``images`` folder used to derive a split).
        file_name: name of the JSON file created inside ``scene_path``.
        split_dict: optional dict merged into the output when ``do_split`` is
            true.
        do_split: when true the split is written; if ``split_dict`` is
            ``None`` one is generated with ``split_data`` from the numeric
            image names, holding out every 10th image.

    Note:
        ``split_dict`` is ignored unless ``do_split`` is true.
    """
    out = {
        "trans": trans,
        "scale": scale,
    }
    if do_split:
        if split_dict is None:
            image_names = os.listdir(os.path.join(scene_path, "images"))
            image_names = ['{:06}'.format(int(i.split(".")[0])) for i in image_names]
            split_dict = split_data(image_names, split=10)
        out.update(split_dict)

    with open(os.path.join(scene_path, file_name), "w") as outputfile:
        # The bounding helpers may hand over NumPy arrays (e.g. a per-axis
        # scale); json cannot encode those without a conversion hook.
        json.dump(out, outputfile, indent=4, default=_json_default)
    return
use sfm points as a proxy to define bounding regions center, scale, bounding_box = bound_by_points(points3D) elif args.scene_type == "object": # use poses as a proxy to define bounding regions center, scale, bounding_box = bound_by_pose(images) else: raise TypeError("Unknown scene type") # export json file export_to_json(list(center), scale, args.data_dir, "meta.json") print("Writing data to json file: ", os.path.join(args.data_dir, "meta.json")) return if __name__ == "__main__": parser = ArgumentParser() parser.add_argument("--data_dir", type=str, default=None, help="Path to data") parser.add_argument( "--scene_type", type=str, default="outdoor", choices=["outdoor", "indoor", "object"], help="Select scene type. Outdoor for building-scale reconstruction; " "indoor for room-scale reconstruction; object for object-centric scene reconstruction.", ) args = parser.parse_args() data_to_json(args) ================================================ FILE: process_data/convert_dtu_to_json.py ================================================ ''' ----------------------------------------------------------------------------- Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. NVIDIA CORPORATION and its licensors retain all intellectual property and proprietary rights in and to this software, related documentation and any modifications thereto. Any use, reproduction, disclosure or distribution of this software and related documentation without an express license agreement from NVIDIA CORPORATION is strictly prohibited. 
def load_K_Rt_from_P(filename, P=None):
    """Decompose a 3x4 projection matrix into intrinsics and a pose.

    Args:
        filename: text file holding the projection matrix, one row per line
            with space separated values; only read when ``P`` is ``None``.
            If the file has four lines the first one is dropped.
        P: optional projection matrix; when given ``filename`` is ignored.

    Returns:
        ``(intrinsics, pose)``: a 4x4 float64 matrix whose upper-left 3x3 is
        ``K`` normalised so that ``K[2, 2] == 1``, and a 4x4 float32 matrix
        built from the transposed rotation and the de-homogenised ``t``
        returned by ``cv2.decomposeProjectionMatrix``.
    """
    # This function is borrowed from IDR: https://github.com/lioryariv/idr
    if P is None:
        # Context manager so the handle is closed right after reading.
        with open(filename) as f:
            lines = f.read().splitlines()
        if len(lines) == 4:
            lines = lines[1:]
        lines = [[x[0], x[1], x[2], x[3]] for x in (x.split(" ") for x in lines)]
        P = np.asarray(lines).astype(np.float32).squeeze()

    out = cv2.decomposeProjectionMatrix(P)
    K = out[0]
    R = out[1]
    t = out[2]

    K = K / K[2, 2]
    intrinsics = np.eye(4)
    intrinsics[:3, :3] = K

    pose = np.eye(4, dtype=np.float32)
    pose[:3, :3] = R.transpose()
    # t is homogeneous (4x1); divide by the last component
    pose[:3, 3] = (t[:3] / t[3])[:, 0]

    return intrinsics, pose
def load_poses(scene_path):
    """Load per-image poses and shared intrinsics of one scene.

    Reads ``cameras_sphere.npz`` and the sorted ``image/*.png`` files of
    ``scene_path``.  For the image at sorted position ``idx`` the product
    ``world_mat_<idx> @ scale_mat_<idx>`` is decomposed with
    ``load_K_Rt_from_P``.

    Returns:
        ``(c2ws, intrinsic_param, w, h)`` where ``c2ws`` maps image file name
        to the pose returned by ``load_K_Rt_from_P``; ``intrinsic_param`` and
        the ``(w, h)`` size are those of the last image.

    Raises:
        FileNotFoundError: no ``image/*.png`` file exists in ``scene_path``.
    """
    # Copy the arrays out inside a context manager so the archive is closed.
    with np.load(os.path.join(scene_path, 'cameras_sphere.npz')) as archive:
        camera_param = dict(archive)

    images_lis = sorted(glob(os.path.join(scene_path, 'image/*.png')))
    if not images_lis:
        # Previously this ended in an UnboundLocalError at the return line.
        raise FileNotFoundError(f"No png images found in {os.path.join(scene_path, 'image')}")

    c2ws = {}
    for idx, image_path in enumerate(images_lis):
        image = os.path.basename(image_path)

        world_mat = camera_param['world_mat_%d' % idx]
        scale_mat = camera_param['scale_mat_%d' % idx]

        # scale and decompose
        P = world_mat @ scale_mat
        P = P[:3, :4]
        intrinsic_param, c2w = load_K_Rt_from_P(None, P)
        c2ws[image] = c2w

    # Only the size of the last image is returned, so open just that one and
    # release the file handle immediately.
    with Image.open(images_lis[-1]) as img:
        w, h = img.size

    return c2ws, intrinsic_param, w, h
def create_init_files(pinhole_dict_file, db_file, out_dir):
    """Write the text model files that seed COLMAP with known poses.

    Creates ``cameras.txt`` (one shared camera with id 1), ``images.txt``
    (one pose line plus an empty line per image registered in the database)
    and an empty ``points3D.txt`` inside ``out_dir``.

    Args:
        pinhole_dict_file: JSON file mapping image name to
            ``[w, h, fx, fy, cx, cy, sk_x, sk_y, qw, qx, qy, qz, tx, ty, tz]``.
        db_file: COLMAP database; its ``images`` table supplies the image ids.
        out_dir: output directory, created (including parents) if missing.

    Raises:
        ValueError: ``pinhole_dict_file`` contains no entries.
        KeyError: the database lists an image that is missing from the JSON.
    """
    # Partially adapted from https://github.com/Kai-46/nerfplusplus/blob/master/colmap_runner/run_colmap_posed.py
    os.makedirs(out_dir, exist_ok=True)

    with open(pinhole_dict_file) as fp:
        pinhole_dict = json.load(fp)
    if not pinhole_dict:
        raise ValueError(f"No cameras found in {pinhole_dict_file}")

    # NOTE(review): COLMAP documents RADIAL as (f, cx, cy, k1, k2); this line
    # writes six values and fills the k1/k2 slots with the skew terms.
    # Kept as-is -- confirm against the COLMAP version in use.
    cameras_line_template = '{camera_id} RADIAL {width} {height} {fx} {fy} {cx} {cy} {k1} {k2}\n'
    images_line_template = '{image_id} {qw} {qx} {qy} {qz} {tx} {ty} {tz} {camera_id} {image_name}\n\n'
    camera_id = 1

    # read database: image name -> image id
    db = COLMAPDatabase.connect(db_file)
    try:
        img_name2id_dict = {row[1]: row[0] for row in db.execute("SELECT * FROM images")}
    finally:
        db.close()

    # All images share one camera; the intrinsics of the last JSON entry are
    # used, as before.
    w, h, fx, fy, cx, cy, sk_x, sk_y = list(pinhole_dict.values())[-1][:8]
    cameras_txt_lines = [cameras_line_template.format(
        camera_id=camera_id, width=w, height=h, fx=fx, fy=fy, cx=cx, cy=cy, k1=sk_x, k2=sk_y)]

    # Format every line in one step so braces in an image name cannot be
    # mistaken for format fields.
    images_txt_lines = []
    for img_name, img_id in img_name2id_dict.items():
        params = pinhole_dict[img_name]
        qvec = params[8:12]
        tvec = params[12:15]
        images_txt_lines.append(images_line_template.format(
            image_id=img_id, qw=qvec[0], qx=qvec[1], qy=qvec[2], qz=qvec[3],
            tx=tvec[0], ty=tvec[1], tz=tvec[2], camera_id=camera_id, image_name=img_name))

    with open(os.path.join(out_dir, 'cameras.txt'), 'w') as fp:
        fp.writelines(cameras_txt_lines)

    with open(os.path.join(out_dir, 'images.txt'), 'w') as fp:
        fp.writelines(images_txt_lines)
        fp.write('\n')

    # create an empty points3D.txt
    with open(os.path.join(out_dir, 'points3D.txt'), 'w'):
        pass
if __name__ == '__main__':
    # Command line entry point: optionally run the COLMAP initialisation and
    # then optionally export meta.json for every scene under --dtu_path.
    parser = ArgumentParser()
    parser.add_argument('--dtu_path', type=str, default=None, help='Path to DTU dataset')
    parser.add_argument('--export_json', action='store_true', help='export json')
    # The next two flags previously carried a copy-pasted 'export json' help text.
    parser.add_argument('--run_colmap', action='store_true', help='Run colmap')
    parser.add_argument('--split', action='store_true',
                        help='also write the train/test image split to the json')

    args = parser.parse_args()

    if args.run_colmap:
        init_colmap(args)

    if args.export_json:
        dtu_to_json(args)
def convert_cam_dict_to_pinhole_dict(cam_dict, pinhole_dict_file):
    """Dump per-image pinhole parameters to a JSON file.

    ``cam_dict`` maps image name to a world-to-camera matrix.  Each output
    entry is ``[w, h, fx, fy, cx, cy, qw, qx, qy, qz, tx, ty, tz]`` with a
    fixed 1920x1080 image size, focal length ``0.6 * w`` and the principal
    point at the image centre.
    """
    # Partially adapted from https://github.com/Kai-46/nerfplusplus/blob/master/colmap_runner/run_colmap_posed.py
    print('Writing pinhole_dict to: ', pinhole_dict_file)

    # Intrinsics are identical for every image
    w = 1920
    h = 1080
    intrinsics = [w, h, 0.6 * w, 0.6 * w, w / 2.0, h / 2.0]

    pinhole_dict = {}
    for img_name, W2C in cam_dict.items():
        rotation = rotmat2qvec(W2C[:3, :3])
        translation = W2C[:3, 3]
        pinhole_dict[img_name] = intrinsics + [
            rotation[0], rotation[1], rotation[2], rotation[3],
            translation[0], translation[1], translation[2],
        ]

    with open(pinhole_dict_file, 'w') as fp:
        json.dump(pinhole_dict, fp, indent=2, sort_keys=True)
def load_transformation(trans_file):
    """Load a 4x4 transformation matrix from a whitespace separated text file.

    Row ``i`` of the result is taken from the ``i``-th non-blank line of the
    file; rows the file does not provide keep their identity values.

    Raises:
        ValueError: an entry is not a number or a row has the wrong length.
        IndexError: the file holds more than four non-blank lines.
    """
    with open(trans_file) as f:
        # split() tolerates repeated spaces / tabs, and blank lines (e.g. a
        # trailing newline at the end of the file) are skipped instead of
        # failing in float('').
        rows = [[float(n) for n in line.split()] for line in f if line.strip()]

    trans = np.eye(4)
    for idx, row in enumerate(rows):
        trans[idx, :] = np.array(row)
    return trans
"Please check the expected folder structure in DATA_PREPROCESSING.md") # extract features os.system(f"colmap feature_extractor --database_path {scene_path}/database.db \ --image_path {scene_path}/images_raw \ --ImageReader.camera_model=RADIAL \ --SiftExtraction.use_gpu=true \ --SiftExtraction.num_threads=32 \ --ImageReader.single_camera=true" ) # match features os.system(f"colmap sequential_matcher \ --database_path {scene_path}/database.db \ --SiftMatching.use_gpu=true" ) # read poses poses = load_COLMAP_poses(os.path.join(scene_path, f'{scene}_COLMAP_SfM.log'), os.path.join(scene_path, 'images_raw')) # convert to colmap files pinhole_dict_file = os.path.join(scene_path, 'pinhole_dict.json') convert_cam_dict_to_pinhole_dict(poses, pinhole_dict_file) db_file = os.path.join(scene_path, 'database.db') sfm_dir = os.path.join(scene_path, 'sparse') create_init_files(pinhole_dict_file, db_file, sfm_dir) # bundle adjustment os.system(f"colmap point_triangulator \ --database_path {scene_path}/database.db \ --image_path {scene_path}/images_raw \ --input_path {scene_path}/sparse \ --output_path {scene_path}/sparse \ --Mapper.tri_ignore_two_view_tracks=true" ) os.system(f"colmap bundle_adjuster \ --input_path {scene_path}/sparse \ --output_path {scene_path}/sparse \ --BundleAdjustment.refine_extrinsics=false" ) # undistortion os.system(f"colmap image_undistorter \ --image_path {scene_path}/images_raw \ --input_path {scene_path}/sparse \ --output_path {scene_path} \ --output_type COLMAP \ --max_image_size 1500" ) if args.export_json: # read for bounding information trans = load_transformation(os.path.join(scene_path, f'{scene}_trans.txt')) pts = trimesh.load(os.path.join(scene_path, f'{scene}.ply')) # pts = pts.vertices # pts_aligned = align_gt_with_cam(pts, trans) # center, scale, bounding_box = compute_bound(pts_aligned[::100]) pts.vertices = align_gt_with_cam(pts.vertices, trans) # pts = pts.sample(20000) pts.vertices = pts.vertices[::100] trans, scale = 
def load_model(model_config_path, model_checkpoint_path, device):
    """Build the model described by a config file and load its checkpoint.

    The checkpoint is read onto the CPU, its ``"model"`` entry is passed
    through ``clean_state_dict`` and loaded non-strictly; the result reported
    by ``load_state_dict`` is printed.  The model is returned in eval mode.
    """
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = device
    net = build_model(cfg)

    ckpt = torch.load(model_checkpoint_path, map_location="cpu")
    weights = clean_state_dict(ckpt["model"])
    print(net.load_state_dict(weights, strict=False))

    net.eval()
    return net
def save_mask_data(output_dir, mask_list, box_list, label_list, name):
    """Merge the predicted masks into one label image and save it as PNG.

    Every pixel starts with the value 1.  For each mask / label pair the
    pixels selected by the mask are set to ``text_label_dict[sem]`` (or 1 if
    the phrase is unknown), where ``sem`` is the label text before its first
    ``'('``.  Later masks overwrite earlier ones.  The result is written to
    ``<output_dir>/<name>.png`` as ``uint8``.

    Args:
        output_dir: directory receiving the PNG.
        mask_list: tensor of masks; its last two dimensions give the image
            size and ``mask[0]`` of each item is the 2D mask.
        box_list: unused; kept for interface compatibility.
        label_list: one phrase per mask, e.g. ``"sky(0.87)"``.
        name: output file stem.
    """
    value = 1  # id written wherever no known label applies

    mask_img = torch.ones(mask_list.shape[-2:]) * value
    # zip() pairs each mask with its label and yields nothing when there are
    # no labels.  The former ``except KeyError: pdb.set_trace()`` is dropped:
    # dict.get never raises KeyError, and a debugger prompt would stall a
    # batch run.
    for mask, label in zip(mask_list, label_list):
        sem = label.split('(')[0]
        mask_img[mask.cpu().numpy()[0] == True] = text_label_dict.get(sem, value)  # noqa: E712
    mask_img = mask_img.numpy().astype(np.uint8)
    cv2.imwrite(os.path.join(output_dir, f'{name}.png'), mask_img)
def process_image(image_name):
    """Segment one image with Grounding DINO + SAM and save its label mask.

    Relies on names set up in the ``__main__`` section: ``image_dir``,
    ``output_dir``, ``model``, ``predictor``, ``text_prompt``,
    ``box_threshold``, ``text_threshold``, ``device`` and ``args``.

    Args:
        image_name: File name (not path) of an image inside ``image_dir``.
            Outputs are named after its stem.
    """
    # splitext keeps dots inside the stem ("a.b.jpg" -> "a.b"); the previous
    # split('.')[0] truncated such names and made their outputs collide.
    name = os.path.splitext(image_name)[0]
    image_path = os.path.join(image_dir, image_name)

    image_pil, image = load_image(image_path)

    # Text-conditioned box proposals.
    boxes_filt, pred_phrases = get_grounding_output(
        model, image, text_prompt, box_threshold, text_threshold, device=device
    )

    # SAM takes the raw RGB array, not the normalized tensor.
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    predictor.set_image(image)

    W, H = image_pil.size
    # Scale boxes to pixels, then turn (cx, cy, w, h) into (x0, y0, x1, y1).
    # Done for all rows at once instead of looping over them.
    # NOTE(review): presumably the detector emits boxes normalized to [0, 1].
    boxes_filt = boxes_filt * torch.Tensor([W, H, W, H])
    boxes_filt[:, :2] -= boxes_filt[:, 2:] / 2
    boxes_filt[:, 2:] += boxes_filt[:, :2]
    boxes_filt = boxes_filt.cpu()

    transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2]).to(device)

    with torch.no_grad():
        try:
            masks, _, _ = predictor.predict_torch(
                point_coords=None,
                point_labels=None,
                boxes=transformed_boxes,
                multimask_output=False,
            )
        except RuntimeError:
            # Best effort: report the failure and fall back to one empty mask
            # so a label image is still written for this frame.
            print(f"Error in {name}")
            masks = torch.zeros([1, 1, H, W], dtype=torch.bool)

    masks = masks.cpu()

    if args.vis:
        # Overlay of masks and labeled boxes for visual inspection.
        plt.figure(figsize=(10, 10))
        plt.imshow(image)
        for mask in masks:
            show_mask(mask.numpy(), plt.gca(), random_color=True)
        for box, label in zip(boxes_filt, pred_phrases):
            show_box(box.numpy(), plt.gca(), label)

        plt.axis('off')
        plt.savefig(
            os.path.join(output_dir, f"{name}_output.png"),
            bbox_inches="tight", dpi=100, pad_inches=0.0
        )
        plt.close()  # important: close the figure to release memory

    save_mask_data(output_dir, masks, boxes_filt, pred_phrases, name)
from GroundingDINO.groundingdino.models import build_model from GroundingDINO.groundingdino.util.slconfig import SLConfig from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap # print = print_ seed = 0 np.random.seed(seed) torch.manual_seed(seed) # sets seed on the current CPU & all GPUs # cfg config_file = args.config # change the path of the model config file grounded_checkpoint = args.grounded_checkpoint # change the path of the model sam_version = args.sam_version sam_checkpoint = args.sam_checkpoint sam_hq_checkpoint = args.sam_hq_checkpoint use_sam_hq = args.use_sam_hq image_dir = args.input_image if args.text_prompt is not None: text_prompt = args.text_prompt else: text_prompt = text_prompt_dict[args.scene_type] if args.scene is not None: text_prompt = text_prompt_dict.get(args.scene, text_prompt_dict[args.scene_type]) output_dir = args.output_dir box_threshold = args.box_threshold text_threshold = args.text_threshold device = args.device # make dir os.makedirs(output_dir, exist_ok=True) # load model model = load_model(config_file, grounded_checkpoint, device=device) image_names = os.listdir(image_dir) image_names = sorted([i for i in image_names if i.endswith(".jpg") or i.endswith(".png")]) # initialize SAM if use_sam_hq: predictor = SamPredictor(sam_hq_model_registry[sam_version](checkpoint=sam_hq_checkpoint).to(device)) else: predictor = SamPredictor(sam_model_registry[sam_version](checkpoint=sam_checkpoint).to(device)) for image_name in tqdm(image_names): process_image(image_name) ================================================ FILE: process_data/extract_normal.py ================================================ import os import sys import glob import math import struct import argparse import numpy as np import collections import torch import torch.nn.functional as F from torchvision import transforms from PIL import Image, ImageFile from tqdm import tqdm ImageFile.LOAD_TRUNCATED_IMAGES = True 
# Records mirroring COLMAP's camera description.
Camera = collections.namedtuple(
    "Camera", ["id", "model", "width", "height", "params"])
CameraModel = collections.namedtuple(
    "CameraModel", ["model_id", "model_name", "num_params"])

# COLMAP camera models: numeric id, name, and number of intrinsic parameters.
CAMERA_MODELS = {
    CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3),
    CameraModel(model_id=1, model_name="PINHOLE", num_params=4),
    CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4),
    CameraModel(model_id=3, model_name="RADIAL", num_params=5),
    CameraModel(model_id=4, model_name="OPENCV", num_params=8),
    CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8),
    CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12),
    CameraModel(model_id=7, model_name="FOV", num_params=5),
    CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4),
    CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5),
    CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12)
}
# Lookup tables by numeric id and by name.
CAMERA_MODEL_IDS = {camera_model.model_id: camera_model
                    for camera_model in CAMERA_MODELS}
CAMERA_MODEL_NAMES = {camera_model.model_name: camera_model
                      for camera_model in CAMERA_MODELS}


def get_args(test=False):
    """Build the DSINE argument namespace from a ``.txt`` argument file.

    ``get_default_parser`` is imported from the DSINE ``projects`` package in
    the ``__main__`` section.  ``sys.argv[1]`` must name a ``.txt`` file; it
    is passed to the parser with an ``'@'`` prefix, followed by the remaining
    command-line arguments.
    NOTE(review): this presumably relies on the default parser being created
    with ``fromfile_prefix_chars='@'`` -- confirm in DSINE.

    Args:
        test: Unused; kept for interface compatibility.

    Returns:
        The parsed namespace, extended with ``exp_root``, ``load_normal``,
        ``load_intrins`` and ``output_dir``.
    """
    parser = get_default_parser()

    # NOTE: project-specific args
    parser.add_argument('--NNET_architecture', type=str, default='v02')
    parser.add_argument('--NNET_output_dim', type=int, default=3, help='{3, 4}')
    parser.add_argument('--NNET_output_type', type=str, default='R', help='{R, G}')
    parser.add_argument('--NNET_feature_dim', type=int, default=64)
    parser.add_argument('--NNET_hidden_dim', type=int, default=64)

    parser.add_argument('--NNET_encoder_B', type=int, default=5)

    parser.add_argument('--NNET_decoder_NF', type=int, default=2048)
    parser.add_argument('--NNET_decoder_BN', default=False, action="store_true")
    parser.add_argument('--NNET_decoder_down', type=int, default=8)
    parser.add_argument('--NNET_learned_upsampling', default=False, action="store_true")

    parser.add_argument('--NRN_prop_ps', type=int, default=5)
    parser.add_argument('--NRN_num_iter_train', type=int, default=5)
    parser.add_argument('--NRN_num_iter_test', type=int, default=5)
    parser.add_argument('--NRN_ray_relu', default=False, action="store_true")

    parser.add_argument('--loss_fn', type=str, default='AL')
    parser.add_argument('--loss_gamma', type=float, default=0.8)
    parser.add_argument('--outdir', type=str, default='/your/log/path/')

    # read arguments from txt file
    assert '.txt' in sys.argv[1]
    arg_filename_with_prefix = '@' + sys.argv[1]
    args = parser.parse_args([arg_filename_with_prefix] + sys.argv[2:])

    # NOTE: update args
    args.exp_root = os.path.join(args.outdir, 'dsine')
    args.load_normal = True
    args.load_intrins = True

    # set working dir
    exp_dir = os.path.join(args.exp_root, args.exp_name)
    args.output_dir = os.path.join(exp_dir, args.exp_id)
    return args


def focal2fov(focal, pixels):
    """Return the field of view in radians for a focal length and an image
    extent, both given in pixels."""
    return 2 * math.atan(pixels / (2 * focal))


def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"):
    """Read and unpack the next bytes from a binary file.

    :param fid: Open binary file object.
    :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc.
    :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}.
    :param endian_character: Any of {@, =, <, >, !}
    :return: Tuple of read and unpacked values.
    """
    data = fid.read(num_bytes)
    return struct.unpack(endian_character + format_char_sequence, data)


def read_intrinsics_binary(path_to_model_file):
    """Read a COLMAP ``cameras.bin`` file into ``{camera_id: Camera}``.

    see: src/base/reconstruction.cc
        void Reconstruction::WriteCamerasBinary(const std::string& path)
        void Reconstruction::ReadCamerasBinary(const std::string& path)
    """
    cameras = {}
    with open(path_to_model_file, "rb") as fid:
        num_cameras = read_next_bytes(fid, 8, "Q")[0]
        for _ in range(num_cameras):
            # Header per camera: id (int32), model id (int32), width and
            # height (uint64 each) = 24 bytes; num_params doubles follow.
            camera_id, model_id, width, height = read_next_bytes(
                fid, num_bytes=24, format_char_sequence="iiQQ")
            camera_model = CAMERA_MODEL_IDS[model_id]
            num_params = camera_model.num_params
            params = read_next_bytes(fid, num_bytes=8 * num_params,
                                     format_char_sequence="d" * num_params)
            cameras[camera_id] = Camera(id=camera_id,
                                        model=camera_model.model_name,
                                        width=width,
                                        height=height,
                                        params=np.array(params))
        assert len(cameras) == num_cameras
    return cameras


def read_intrinsics_text(path):
    """Read a COLMAP ``cameras.txt`` file into ``{camera_id: Camera}``.

    Blank lines and lines starting with ``#`` are skipped.  Only ``PINHOLE``
    cameras are accepted.

    Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py
    """
    cameras = {}
    with open(path, "r") as fid:
        for raw_line in fid:
            line = raw_line.strip()
            if not line or line[0] == "#":
                continue
            elems = line.split()
            camera_id = int(elems[0])
            model = elems[1]
            assert model == "PINHOLE", "While the loader support other types, the rest of the code assumes PINHOLE"
            width = int(elems[2])
            height = int(elems[3])
            params = np.array(tuple(map(float, elems[4:])))
            cameras[camera_id] = Camera(id=camera_id, model=model,
                                        width=width, height=height,
                                        params=params)
    return cameras


def load_intrinsic_colmap(path):
    """Load 3x3 intrinsic matrices for every camera of a COLMAP scene.

    Looks in ``<path>/sparse/0`` and falls back to ``<path>/sparse``.
    ``cameras.bin`` is tried first and ``cameras.txt`` second.

    The principal point is set to the image center (width / 2, height / 2);
    the cx, cy values stored in the COLMAP parameters are not used.

    Args:
        path: Scene root directory.

    Returns:
        float32 tensor of shape (num_cameras, 3, 3), in the order the
        cameras were read.

    Raises:
        AssertionError: A camera uses a model other than SIMPLE_PINHOLE or
            PINHOLE.
    """
    intr_dir = os.path.join(path, "sparse", "0")
    if not os.path.exists(intr_dir):
        intr_dir = os.path.join(path, "sparse")

    # support only one camera for now
    try:
        cam_intrinsics = read_intrinsics_binary(os.path.join(intr_dir, "cameras.bin"))
    except (OSError, struct.error, KeyError):
        # Binary model missing, truncated, or using an unknown model id:
        # fall back to the text model.
        cam_intrinsics = read_intrinsics_text(os.path.join(intr_dir, "cameras.txt"))

    intrinsics = []
    for intr in cam_intrinsics.values():
        if intr.model == "SIMPLE_PINHOLE":
            # One shared focal length.  It previously was never assigned to
            # focal_length_y, so this branch raised UnboundLocalError.
            focal_length_x = focal_length_y = intr.params[0]
        elif intr.model == "PINHOLE":
            focal_length_x = intr.params[0]
            focal_length_y = intr.params[1]
        else:
            # Raised explicitly (same exception type as the former
            # `assert False`) so the check survives `python -O`.
            raise AssertionError(
                "Colmap camera model not handled: only undistorted datasets (PINHOLE or SIMPLE_PINHOLE cameras) supported!")

        intrinsic = torch.eye(3, dtype=torch.float32)
        intrinsic[0, 0] = float(focal_length_x)
        intrinsic[1, 1] = float(focal_length_y)
        intrinsic[0, 2] = intr.width / 2
        intrinsic[1, 2] = intr.height / 2
        intrinsics.append(intrinsic)

    intrinsics = torch.stack(intrinsics, dim=0)

    return intrinsics
intrins=intrins)[-1] pred_norm = pred_norm[:, :, lrtb[2]:lrtb[2]+orig_H, lrtb[0]:lrtb[0]+orig_W] # save to output folder img_name = os.path.basename(img_path) # NOTE: by saving the prediction as uint8 png format, you lose a lot of precision # if you want to use the predicted normals for downstream tasks, we recommend saving them as float32 NPY files pred_norm_np = pred_norm.cpu().detach().numpy()[0,:,:,:].transpose(1, 2, 0) # (H, W, 3) -1, 1 if args.vis: pred_norm_np = ((pred_norm_np + 1.0) / 2.0 * 255.0).astype(np.uint8) target_path = os.path.join(args.output_path, img_name.replace(ext, '.png')) im = Image.fromarray(pred_norm_np) im.save(target_path) else: target_path = os.path.join(args.output_path, img_name.replace(ext, '.npz')) np.savez_compressed(target_path, pred_norm_np.astype(np.float16)) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--ckpt', default='dsine', type=str, help='path to model checkpoint') parser.add_argument('--mode', default='samples', type=str, help='{samples}') parser.add_argument("--dsine_path", dest="dsine_path", help="path to rgb image") parser.add_argument("--img_path", dest="img_path", help="path to rgb image") parser.add_argument("--intrins_path", dest="intrins_path", help="path to rgb image") parser.add_argument("--output_path", dest="output_path", help="path to where output image should be stored") parser.add_argument('--vis', action='store_true', help='visualize the output') args = parser.parse_args() dsine_path = args.dsine_path dsine_path = os.path.abspath(dsine_path) sys.path.append(dsine_path) # define model device = torch.device('cuda') set_random_seed(0) import utils.utils as utils from projects import get_default_parser from models.dsine.v02 import DSINE_v02 as DSINE cfg_path = f'{args.dsine_path}/projects/dsine/experiments/exp001_cvpr2024/dsine.txt' sys.argv = [sys.argv[0], cfg_path] cfg = get_args(test=True) model = DSINE(cfg).to(device) model.pixel_coords = model.pixel_coords.to(device) 
if __name__ == "__main__":
    # Script: run the GeoWizard depth/normal pipeline on every file of
    # --input_dir and save depth/normal arrays (.npz) and colored previews
    # (.png) into per-domain subfolders of --output_dir.
    logging.basicConfig(level=logging.INFO)

    # -------------------- Arguments --------------------
    parser = argparse.ArgumentParser(
        description="Run MonoDepthNormal Estimation using Stable Diffusion."
    )
    parser.add_argument("--code_path", help="path to code directory", type=str, default="~/code/geowizard/geowizard")
    parser.add_argument(
        "--pretrained_model_path",
        type=str,
        default='lemonaddie/geowizard',
        help="pretrained model path from hugging face or local dir",
    )
    parser.add_argument(
        "--input_dir", type=str, required=True, help="Input directory."
    )
    parser.add_argument(
        "--output_dir", type=str, required=True, help="Output directory."
    )
    # NOTE(review): `required=True` makes the default below unreachable;
    # decide whether --domain should really be mandatory.
    parser.add_argument(
        "--domain",
        type=str,
        default='indoor',
        required=True,
        help="domain prediction",
    )

    # inference setting
    parser.add_argument(
        "--denoise_steps",
        type=int,
        default=10,
        help="Diffusion denoising steps, more steps results in higher accuracy but slower inference speed.",
    )
    parser.add_argument(
        "--ensemble_size",
        type=int,
        default=10,
        help="Number of predictions to be ensembled, more inference gives better results but runs slower.",
    )
    parser.add_argument(
        "--half_precision",
        action="store_true",
        help="Run with half-precision (16-bit float), might lead to suboptimal result.",
    )

    # resolution setting
    parser.add_argument(
        "--processing_res",
        type=int,
        default=768,
        help="Maximum resolution of processing. 0 for using input image resolution. Default: 768.",
    )
    parser.add_argument(
        "--output_processing_res",
        action="store_true",
        help="When input is resized, out put depth at resized operating resolution. Default: False.",
    )

    # depth map colormap
    parser.add_argument(
        "--color_map",
        type=str,
        default="Spectral",
        help="Colormap used to render depth predictions.",
    )
    # other settings
    parser.add_argument("--seed", type=int, default=None, help="Random seed.")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=0,
        help="Inference batch size. Default: 0 (will be set automatically).",
    )

    args = parser.parse_args()

    # sys.path entries are used verbatim, so "~" (as in the default
    # --code_path) must be expanded or the imports below cannot resolve.
    sys.path.append(os.path.expanduser(args.code_path))
    from models.geowizard_pipeline import DepthNormalEstimationPipeline
    from utils.seed_all import seed_all
    from utils.depth2normal import *

    checkpoint_path = args.pretrained_model_path
    output_dir = args.output_dir
    denoise_steps = args.denoise_steps
    ensemble_size = args.ensemble_size

    if ensemble_size > 15:
        logging.warning("long ensemble steps, low speed..")

    half_precision = args.half_precision

    processing_res = args.processing_res
    match_input_res = not args.output_processing_res
    domain = args.domain

    color_map = args.color_map
    seed = args.seed
    batch_size = args.batch_size
    if batch_size == 0:
        batch_size = 1  # set default batchsize
    # NOTE: batch_size is resolved here but not passed to the pipeline call below.

    # -------------------- Preparation --------------------
    # Random seed
    if seed is None:
        import time
        seed = int(time.time())
    seed_all(seed)

    # Output directories, one set per domain.
    output_dir_color = os.path.join(output_dir, f"depth_colored_{domain}")
    output_dir_npy = os.path.join(output_dir, f"depth_npz_{domain}")
    output_dir_normal_npy = os.path.join(output_dir, f"normal_npz_{domain}")
    output_dir_normal_color = os.path.join(output_dir, f"normal_colored_{domain}")
    for directory in (output_dir, output_dir_color, output_dir_npy,
                      output_dir_normal_npy, output_dir_normal_color):
        os.makedirs(directory, exist_ok=True)
    logging.info(f"output dir = {output_dir}")

    # -------------------- Device --------------------
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
        logging.warning("CUDA is not available. Running on CPU will be slow.")
    logging.info(f"device = {device}")

    # -------------------- Data --------------------
    input_dir = args.input_dir
    test_files = sorted(os.listdir(input_dir))
    n_images = len(test_files)
    if n_images > 0:
        logging.info(f"Found {n_images} images")
    else:
        logging.error("No image found")
        # sys.exit instead of the interactive-only `exit` helper.
        sys.exit(1)

    # -------------------- Model --------------------
    if half_precision:
        dtype = torch.float16
        logging.info(f"Running with half precision ({dtype}).")
    else:
        dtype = torch.float32

    # declare a pipeline
    pipe = DepthNormalEstimationPipeline.from_pretrained(checkpoint_path, torch_dtype=dtype)
    logging.info("loading pipeline whole successfully.")

    try:
        pipe.enable_xformers_memory_efficient_attention()
    except Exception as err:
        # Optional speed-up only: run without xformers, but say so.
        logging.info("xformers attention not enabled (%s); continuing without it.", err)

    pipe = pipe.to(device)

    # -------------------- Inference and saving --------------------
    with torch.no_grad():
        for test_file in tqdm(test_files, desc="Estimating Depth & Normal", leave=True):
            rgb_path = os.path.join(input_dir, test_file)

            rgb_name_base = os.path.splitext(os.path.basename(rgb_path))[0]
            pred_name_base = rgb_name_base

            # Resume support: skip inputs whose normal map already exists.
            normal_npz_save_path = os.path.join(output_dir_normal_npy, f"{pred_name_base}.npz")
            if os.path.exists(normal_npz_save_path):
                continue

            # Read input image
            input_image = Image.open(rgb_path)

            # Predict depth and normals.
            pipe_out = pipe(
                input_image,
                denoising_steps=denoise_steps,
                ensemble_size=ensemble_size,
                processing_res=processing_res,
                match_input_res=match_input_res,
                domain=domain,
                color_map=color_map,
                show_progress_bar=False,
            )

            depth_pred: np.ndarray = pipe_out.depth_np
            depth_colored: Image.Image = pipe_out.depth_colored
            normal_pred: np.ndarray = pipe_out.normal_np
            normal_colored: Image.Image = pipe_out.normal_colored

            # Depth as compressed npz.
            npy_save_path = os.path.join(output_dir_npy, f"{pred_name_base}.npz")
            if os.path.exists(npy_save_path):
                logging.warning(f"Existing file: '{npy_save_path}' will be overwritten")
            np.savez_compressed(npy_save_path, depth_pred)

            # Normals as compressed npz.  No overwrite warning here: the
            # skip above guarantees this file did not exist.
            np.savez_compressed(normal_npz_save_path, normal_pred)

            # Colorized previews.
            depth_colored_save_path = os.path.join(output_dir_color, f"{pred_name_base}.png")
            if os.path.exists(depth_colored_save_path):
                logging.warning(
                    f"Existing file: '{depth_colored_save_path}' will be overwritten"
                )
            depth_colored.save(depth_colored_save_path)

            normal_colored_save_path = os.path.join(output_dir_normal_color, f"{pred_name_base}_colored.png")
            if os.path.exists(normal_colored_save_path):
                logging.warning(
                    f"Existing file: '{normal_colored_save_path}' will be overwritten"
                )
            normal_colored.save(normal_colored_save_path)
json\n", "import trimesh\n", "import plotly.graph_objs as go\n", "from collections import OrderedDict\n", "# Import imaginaire modules.\n", "from submodules.colmap.scripts.python.read_write_model import read_model\n", "# from tools import camera, visualize\n", "from tools.camera import quaternion\n", "from tools.visualize import k3d_visualize_pose, plotly_visualize_pose\n", "from process_data.convert_tnt_to_json import load_transformation, align_gt_with_cam\n", "from tools.camera_utils import cubic_camera, grid_camera, around_camera, up_camera, bb_camera\n", "from tools.math_utils import inv_normalize_pts" ] }, { "cell_type": "code", "execution_count": null, "id": "76033016-2d92-4a5d-9e50-3978553e8df4", "metadata": {}, "outputs": [], "source": [ "# Read the COLMAP data.\n", "# colmap_path = \"datasets/lego_ds2\"\n", "scene = 'Barn'\n", "colmap_path = f\"/your/path/tnt/{scene}\"\n", "# read piont clouds from lidar # point cloud\n", "pcd = trimesh.load(os.path.join(colmap_path, '{}.ply'.format(colmap_path.split('/')[-1])))\n", "# scene = 'c49a8c6cff'\n", "# colmap_path = f\"/your/path/ScanNet++/{scene}/dslr\"\n", "# pcd = trimesh.load(os.path.join(colmap_path, '../scans/mesh_aligned_0.05.ply'))\n", "view_sample_camera = False\n", "cameras, images, points_3D = read_model(path=f\"{colmap_path}/sparse\", ext=\".bin\") # w2c extrinsics\n", "# Convert camera poses.\n", "images = OrderedDict(sorted(images.items()))\n", "qvecs = torch.from_numpy(np.stack([image.qvec for image in images.values()]))\n", "tvecs = torch.from_numpy(np.stack([image.tvec for image in images.values()]))\n", "# Rs = camera.quaternion.q_to_R(qvecs)\n", "Rs = quaternion.q_to_R(qvecs)\n", "poses = torch.cat([Rs, tvecs[..., None]], dim=-1) # [N,3,4] w2c\n", "print(f\"# images: {len(poses)}\")\n", "print(\"camera height: {}\".format(poses[:, 1, 3].mean()))\n", "\n", "# # Get the sparse 3D points and the colors. 
colmap\n", "# xyzs = torch.from_numpy(np.stack([point.xyz for point in points_3D.values()]))\n", "# rgbs = np.stack([point.rgb for point in points_3D.values()])\n", "# rgbs_int32 = (rgbs[:, 0] * 2**16 + rgbs[:, 1] * 2**8 + rgbs[:, 2]).astype(np.uint32)\n", "# print(f\"# points: {len(xyzs)}\")\n", "\n", "\n", "if os.path.exists(os.path.join(colmap_path, f'{scene}_trans.txt')):\n", " trans = load_transformation(os.path.join(colmap_path, f'{scene}_trans.txt'))\n", " pcd.vertices = align_gt_with_cam(pcd.vertices, trans)\n", " \n", "xyzs = pcd.vertices[::500]\n", "# xyzs = pcd.vertices\n", "rgbs = np.random.randint(0, 255, xyzs.shape)\n", "rgbs_int32 = (rgbs[:, 0] * 2**16 + rgbs[:, 1] * 2**8 + rgbs[:, 2]).astype(np.uint32)\n", "print(f\"# points: {len(xyzs)}\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "47862ee1-286c-4877-a181-4b33b7733719", "metadata": {}, "outputs": [], "source": [ "vis_depth = 0.2" ] }, { "cell_type": "code", "execution_count": 5, "id": "b6cf60ec-fe6a-43ba-9aaf-e3c7afd88208", "metadata": {}, "outputs": [], "source": [ "# Visualize the bounding sphere.\n", "json_fname = f\"{colmap_path}/meta.json\"\n", "with open(json_fname) as file:\n", " meta = json.load(file)\n", "trans = np.array(meta[\"trans\"])\n", "scale = np.array(meta[\"scale\"])\n", "# ------------------------------------------------------------------------------------\n", "# These variables can be adjusted to make the bounding sphere fit the region of interest.\n", "# The adjusted values can then be set in the config as data.readjust.center and data.readjust.scale\n", "readjust_center = np.array([0., 0., 0.])\n", "readjust_scale = np.array([1., 1., 1.]) # * 1.1\n", "# save adjusted values\n", "readjust = {\n", " 'scale': readjust_scale.tolist(),\n", " 'trans': readjust_center.tolist()\n", "}\n", "redjust_fname = f'{colmap_path}/readjust.json'\n", "with open(redjust_fname, \"w\") as outputfile:\n", " json.dump(readjust, outputfile, indent=2)\n", "# 
------------------------------------------------------------------------------------\n", "if trans.ndim == 1:\n", " trans += readjust_center\n", "scale *= readjust_scale\n", "# Make some points to hallucinate a bounding sphere.\n", "# sphere_points = np.random.randn(100000, 3)\n", "sphere_points = np.random.rand(100000, 3) * 2 - 1\n", "# sphere_points = sphere_points / np.linalg.norm(sphere_points, axis=-1, keepdims=True) # Unit sphere\n", "# sphere_points[:, 0] = -1 # up\n", "for i in range(3): sphere_points[i::3, i] = sphere_points[i::3, i] / np.abs(sphere_points[i::3, i]) # Unit cube\n", "sphere_points = np.concatenate([sphere_points, np.zeros([1, 3])], axis=0) # center point\n", "# sphere_points[-1, 0] = 5\n", "\n", "sphere_points = inv_normalize_pts(sphere_points, trans, scale)\n", "\n", "# sphere_points[:, 1] = -1.1\n", "\n", "# sample up cameras\n", "if view_sample_camera:\n", " height = poses[:, 1, 3].mean()\n", " # height = -1\n", " # sample_poses = cubic_camera(200, trans, scale)\n", " # sample_poses = around_camera(500, trans, scale, height)\n", " # sample_poses = bb_camera(500, trans, scale, height, up=False, around=True)\n", " sample_poses = bb_camera(200, trans, scale, height=height, up=True, around=True, bidirect=True) # , look_mode='direction'\n", " # sample_poses = up_camera(500, trans, scale)\n", " # sample_poses = grid_camera(trans, scale)\n", "\n", " # sample_poses = torch.from_numpy(poses[:, :3])\n", " sample_poses = sample_poses[:, :3]\n", "\n", " # poses = torch.cat([poses, sample_poses], dim=0)\n", " poses = sample_poses # [::6]\n", " # print(f\"# poses: {len(poses)}\")\n", "\n", " # print(f\"center: {trans[:3, 3:].T}\")\n", " # print(f\"scale: {scale}\")\n", " # print(\"up: {}\".format(trans[1, 3] - scale[1] * 0.5))\n", " # print(f\"max: {sphere_points.max(0)}\")\n", " # print(f\"min: {sphere_points.min(0)}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "e986aed0-1aaf-4772-937c-136db7f2eaec", "metadata": {}, "outputs": [], 
"source": [ "# You can choose to visualize with Plotly...\n", "x, y, z = *xyzs.T,\n", "colors = rgbs / 255.0\n", "sphere_x, sphere_y, sphere_z = *sphere_points.T,\n", "sphere_colors = [\"#4488ff\"] * len(sphere_points)\n", "sphere_size = [0.5] * len(sphere_points)\n", "sphere_colors[-1] = \"#ff0000\" # #ff4444 center point\n", "# sphere_size[-1] = 5\n", "# traces_poses = visualize.plotly_visualize_pose(poses, vis_depth=vis_depth, xyz_length=0.02, center_size=0.01, xyz_width=0.005, mesh_opacity=0.05)\n", "traces_poses = plotly_visualize_pose(poses, vis_depth=vis_depth, xyz_length=0.02, center_size=0.01, xyz_width=0.005, mesh_opacity=0.05)\n", "trace_points = go.Scatter3d(x=x, y=y, z=z, mode=\"markers\", marker=dict(size=0.4, color=colors, opacity=0.7), hoverinfo=\"skip\")\n", "trace_sphere = go.Scatter3d(x=sphere_x, y=sphere_y, z=sphere_z, mode=\"markers\", marker=dict(size=sphere_size, color=sphere_colors, opacity=0.7), hoverinfo=\"skip\")\n", "traces_all = traces_poses + [trace_points, trace_sphere]\n", "layout = go.Layout(scene=dict(xaxis=dict(showspikes=False, backgroundcolor=\"rgba(0,0,0,0)\", gridcolor=\"rgba(0,0,0,0.1)\"),\n", " yaxis=dict(showspikes=False, backgroundcolor=\"rgba(0,0,0,0)\", gridcolor=\"rgba(0,0,0,0.1)\"),\n", " zaxis=dict(showspikes=False, backgroundcolor=\"rgba(0,0,0,0)\", gridcolor=\"rgba(0,0,0,0.1)\"),\n", " xaxis_title=\"X\", yaxis_title=\"Y\", zaxis_title=\"Z\", dragmode=\"orbit\",\n", " aspectratio=dict(x=1, y=1, z=1), aspectmode=\"data\"), height=800)\n", "fig = go.Figure(data=traces_all, layout=layout)\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "fdde170b-4546-4617-9162-a9fcb936347d", "metadata": {}, "outputs": [], "source": [ "# ... 
or visualize with K3D.\n", "plot = k3d.plot(name=\"poses\", height=800, camera_rotate_speed=5.0, camera_zoom_speed=3.0, camera_pan_speed=1.0)\n", "# k3d_objects = visualize.k3d_visualize_pose(poses, vis_depth=vis_depth, xyz_length=0.02, center_size=0.01, xyz_width=0.005, mesh_opacity=0.05)\n", "k3d_objects = k3d_visualize_pose(poses, vis_depth=vis_depth, xyz_length=0.02, center_size=0.01, xyz_width=0.005, mesh_opacity=0.05)\n", "for k3d_object in k3d_objects:\n", " plot += k3d_object\n", "plot += k3d.points(xyzs, colors=rgbs_int32, point_size=0.02, shader=\"flat\")\n", "plot += k3d.points(sphere_points, color=0x4488ff, point_size=0.01, shader=\"flat\")\n", "plot.display()\n", "plot.camera_fov = 30.0" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: process_data/visualize_transforms.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "8b8d7b17-af50-42cd-b531-ef61c49c9e61", "metadata": {}, "outputs": [], "source": [ "# Set the work directory to the imaginaire root.\n", "import os, sys, time\n", "import pathlib\n", "root_dir = pathlib.Path().absolute().parents[2]\n", "os.chdir(root_dir)\n", "print(f\"Root Directory Path: {root_dir}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "2b5b9e2f-841c-4815-92e0-0c76ed46da62", "metadata": {}, "outputs": [], "source": [ "# Import Python libraries.\n", "import numpy as np\n", "import torch\n", "import k3d\n", "import json\n", "from collections import OrderedDict\n", "# Import imaginaire modules.\n", "from projects.nerf.utils import camera, visualize\n", 
"from third_party.colmap.scripts.python.read_write_model import read_model" ] }, { "cell_type": "code", "execution_count": null, "id": "97bedecf-da68-44b1-96cf-580ef7e7f3f0", "metadata": {}, "outputs": [], "source": [ "# Read the COLMAP data.\n", "colmap_path = \"datasets/lego_ds2\"\n", "json_fname = f\"{colmap_path}/transforms.json\"\n", "with open(json_fname) as file:\n", " meta = json.load(file)\n", "center = meta[\"sphere_center\"]\n", "radius = meta[\"sphere_radius\"]\n", "# Convert camera poses.\n", "poses = []\n", "for frame in meta[\"frames\"]:\n", " c2w = torch.tensor(frame[\"transform_matrix\"])\n", " c2w[:, 1:3] *= -1\n", " w2c = c2w.inverse()\n", " pose = w2c[:3] # [3,4]\n", " poses.append(pose)\n", "poses = torch.stack(poses, dim=0)\n", "print(f\"# images: {len(poses)}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "2016d20c-1e58-407f-9810-cbe76dc5ccec", "metadata": {}, "outputs": [], "source": [ "vis_depth = 0.2\n", "k3d_textures = []" ] }, { "cell_type": "code", "execution_count": null, "id": "d7168a09-6654-4660-b140-66b9dfd6f1e8", "metadata": {}, "outputs": [], "source": [ "# (optional) visualize the images.\n", "# This block can be skipped if we don't want to visualize the image observations.\n", "for i, frame in enumerate(meta[\"frames\"]):\n", " image_fname = frame[\"file_path\"]\n", " image_path = f\"{colmap_path}/{image_fname}\"\n", " with open(image_path, \"rb\") as file:\n", " binary = file.read()\n", " # Compute the corresponding image corners in 3D.\n", " pose = poses[i]\n", " corners = torch.tensor([[-0.5, 0.5, 1], [0.5, 0.5, 1], [-0.5, -0.5, 1]])\n", " corners *= vis_depth\n", " corners = camera.cam2world(corners, pose)\n", " puv = [corners[0].tolist(), (corners[1]-corners[0]).tolist(), (corners[2]-corners[0]).tolist()]\n", " k3d_texture = k3d.texture(binary, file_format=\"jpg\", puv=puv)\n", " k3d_textures.append(k3d_texture)" ] }, { "cell_type": "code", "execution_count": null, "id": 
"b6cf60ec-fe6a-43ba-9aaf-e3c7afd88208", "metadata": {}, "outputs": [], "source": [ "# Visualize the bounding sphere.\n", "json_fname = f\"{colmap_path}/transforms.json\"\n", "with open(json_fname) as file:\n", " meta = json.load(file)\n", "center = meta[\"sphere_center\"]\n", "radius = meta[\"sphere_radius\"]\n", "# ------------------------------------------------------------------------------------\n", "# These variables can be adjusted to make the bounding sphere fit the region of interest.\n", "# The adjusted values can then be set in the config as data.readjust.center and data.readjust.scale\n", "readjust_center = np.array([0., 0., 0.])\n", "readjust_scale = 1.\n", "# ------------------------------------------------------------------------------------\n", "center += readjust_center\n", "radius *= readjust_scale\n", "# Make some points to hallucinate a bounding sphere.\n", "sphere_points = np.random.randn(100000, 3)\n", "sphere_points = sphere_points / np.linalg.norm(sphere_points, axis=-1, keepdims=True)\n", "sphere_points = sphere_points * radius + center" ] }, { "cell_type": "code", "execution_count": null, "id": "fdde170b-4546-4617-9162-a9fcb936347d", "metadata": {}, "outputs": [], "source": [ "# Visualize with K3D.\n", "plot = k3d.plot(name=\"poses\", height=800, camera_rotate_speed=5.0, camera_zoom_speed=3.0, camera_pan_speed=1.0)\n", "k3d_objects = visualize.k3d_visualize_pose(poses, vis_depth=vis_depth, xyz_length=0.02, center_size=0.01, xyz_width=0.005, mesh_opacity=0.)\n", "for k3d_object in k3d_objects:\n", " plot += k3d_object\n", "for k3d_texture in k3d_textures:\n", " plot += k3d_texture\n", "plot += k3d.points(sphere_points, color=0x4488ff, point_size=0.01, shader=\"flat\")\n", "plot.display()\n", "plot.camera_fov = 30.0" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", 
"mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: pyproject.toml ================================================ [tool.black] line-length = 240 [build-system] requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" [project] name = "vcr-gaus" version = "0.0.0.dev0" description = "VCR-GauS: View Consistent Depth-Normal Regularizer for Gaussian Surface Reconstruction" readme = "README.md" requires-python = ">=3.8" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: Apache Software License", ] [project.optional-dependencies] f1eval = [ "open3d==0.10.0", "numpy" ] train = [ "torch==2.0.1", "torchvision==0.15.2", "torchaudio==2.0.2", "numpy==1.26.1", "open3d", "plyfile", "ninja", "GPUtil", "opencv-python", "lpips", "trimesh", "pymeshlab", "termcolor", "wandb", "imageio", "scikit-image", "torchmetrics", "mediapy", ] [project.urls] "Homepage" = "https://hlinchen.github.io/projects/VCR-GauS/" "Bug Tracker" = "https://github.com/HLinChen/VCR-GauS/issues" [tool.setuptools.packages.find] include = ["vcr*", "trl*"] exclude = [ "assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*", "checkpoints*", "project_checkpoints*", "debug_checkpoints*", "mlx_configs*", "wandb*", "notebooks*", ] [tool.wheel] exclude = [ "assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*", "checkpoints*", "project_checkpoints*", "debug_checkpoints*", "mlx_configs*", "wandb*", "notebooks*", ] ================================================ FILE: python_scripts/run_base.py ================================================ import os import time import GPUtil def worker(gpu, scene, factor, fn): print(f"Starting job on GPU {gpu} with scene {scene}\n") fn(gpu, scene, factor) print(f"Finished job on GPU {gpu} with scene {scene}\n") # This worker 
def dispatch_jobs(jobs, executor, excluded_gpus, fn):
    """Run every job on an idle GPU, with at most one job per GPU at a time.

    The loop repeatedly asks GPUtil for idle GPUs, submits
    ``worker(gpu, *job, fn)`` to ``executor`` for each free GPU while jobs
    remain, and frees a GPU again once its future has completed. It returns
    after every submitted future is done.

    Args:
        jobs: Iterable of argument tuples. Each tuple is unpacked into
            ``worker`` between the GPU id and ``fn``. The iterable is copied
            into a queue, so the caller's container is not consumed.
        executor: Object whose ``submit`` returns futures, e.g. a
            ``concurrent.futures.ThreadPoolExecutor``.
        excluded_gpus: GPU ids that must never be handed out.
        fn: Callable forwarded to ``worker`` as its last argument.
    """
    from collections import deque

    pending = deque(jobs)  # popleft() is O(1), unlike list.pop(0)
    excluded = set(excluded_gpus)
    running = {}  # future -> (gpu, job)
    reserved_gpus = set()  # GPUs that are slated for work but may not be active yet

    while pending or running:
        # GPUs GPUtil currently reports as available (thresholds 0.1 for load
        # and memory), minus the ones we already handed out or must not use.
        idle_gpus = set(GPUtil.getAvailable(order="first", limit=10, maxMemory=0.1, maxLoad=0.1))
        available_gpus = sorted(idle_gpus - reserved_gpus - excluded)

        # Launch new jobs on available GPUs.
        for gpu in available_gpus:
            if not pending:
                break
            job = pending.popleft()
            future = executor.submit(worker, gpu, *job, fn)
            running[future] = (gpu, job)
            # A freshly started job may not show up in the GPU load yet, so
            # keep the GPU reserved until its future completes.
            reserved_gpus.add(gpu)

        # Collect completed jobs and release the GPUs they were using.
        for future in [f for f in running if f.done()]:
            gpu, job = running.pop(future)
            reserved_gpus.discard(gpu)
            # Surface failures instead of dropping them: an exception raised
            # inside the worker is only visible through the future.
            error = None if future.cancelled() else future.exception()
            if error is not None:
                print(f"Job {job} on GPU {gpu} failed with {error!r}, releasing GPU {gpu}")
            else:
                print(f"Job {job} has finished, releasing GPU {gpu}")

        # Throttle the polling loop, but do not wait once everything is done.
        if pending or running:
            time.sleep(5)

    print("All jobs have been processed.")
CUDA_VISIBLE_DEVICES={gpu} \ python evaluation/replica_eval/evaluate_single_scene.py \ --input_mesh {tri_mesh_path} \ --scene {scene} \ --output_dir {output_dir} \ --data_dir {data_dir}" ================================================ FILE: python_scripts/run_dtu.py ================================================ # training scripts for the TNT datasets import os import sys import time from concurrent.futures import ThreadPoolExecutor sys.path.append(os.getcwd()) from python_scripts.run_base import dispatch_jobs, train_cmd, extract_mesh_cmd, eval_cd_cmd, check_finish from python_scripts.show_dtu import show_matrix TRIAL_NAME = 'vcr_gaus' PROJECT = 'vcr_gaus' PROJECT_wandb = 'vcr_gaus_dtu' DATASET = 'dtu' base_dir = "/your/path" output_dir = f"{base_dir}/output/{PROJECT}/{DATASET}" data_dir = f"{base_dir}/data/DTU_mask" do_train = False do_extract_mesh = False do_cd = True dry_run = False node = 0 max_workers = 15 be = node*max_workers excluded_gpus = set([]) total_list = [ 'scan24', 'scan37', 'scan40', 'scan55', 'scan63', 'scan65', 'scan69', 'scan83', 'scan97', 'scan105', 'scan106', 'scan110', 'scan114', 'scan118', 'scan122' ] training_list = [ 'scan24', 'scan37', 'scan40', 'scan55', 'scan63', 'scan65', 'scan69', 'scan83', 'scan97', 'scan105', 'scan106', 'scan110', 'scan114', 'scan118', 'scan122' ] training_list = training_list[be: be + max_workers] scenes = training_list factors = [-1] * len(scenes) debug_from = -1 eval_env = 'pt' data_device = 'cuda' voxel_size = 0.004 step = 1 PLY = f"ours.ply" TOTAL_THREADS = 64 NUM_THREADS = TOTAL_THREADS // max_workers prob_thr = 0.15 num_cluster = 1 max_depth = 3 fuse_method = 'tsdf_cpu' jobs = list(zip(scenes, factors)) def train_scene(gpu, scene, factor): time.sleep(2*gpu) os.system('ulimit -n 9000') log_dir = f"{output_dir}/{scene}/{TRIAL_NAME}" fail = 0 if not dry_run: if do_train: cmd = train_cmd.format(gpu=gpu, dataset=DATASET, cfg='base', scene=scene, log_dir=log_dir, data_dir=data_dir, debug_from=debug_from, 
data_device=data_device, resolution=factor, project=PROJECT_wandb) print(cmd) fail = os.system(cmd) if fail == 0: if not dry_run: # fusion if do_extract_mesh: if not check_finish(scene, f"{log_dir}/point_cloud", 'train'): return False cmd = extract_mesh_cmd.format(gpu=gpu, ply=PLY, step=step, fuse_method=fuse_method, voxel_size=voxel_size, num_cluster=num_cluster, max_depth=max_depth, log_dir=log_dir, prob_thr=prob_thr) fail = os.system(cmd) print(cmd) # evaluation # evaluate the mesh scan_id = scene[4:] cmd = eval_cd_cmd.format(num_threads=NUM_THREADS, gpu=gpu, tri_mesh_path=f'{log_dir}/{PLY}', scan_id=scan_id, output_dir=log_dir, data_dir=data_dir) if fail == 0: if not dry_run: if do_cd: if not check_finish(scene, f"{log_dir}/{PLY}", 'mesh'): return False print(cmd) fail = os.system(cmd) if not check_finish(scene, f"{log_dir}/results.json", 'cd'): return False return fail == 0 # Using ThreadPoolExecutor to manage the thread pool with ThreadPoolExecutor(max_workers) as executor: dispatch_jobs(jobs, executor, excluded_gpus, train_scene) show_matrix(total_list, [output_dir], TRIAL_NAME) print(TRIAL_NAME, " done") ================================================ FILE: python_scripts/run_mipnerf360.py ================================================ # Training script for the Mip-NeRF 360 dataset import os import sys import time from concurrent.futures import ThreadPoolExecutor sys.path.append(os.getcwd()) from python_scripts.run_base import dispatch_jobs, train_cmd, extract_mesh_cmd, check_finish, render_cmd, eval_psnr_cmd from python_scripts.show_360 import show_matrix TRIAL_NAME = 'vcr_gaus' PROJECT = 'vcr_gaus' PROJECT_wandb = 'vcr_gaus_360' do_train = True do_render = True do_eval = True do_extract_mesh = True dry_run = False node = 0 max_workers = 9 be = node*max_workers excluded_gpus = set([]) total_list = [ "bicycle", "bonsai", "counter", "flowers", "garden", "stump", "treehill", "kitchen", "room" ] training_list = [ "bicycle", "bonsai", "counter", "flowers", 
"garden", "stump", "treehill", "kitchen", "room" ] training_list = training_list[be: be + max_workers] scenes = training_list factors = [-1] * len(scenes) debug_from = -1 DATASET = '360_v2' eval_env = 'pt' data_device = 'cpu' step = 1 max_depth = 6.0 voxel_size = 8e-3 PLY = f"fused_mesh_split{step}.ply" TOTAL_THREADS = 64 NUM_THREADS = TOTAL_THREADS // max_workers prob_thr = 0.15 num_cluster = 1000 fuse_method = 'tsdf' base_dir = "/your/path" output_dir = f"{base_dir}/output/{PROJECT}/{DATASET}" data_dir = f"{base_dir}/data/{DATASET}" jobs = list(zip(scenes, factors)) def train_scene(gpu, scene, factor): time.sleep(2*gpu) os.system('ulimit -n 9000') log_dir = f"{output_dir}/{scene}/{TRIAL_NAME}" fail = 0 if not dry_run: if do_train: cmd = train_cmd.format(gpu=gpu, dataset=DATASET, cfg='base', scene=scene, log_dir=log_dir, data_dir=data_dir, debug_from=debug_from, data_device=data_device, resolution=factor, project=PROJECT_wandb) print(cmd) fail = os.system(cmd) if fail == 0: if not dry_run: # render cmd = render_cmd.format(gpu=gpu, log_dir=log_dir) if fail == 0: if not dry_run: if do_render: print(cmd) fail = os.system(cmd) if not check_finish(scene, f"{log_dir}/test/ours_30000/renders", 'render'): return False # eval cmd = eval_psnr_cmd.format(gpu=gpu, log_dir=log_dir) if fail == 0: if not dry_run: if do_eval: print(cmd) fail = os.system(cmd) if not check_finish(scene, f"{log_dir}/results.json", 'eval'): return False # fusion if do_extract_mesh: if not check_finish(scene, f"{log_dir}/point_cloud", 'train'): return False cmd = extract_mesh_cmd.format(gpu=gpu, ply=PLY, step=step, fuse_method=fuse_method, voxel_size=voxel_size, num_cluster=num_cluster, max_depth=max_depth, log_dir=log_dir, prob_thr=prob_thr) fail = os.system(cmd) print(cmd) return fail == 0 # Using ThreadPoolExecutor to manage the thread pool with ThreadPoolExecutor(max_workers) as executor: dispatch_jobs(jobs, executor, excluded_gpus, train_scene) show_matrix(total_list, [output_dir], TRIAL_NAME) 
print(TRIAL_NAME, " done") ================================================ FILE: python_scripts/run_tnt.py ================================================ # training scripts for the TNT datasets import os import sys import time from concurrent.futures import ThreadPoolExecutor sys.path.append(os.getcwd()) from python_scripts.run_base import dispatch_jobs, train_cmd, extract_mesh_cmd, eval_tnt_cmd, check_finish from python_scripts.show_tnt import show_matrix TRIAL_NAME = 'vcr_gaus' PROJECT = 'vcr_gaus' DATASET = 'tnt' base_dir = "/your/path" output_dir = f"{base_dir}/output/{PROJECT}/{DATASET}" data_dir = f"{base_dir}/data/{DATASET}" do_train = True do_extract_mesh = True do_f1 = True dry_run = False node = 0 max_workers = 4 be = node*max_workers excluded_gpus = set([]) total_list = [ 'Barn', 'Caterpillar', 'Courthouse', 'Ignatius', 'Meetingroom', 'Truck' ] training_list = [ 'Barn', 'Caterpillar', 'Courthouse', 'Ignatius', 'Meetingroom', 'Truck' ] training_list = training_list[be: be + max_workers] scenes = training_list factors = [1] * len(scenes) debug_from = -1 # enable wandb eval_env = 'f1eval' data_device = 'cpu' step = 3 voxel_size = [0.02, 0.015, 0.01] + [x / 1000.0 for x in range(2, 10, 1)][::-1] voxel_size = sorted(voxel_size) PLY = f"ours.ply" TOTAL_THREADS = 128 NUM_THREADS = TOTAL_THREADS // max_workers prob_thr = 0.3 num_cluster = 1000 fuse_method = 'tsdf' max_depth = 8 jobs = list(zip(scenes, factors)) def train_scene(gpu, scene, factor): time.sleep(2*gpu) os.system('ulimit -n 9000') log_dir = f"{output_dir}/{scene}/{TRIAL_NAME}" fail = 0 if not dry_run: if do_train: cmd = train_cmd.format(gpu=gpu, dataset=DATASET, cfg=scene, scene=scene, log_dir=log_dir, data_dir=data_dir, debug_from=debug_from, data_device=data_device, resolution=factor, project=PROJECT) print(cmd) fail = os.system(cmd) if fail == 0: if not dry_run: # fusion if do_extract_mesh: if not check_finish(scene, f"{log_dir}/point_cloud", 'train'): return False for vs in voxel_size: cmd = 
def _summarize_metrics(metrics):
    """Return the ``"PSNR: .. SSIM: .. LPIPS: .. "`` line holding the mean of each metric."""
    parts = []
    for key in ("PSNR", "SSIM", "LPIPS"):
        values = metrics[key]
        # An empty list would make NumPy emit a RuntimeWarning; the printed
        # value is "nan" either way.
        mean = float(np.mean(values)) if values else float("nan")
        text = f"{mean:.2f}" if key == "PSNR" else f"{mean:.3f}"
        parts.extend([key + ': ', text + ' '])
    return "".join(parts)


def _print_scene_rows(metrics):
    """Print one ``PSNR/SSIM/LPIPS/scene`` row per collected result."""
    rows = zip(metrics['PSNR'], metrics['SSIM'], metrics['LPIPS'], metrics['scene'])
    for psnr, ssim, lpips, scene in rows:
        print('PSNR: {:.3f}, SSIM: {:.3f}, LPIPS: {:.3f}, scene: {}'.format(psnr, ssim, lpips, scene))


def show_matrix(scenes, output_dirs, TRIAL_NAME):
    """Collect rendering metrics per scene and print outdoor/indoor summaries.

    For every ``scene`` and every directory in ``output_dirs`` the file
    ``{output}/{scene}/{TRIAL_NAME}/results.json`` is read and its
    ``"ours_30000"`` entry supplies ``PSNR``, ``SSIM`` and ``LPIPS``. Scenes
    listed in the module-level ``indoor_scenes`` go to ``indoor_metrics``;
    all others go to ``outdoor_metrics``. Every result is also appended to
    ``all_metrics``.

    A scene without a results file is reported and skipped, as the DTU and
    TnT reporters in this repository do, instead of aborting the report.

    NOTE: results accumulate in the module-level metric dictionaries, so
    calling this function twice in one process repeats earlier entries.

    Args:
        scenes: Scene names to report.
        output_dirs: Root directories that contain one folder per scene.
        TRIAL_NAME: Name of the experiment folder inside each scene folder.
    """
    for scene in scenes:
        group_metrics = indoor_metrics if scene in indoor_scenes else outdoor_metrics
        for output in output_dirs:
            json_file = f"{output}/{scene}/{TRIAL_NAME}/results.json"
            try:
                # The context manager closes the handle that a bare
                # json.load(open(...)) would leave open.
                with open(json_file) as f:
                    data = json.load(f)
            except FileNotFoundError:
                print(f"Scene \033[1;31m{scene}\033[0m was not evaluated.")
                continue
            data = data['ours_30000']
            for k in ("PSNR", "SSIM", "LPIPS"):
                all_metrics[k].append(data[k])
                group_metrics[k].append(data[k])
            all_metrics['scene'].append(scene)
            group_metrics['scene'].append(scene)

    print('Outdoor scenes')
    _print_scene_rows(outdoor_metrics)
    print('Indoor scenes')
    _print_scene_rows(indoor_metrics)
    print('Outdoor:')
    print(_summarize_metrics(outdoor_metrics))
    print('Indoor:')
    print(_summarize_metrics(indoor_metrics))
def show_matrix(scenes, output_dirs, TRIAL_NAME):
    """Print the DTU evaluation metrics of each scene followed by their means.

    For every ``scene`` and every directory in ``output_dirs`` the file
    ``{output}/{scene}/{TRIAL_NAME}/results.json`` is read and its
    ``mean_d2s``, ``mean_s2d`` and ``overall`` entries are collected. A scene
    without a results file is reported and skipped.

    Output: one row per collected result (three decimals), then one line
    with the mean of each metric (two decimals). If nothing was collected
    the means are printed as ``nan``.

    Args:
        scenes: Scene folder names, e.g. ``"scan24"``.
        output_dirs: Root directories that contain one folder per scene.
        TRIAL_NAME: Name of the experiment folder inside each scene folder.
    """
    metric_keys = ("mean_d2s", "mean_s2d", "overall")
    all_metrics = {"mean_d2s": [], "mean_s2d": [], "overall": [], 'scene': []}

    for scene in scenes:
        for output in output_dirs:
            json_file = f"{output}/{scene}/{TRIAL_NAME}/results.json"
            if not os.path.exists(json_file):
                print(f"Scene \033[1;31m{scene}\033[0m was not evaluated.")
                continue
            # The context manager closes the handle that a bare
            # json.load(open(...)) would leave open.
            with open(json_file) as f:
                data = json.load(f)
            for k in metric_keys:
                all_metrics[k].append(data[k])
            all_metrics['scene'].append(scene)

    latex = []
    for k in metric_keys:
        values = all_metrics[k]
        # Averaging an empty list makes NumPy emit a RuntimeWarning; the
        # printed value is "nan" either way.
        mean = float(np.mean(values)) if values else float("nan")
        latex.extend([k + ': ', f"{mean:.2f}" + ' '])

    rows = zip(all_metrics['mean_d2s'], all_metrics['mean_s2d'], all_metrics['overall'], all_metrics['scene'])
    for d2s, s2d, overall, scene in rows:
        print('d2s: {:.3f}, s2d: {:.3f}, overall: {:.3f}, scene: {}'.format(d2s, s2d, overall, scene))
    print("".join(latex))
"/your/log/path/" TRIAL_NAME = 'vcr_gaus' PROJECT = 'sq_gs' output_dirs = [f"{base_dir}/{PROJECT}/{DATASET}"] def show_matrix(scenes, output_dirs, TRIAL_NAME): all_metrics = {"precision": [], "recall": [], "f-score": [], 'scene': []} for scene in scenes: for output in output_dirs: # precision eval_file = os.path.join(output, scene, f"{TRIAL_NAME}/evaluation/evaluation.txt") if not os.path.exists(eval_file): print(f"Scene \033[1;31m{scene}\033[0m was not evaluated.") continue with open(eval_file, 'r') as f: matrix = f.readlines() precision = float(matrix[2].split(" ")[-1]) recall = float(matrix[3].split(" ")[-1]) f_score = float(matrix[4].split(" ")[-1]) all_metrics["precision"].append(precision) all_metrics["recall"].append(recall) all_metrics["f-score"].append(f_score) all_metrics['scene'].append(scene) latex = [] for k in ["precision","recall", "f-score"]: numbers = all_metrics[k] mean = np.mean(numbers) numbers = numbers + [mean] numbers = [f"{x:.3f}" for x in numbers] latex.extend([k+': ', numbers[-1]+' ']) for i in range(len(all_metrics['scene'])): print('precision: {:.3f}, recall: {:.3f}, f-score: {:.3f}, scene: {}'.format(all_metrics['precision'][i], all_metrics['recall'][i], all_metrics['f-score'][i], all_metrics['scene'][i])) print("".join(latex)) return if __name__ == "__main__": show_matrix(scenes, output_dirs, TRIAL_NAME) ================================================ FILE: requirements.txt ================================================ submodules/diff-gaussian-rasterization submodules/simple-knn/ git+https://github.com/facebookresearch/pytorch3d.git@stable ================================================ FILE: scene/__init__.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. 
class Scene:
    """Bundle the cameras of a dataset with the Gaussian model trained on it.

    Construction loads the dataset description, builds train/test camera
    lists for each requested resolution scale, and either restores the
    Gaussians from a saved iteration or initialises them from the dataset's
    point cloud.
    """

    gaussians : GaussianModel

    def __init__(self, args : ModelParams, gaussians : GaussianModel, load_iteration=None, shuffle=True, resolution_scales=[1.0]):
        """Load the scene described by ``args`` and initialise ``gaussians``.

        :param args: Model parameters; this method reads ``model_path``,
            ``source_path``, ``split``, ``load_depth``, ``load_normal``,
            ``load_mask``, ``depth_type`` and the dataset-loader options
            passed to the reader callbacks below.
        :param gaussians: Gaussian model to load or create.
        :param load_iteration: Iteration to restore; ``-1`` selects the
            highest iteration found under ``<model_path>/point_cloud``.
            A falsy value starts from the dataset point cloud.
        :param shuffle: Whether to shuffle the training cameras.
        :param resolution_scales: Scales for which camera lists are built.
            The list is only iterated here, never modified.
        """
        self.model_path = args.model_path
        self.loaded_iter = None
        self.gaussians = gaussians
        self.split = args.split
        load_depth = args.load_depth
        load_normal = args.load_normal
        load_mask = args.load_mask

        if load_iteration:
            if load_iteration == -1:
                self.loaded_iter = searchForMaxIteration(os.path.join(self.model_path, "point_cloud"))
            else:
                self.loaded_iter = load_iteration
            print("Loading trained model at iteration {}".format(self.loaded_iter))

        self.train_cameras = {}
        self.test_cameras = {}

        # The dataset type is detected from the files present in source_path.
        if os.path.exists(os.path.join(args.source_path, "sparse")):
            scene_info = sceneLoadTypeCallbacks["Colmap"](args.source_path, args.images, args.eval, args.llffhold, args.ratio, split=self.split, load_depth=load_depth, load_normal=load_normal, load_mask=load_mask, normal_folder=args.normal_folder, depth_folder=args.depth_folder)
        elif os.path.exists(os.path.join(args.source_path, "transforms_train.json")):
            print("Found transforms_train.json file, assuming Blender data set!")
            scene_info = sceneLoadTypeCallbacks["Blender"](args.source_path, args.white_background, args.eval)
        else:
            assert False, "Could not recognize scene type!"

        self.trans = scene_info.trans
        self.scale = scene_info.scale

        if not self.loaded_iter:
            # Fresh run: keep a copy of the input point cloud and a JSON dump
            # of all cameras inside the model directory.
            with open(scene_info.ply_path, 'rb') as src_file, open(os.path.join(self.model_path, "input.ply") , 'wb') as dest_file:
                dest_file.write(src_file.read())
            json_cams = []
            camlist = []
            if scene_info.test_cameras:
                camlist.extend(scene_info.test_cameras)
            if scene_info.train_cameras:
                camlist.extend(scene_info.train_cameras)
            for id, cam in enumerate(camlist):
                json_cams.append(camera_to_JSON(id, cam))
            with open(os.path.join(self.model_path, "cameras.json"), 'w') as file:
                json.dump(json_cams, file)

        if shuffle:
            random.shuffle(scene_info.train_cameras)  # Multi-res consistent random shuffling
            # random.shuffle(scene_info.test_cameras)  # Multi-res consistent random shuffling

        self.cameras_extent = scene_info.nerf_normalization["radius"]
        gaussians.extent = self.cameras_extent

        for resolution_scale in resolution_scales:
            print("Loading Training Cameras")
            self.train_cameras[resolution_scale] = cameraList_from_camInfos(scene_info.train_cameras, resolution_scale, args)
            print("Loading Test Cameras")
            self.test_cameras[resolution_scale] = cameraList_from_camInfos(scene_info.test_cameras, resolution_scale, args)
            # Number the cameras consecutively, train cameras first.
            for idx, camera in enumerate(self.train_cameras[resolution_scale] + self.test_cameras[resolution_scale]):
                camera.idx = idx

        if self.loaded_iter:
            self.gaussians.load_ply(os.path.join(self.model_path,
                                                 "point_cloud",
                                                 "iteration_" + str(self.loaded_iter),
                                                 "point_cloud.ply"))
        else:
            self.gaussians.create_from_pcd(scene_info.point_cloud, self.cameras_extent)

        # NOTE(review): for any other depth_type value self.dirs is never
        # assigned - confirm that callers only use these two values.
        if args.depth_type == "traditional":
            self.dirs = None
        elif args.depth_type == "intersection":
            # Computed from the first training camera's intrinsics and image
            # size, then moved to the GPU.
            self.dirs = get_all_px_dir(self.getTrainCameras()[0].intr, self.getTrainCameras()[0].image_height, self.getTrainCameras()[0].image_width).cuda()
        self.first_name = scene_info.first_name

    def save(self, iteration, visi=None, surf=None, save_splat=False):
        """Write the Gaussians to ``<model_path>/point_cloud/iteration_<iteration>``.

        ``point_cloud.ply`` and ``point_cloud_inside.ply`` are always
        written. ``visi.ply`` / ``surf.ply`` are written when ``visi`` /
        ``surf`` are given, and ``pcd.splat`` when ``save_splat`` is true.
        """
        point_cloud_path = os.path.join(self.model_path, "point_cloud/iteration_{}".format(iteration))
        self.gaussians.save_ply(os.path.join(point_cloud_path, "point_cloud.ply"))
        self.gaussians.save_inside_ply(os.path.join(point_cloud_path, "point_cloud_inside.ply"))
        if visi is not None:
            self.gaussians.save_visi_ply(os.path.join(point_cloud_path, "visi.ply"), visi)
        if surf is not None:
            # surf.ply is written by the same routine as visi.ply.
            self.gaussians.save_visi_ply(os.path.join(point_cloud_path, "surf.ply"), surf)
        if save_splat:
            self.gaussians.save_splat(os.path.join(point_cloud_path, "pcd.splat"))

    def getTrainCameras(self, scale=1.0):
        """Return the training cameras built for ``scale``."""
        return self.train_cameras[scale]

    def getTestCameras(self, scale=1.0):
        """Return the test cameras built for ``scale``."""
        return self.test_cameras[scale]

    def getFullCameras(self, scale=1.0):
        """Return train plus test cameras when ``self.split`` is truthy, else only train cameras."""
        if self.split:
            return self.train_cameras[scale] + self.test_cameras[scale]
        else:
            return self.train_cameras[scale]

    def getUpCameras(self):
        """Return ``self.random_cameras_up``.

        NOTE(review): this attribute is not assigned anywhere in this class;
        presumably it is set externally before this is called - confirm.
        """
        return self.random_cameras_up

    def getAroundCameras(self):
        """Return ``self.random_cameras_around``.

        NOTE(review): this attribute is not assigned anywhere in this class;
        presumably it is set externally before this is called - confirm.
        """
        return self.random_cameras_around

    def getRandCameras(self, n, up=False, around=True, sample_mode='uniform'):
        """Pick ``n`` cameras from the "up" and/or "around" camera sets.

        When both sets are requested, ``n`` is halved and that many cameras
        are taken from each. "Up" cameras are always chosen by random
        permutation. "Around" cameras are chosen by random permutation
        (``sample_mode='random'``) or at a fixed stride of
        ``len(around_cameras) // n`` (``sample_mode='uniform'``).

        NOTE(review): the containers are indexed with an index tensor, which
        a plain Python list accepts only for a single element - confirm the
        container type used for these camera sets.
        """
        if up and around:
            n = n // 2

        cameras = []
        if up:
            up_cameras = self.getUpCameras().copy()
            idx = torch.randperm(len(up_cameras))[: n]
            if n == 1:
                cameras.append(up_cameras[idx])
            else:
                cameras.extend(up_cameras[idx])
        if around:
            around_cameras = self.getAroundCameras()
            if sample_mode == 'random':
                idx = torch.randperm(len(around_cameras))[: n]
            elif sample_mode == 'uniform':
                idx = torch.arange(len(around_cameras))[::len(around_cameras)//n]
            else:
                assert False, f"Unknown sample_mode: {sample_mode}"
            if n == 1:
                cameras.append(around_cameras[idx])
            else:
                cameras.extend(around_cameras[idx])
        return cameras
class AppearanceNetwork(nn.Module):
    """Convolutional decoder mapping a feature map to a per-pixel map in [0, 1].

    The input passes through a 3x3 convolution, four ``UpsampleBlock``
    stages, a 2x bilinear resize, and two more 3x3 convolutions ending in a
    sigmoid. Each ``UpsampleBlock`` applies ``nn.PixelShuffle(2)``, so the
    output is 32 times larger than the input along each spatial axis.
    """

    def __init__(self, num_input_channels, num_output_channels):
        """Create the layers.

        Args:
            num_input_channels: Channel count of the input feature map.
            num_output_channels: Channel count of the produced map.
        """
        super().__init__()

        # Stem: lift the input to 256 channels.
        self.conv1 = nn.Conv2d(num_input_channels, 256, 3, stride=1, padding=1)

        # Four upsampling stages, halving the channel count each time.
        self.up1 = UpsampleBlock(256, 128)
        self.up2 = UpsampleBlock(128, 64)
        self.up3 = UpsampleBlock(64, 32)
        self.up4 = UpsampleBlock(32, 16)

        # Head: refine at full resolution and project to the output channels.
        self.conv2 = nn.Conv2d(16, 16, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(16, num_output_channels, 3, stride=1, padding=1)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Decode the feature map ``x`` into an output map with values in [0, 1]."""
        features = self.relu(self.conv1(x))

        for stage in (self.up1, self.up2, self.up3, self.up4):
            features = stage(features)

        # Final 2x enlargement by bilinear interpolation.
        features = F.interpolate(features, scale_factor=2, mode='bilinear', align_corners=True)

        features = self.relu(self.conv2(features))
        return self.sigmoid(self.conv3(features))
# # For inquiries contact george.drettakis@inria.fr # import torch from torch import nn import numpy as np from tools.graphics_utils import getWorld2View2, getProjectionMatrix, getIntrinsic class Camera(nn.Module): def __init__(self, colmap_id, R, T, FoVx, FoVy, image, gt_alpha_mask, image_name, uid, depth=None, normal=None, mask=None, trans=np.array([0.0, 0.0, 0.0]), scale=1.0, data_device = "cuda" ): super(Camera, self).__init__() self.uid = uid self.colmap_id = colmap_id self.R = R self.T = T self.FoVx = FoVx self.FoVy = FoVy self.image_name = image_name try: self.data_device = torch.device(data_device) except Exception as e: print(e) print(f"[Warning] Custom device {data_device} failed, fallback to default cuda device" ) self.data_device = torch.device("cuda") self.original_image = image.clamp(0.0, 1.0).to(self.data_device) self.image_width = self.original_image.shape[2] self.image_height = self.original_image.shape[1] if gt_alpha_mask is not None: self.gt_alpha_mask = gt_alpha_mask if mask is not None: mask = mask.squeeze(-1).cuda() mask[self.gt_alpha_mask[0] == 0] = 0 else: mask = self.gt_alpha_mask.bool().squeeze(0).cuda() else: self.original_image *= torch.ones((1, self.image_height, self.image_width), device=self.data_device) self.gt_alpha_mask = None self.depth = depth.to(data_device) if depth is not None else None self.normal = normal.to(data_device) if normal is not None else None if mask is not None: self.mask = mask.squeeze(-1).cuda() self.zfar = 100.0 self.znear = 0.01 self.trans = trans self.scale = scale self.world_view_transform = torch.tensor(getWorld2View2(R, T, trans, scale)).transpose(0, 1).cuda() # w2c self.projection_matrix = getProjectionMatrix(znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy).transpose(0,1).cuda() self.full_proj_transform = (self.world_view_transform.unsqueeze(0).bmm(self.projection_matrix.unsqueeze(0))).squeeze(0) # w2c2image self.camera_center = self.world_view_transform.inverse()[3, :3] intr = 
getIntrinsic(self.FoVx, self.FoVy, self.image_height, self.image_width).cuda() self.intr = intr class MiniCam: def __init__(self, width, height, fovy, fovx, znear, zfar, world_view_transform, full_proj_transform): self.image_width = width self.image_height = height self.FoVy = fovy self.FoVx = fovx self.znear = znear self.zfar = zfar self.world_view_transform = world_view_transform self.full_proj_transform = full_proj_transform view_inv = torch.inverse(self.world_view_transform) self.camera_center = view_inv[3][:3] class SampleCam(nn.Module): def __init__(self, w2c, width, height, FoVx, FoVy, device='cuda'): super(SampleCam, self).__init__() self.FoVx = FoVx self.FoVy = FoVy self.image_width = width self.image_height = height self.zfar = 100.0 self.znear = 0.01 try: self.data_device = torch.device(device) except Exception as e: print(e) print(f"[Warning] Custom device {device} failed, fallback to default cuda device" ) self.data_device = torch.device("cuda") w2c = w2c.to(self.data_device) self.world_view_transform = w2c.transpose(0, 1) self.projection_matrix = getProjectionMatrix(znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy).transpose(0,1).to(w2c.device) self.full_proj_transform = self.world_view_transform @ self.projection_matrix self.camera_center = self.world_view_transform.inverse()[3, :3] class MiniCam2: def __init__(self, c2w, width, height, fovy, fovx, znear, zfar): # c2w (pose) should be in NeRF convention. self.image_width = width self.image_height = height self.FoVy = fovy self.FoVx = fovx self.znear = znear self.zfar = zfar w2c = np.linalg.inv(c2w) # rectify... 
w2c[1:3, :3] *= -1 w2c[:3, 3] *= -1 self.world_view_transform = torch.tensor(w2c).transpose(0, 1).cuda() self.projection_matrix = ( getProjectionMatrix( znear=self.znear, zfar=self.zfar, fovX=self.FoVx, fovY=self.FoVy ) .transpose(0, 1) .cuda() ) self.full_proj_transform = self.world_view_transform @ self.projection_matrix self.camera_center = -torch.tensor(c2w[:3, 3]).cuda() ================================================ FILE: scene/colmap_loader.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. # # For inquiries contact george.drettakis@inria.fr # import numpy as np import collections import struct CameraModel = collections.namedtuple( "CameraModel", ["model_id", "model_name", "num_params"]) Camera = collections.namedtuple( "Camera", ["id", "model", "width", "height", "params"]) BaseImage = collections.namedtuple( "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"]) Point3D = collections.namedtuple( "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"]) CAMERA_MODELS = { CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3), CameraModel(model_id=1, model_name="PINHOLE", num_params=4), CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4), CameraModel(model_id=3, model_name="RADIAL", num_params=5), CameraModel(model_id=4, model_name="OPENCV", num_params=8), CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8), CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12), CameraModel(model_id=7, model_name="FOV", num_params=5), CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4), CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5), CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12) } 
CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model) for camera_model in CAMERA_MODELS]) CAMERA_MODEL_NAMES = dict([(camera_model.model_name, camera_model) for camera_model in CAMERA_MODELS]) def qvec2rotmat(qvec): return np.array([ [1 - 2 * qvec[2]**2 - 2 * qvec[3]**2, 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3], 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]], [2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3], 1 - 2 * qvec[1]**2 - 2 * qvec[3]**2, 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]], [2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2], 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1], 1 - 2 * qvec[1]**2 - 2 * qvec[2]**2]]) def rotmat2qvec(R): Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat K = np.array([ [Rxx - Ryy - Rzz, 0, 0, 0], [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0], [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0], [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0 eigvals, eigvecs = np.linalg.eigh(K) qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)] if qvec[0] < 0: qvec *= -1 return qvec class Image(BaseImage): def qvec2rotmat(self): return qvec2rotmat(self.qvec) def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"): """Read and unpack the next bytes from a binary file. :param fid: :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc. :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}. :param endian_character: Any of {@, =, <, >, !} :return: Tuple of read and unpacked values. 
""" data = fid.read(num_bytes) return struct.unpack(endian_character + format_char_sequence, data) def read_points3D_text(path): """ see: src/base/reconstruction.cc void Reconstruction::ReadPoints3DText(const std::string& path) void Reconstruction::WritePoints3DText(const std::string& path) """ xyzs = None rgbs = None errors = None num_points = 0 with open(path, "r") as fid: while True: line = fid.readline() if not line: break line = line.strip() if len(line) > 0 and line[0] != "#": num_points += 1 xyzs = np.empty((num_points, 3)) rgbs = np.empty((num_points, 3)) errors = np.empty((num_points, 1)) count = 0 with open(path, "r") as fid: while True: line = fid.readline() if not line: break line = line.strip() if len(line) > 0 and line[0] != "#": elems = line.split() xyz = np.array(tuple(map(float, elems[1:4]))) rgb = np.array(tuple(map(int, elems[4:7]))) error = np.array(float(elems[7])) xyzs[count] = xyz rgbs[count] = rgb errors[count] = error count += 1 return xyzs, rgbs, errors def read_points3D_binary(path_to_model_file): """ see: src/base/reconstruction.cc void Reconstruction::ReadPoints3DBinary(const std::string& path) void Reconstruction::WritePoints3DBinary(const std::string& path) """ with open(path_to_model_file, "rb") as fid: num_points = read_next_bytes(fid, 8, "Q")[0] xyzs = np.empty((num_points, 3)) rgbs = np.empty((num_points, 3)) errors = np.empty((num_points, 1)) for p_id in range(num_points): binary_point_line_properties = read_next_bytes( fid, num_bytes=43, format_char_sequence="QdddBBBd") xyz = np.array(binary_point_line_properties[1:4]) rgb = np.array(binary_point_line_properties[4:7]) error = np.array(binary_point_line_properties[7]) track_length = read_next_bytes( fid, num_bytes=8, format_char_sequence="Q")[0] track_elems = read_next_bytes( fid, num_bytes=8*track_length, format_char_sequence="ii"*track_length) xyzs[p_id] = xyz rgbs[p_id] = rgb errors[p_id] = error return xyzs, rgbs, errors def read_intrinsics_text(path): """ Taken from 
https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py """ cameras = {} with open(path, "r") as fid: while True: line = fid.readline() if not line: break line = line.strip() if len(line) > 0 and line[0] != "#": elems = line.split() camera_id = int(elems[0]) model = elems[1] assert model == "PINHOLE", "While the loader support other types, the rest of the code assumes PINHOLE" width = int(elems[2]) height = int(elems[3]) params = np.array(tuple(map(float, elems[4:]))) cameras[camera_id] = Camera(id=camera_id, model=model, width=width, height=height, params=params) return cameras def read_extrinsics_binary(path_to_model_file): """ see: src/base/reconstruction.cc void Reconstruction::ReadImagesBinary(const std::string& path) void Reconstruction::WriteImagesBinary(const std::string& path) """ images = {} with open(path_to_model_file, "rb") as fid: num_reg_images = read_next_bytes(fid, 8, "Q")[0] for _ in range(num_reg_images): binary_image_properties = read_next_bytes( fid, num_bytes=64, format_char_sequence="idddddddi") image_id = binary_image_properties[0] qvec = np.array(binary_image_properties[1:5]) tvec = np.array(binary_image_properties[5:8]) camera_id = binary_image_properties[8] image_name = "" current_char = read_next_bytes(fid, 1, "c")[0] while current_char != b"\x00": # look for the ASCII 0 entry image_name += current_char.decode("utf-8") current_char = read_next_bytes(fid, 1, "c")[0] num_points2D = read_next_bytes(fid, num_bytes=8, format_char_sequence="Q")[0] x_y_id_s = read_next_bytes(fid, num_bytes=24*num_points2D, format_char_sequence="ddq"*num_points2D) xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])), tuple(map(float, x_y_id_s[1::3]))]) point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3]))) images[image_id] = Image( id=image_id, qvec=qvec, tvec=tvec, camera_id=camera_id, name=image_name, xys=xys, point3D_ids=point3D_ids) return images def read_intrinsics_binary(path_to_model_file): """ see: src/base/reconstruction.cc 
void Reconstruction::WriteCamerasBinary(const std::string& path) void Reconstruction::ReadCamerasBinary(const std::string& path) """ cameras = {} with open(path_to_model_file, "rb") as fid: num_cameras = read_next_bytes(fid, 8, "Q")[0] for _ in range(num_cameras): camera_properties = read_next_bytes( fid, num_bytes=24, format_char_sequence="iiQQ") camera_id = camera_properties[0] model_id = camera_properties[1] model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name width = camera_properties[2] height = camera_properties[3] num_params = CAMERA_MODEL_IDS[model_id].num_params params = read_next_bytes(fid, num_bytes=8*num_params, format_char_sequence="d"*num_params) cameras[camera_id] = Camera(id=camera_id, model=model_name, width=width, height=height, params=np.array(params)) assert len(cameras) == num_cameras return cameras def read_extrinsics_text(path): """ Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_write_model.py """ images = {} with open(path, "r") as fid: while True: line = fid.readline() if not line: break line = line.strip() if len(line) > 0 and line[0] != "#": elems = line.split() image_id = int(elems[0]) qvec = np.array(tuple(map(float, elems[1:5]))) tvec = np.array(tuple(map(float, elems[5:8]))) camera_id = int(elems[8]) image_name = elems[9] elems = fid.readline().split() xys = np.column_stack([tuple(map(float, elems[0::3])), tuple(map(float, elems[1::3]))]) point3D_ids = np.array(tuple(map(int, elems[2::3]))) images[image_id] = Image( id=image_id, qvec=qvec, tvec=tvec, camera_id=camera_id, name=image_name, xys=xys, point3D_ids=point3D_ids) return images def read_colmap_bin_array(path): """ Taken from https://github.com/colmap/colmap/blob/dev/scripts/python/read_dense.py :param path: path to the colmap binary file. 
:return: nd array with the floating point values in the value """ with open(path, "rb") as fid: width, height, channels = np.genfromtxt(fid, delimiter="&", max_rows=1, usecols=(0, 1, 2), dtype=int) fid.seek(0) num_delimiter = 0 byte = fid.read(1) while True: if byte == b"&": num_delimiter += 1 if num_delimiter >= 3: break byte = fid.read(1) array = np.fromfile(fid, np.float32) array = array.reshape((width, height, channels), order="F") return np.transpose(array, (1, 0, 2)).squeeze() ================================================ FILE: scene/dataset_readers.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. # # For inquiries contact george.drettakis@inria.fr # import os import sys import cv2 import json import numpy as np import open3d as o3d from PIL import Image, ImageFile from pathlib import Path from typing import NamedTuple from plyfile import PlyData, PlyElement ImageFile.LOAD_TRUNCATED_IMAGES = True from scene.colmap_loader import read_extrinsics_text, read_intrinsics_text, qvec2rotmat, \ read_extrinsics_binary, read_intrinsics_binary, read_points3D_binary, read_points3D_text from tools.graphics_utils import getWorld2View2, focal2fov, fov2focal from tools.sh_utils import SH2RGB from scene.gaussian_model import BasicPointCloud from tools.math_utils import normalize_pts from process_data.convert_data_to_json import bound_by_points class CameraInfo(NamedTuple): uid: int R: np.array T: np.array FovY: np.array FovX: np.array image: np.array image_path: str image_name: str width: int height: int depth: None normal: None mask: None class SceneInfo(NamedTuple): point_cloud: BasicPointCloud train_cameras: list test_cameras: list nerf_normalization: dict ply_path: str trans: np.array scale: np.array first_name: str def 
getNerfppNorm(cam_info): def get_center_and_diag(cam_centers): cam_centers = np.hstack(cam_centers) avg_cam_center = np.mean(cam_centers, axis=1, keepdims=True) center = avg_cam_center dist = np.linalg.norm(cam_centers - center, axis=0, keepdims=True) diagonal = np.max(dist) return center.flatten(), diagonal cam_centers = [] for cam in cam_info: W2C = getWorld2View2(cam.R, cam.T) C2W = np.linalg.inv(W2C) cam_centers.append(C2W[:3, 3:4]) center, diagonal = get_center_and_diag(cam_centers) radius = diagonal * 1.1 translate = -center return {"translate": translate, "radius": radius} def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder, load_depth=False, load_normal=False, load_mask=False, normal_folder='normals', depth_folder='depths'): if load_depth: depths_folder = images_folder.replace('images', depth_folder) if load_normal: normals_folder = images_folder.replace('images', normal_folder) if load_mask: mask_folder = images_folder.replace('images', 'masks') cam_infos = [] for idx, key in enumerate(cam_extrinsics): sys.stdout.write('\r') # the exact output you're looking for: sys.stdout.write("Reading camera {}/{}".format(idx+1, len(cam_extrinsics))) sys.stdout.flush() extr = cam_extrinsics[key] intr = cam_intrinsics[extr.camera_id] height = intr.height width = intr.width uid = intr.id R = np.transpose(qvec2rotmat(extr.qvec)) T = np.array(extr.tvec) if intr.model=="SIMPLE_PINHOLE": focal_length_x = intr.params[0] FovY = focal2fov(focal_length_x, height) FovX = focal2fov(focal_length_x, width) elif intr.model=="PINHOLE": focal_length_x = intr.params[0] focal_length_y = intr.params[1] FovY = focal2fov(focal_length_y, height) FovX = focal2fov(focal_length_x, width) else: assert False, "Colmap camera model not handled: only undistorted datasets (PINHOLE or SIMPLE_PINHOLE cameras) supported!" 
image_path = os.path.join(images_folder, os.path.basename(extr.name)) image_name = os.path.basename(image_path).split(".")[0] image = Image.open(image_path) depth = None if load_depth: depth_path = os.path.join(depths_folder, os.path.basename(extr.name).replace('jpg', 'npz').replace('png', 'npz')) if os.path.exists(depth_path): depth = np.load(depth_path)['arr_0'] else: depth_path = os.path.join(depths_folder, os.path.basename(extr.name).replace('jpg', 'png')) depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED) if depth.ndim == 2: depth = depth[..., None] normal = None if load_normal: normal_path = os.path.join(normals_folder, os.path.basename(extr.name).replace('png', 'npz').replace('jpg', 'npz').replace('JPG', 'npz')) normal = np.load(normal_path)['arr_0'] # -1, 1 mask = None if load_mask: mask_path = os.path.join(mask_folder, os.path.basename(extr.name).replace('jpg', 'png')) mask_path = mask_path if os.path.exists(mask_path) else \ os.path.join(mask_folder, os.path.basename(extr.name)[1:]) mask = Image.open(mask_path) cam_info = CameraInfo(uid=uid, R=R, T=T, FovY=FovY, FovX=FovX, image=image, image_path=image_path, image_name=image_name, width=width, height=height, depth=depth, normal=normal, mask=mask) cam_infos.append(cam_info) sys.stdout.write('\n') return cam_infos def fetchPly(path): plydata = PlyData.read(path) vertices = plydata['vertex'] positions = np.vstack([vertices['x'], vertices['y'], vertices['z']]).T colors = np.vstack([vertices['red'], vertices['green'], vertices['blue']]).T / 255.0 normals = np.vstack([vertices['nx'], vertices['ny'], vertices['nz']]).T return BasicPointCloud(points=positions, colors=colors, normals=normals) def storePly(path, xyz, rgb, normals=None): # Define the dtype for the structured array dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'), ('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4'), ('red', 'u1'), ('green', 'u1'), ('blue', 'u1')] normals = np.zeros_like(xyz) if normals is None else normals elements = np.empty(xyz.shape[0], 
dtype=dtype) attributes = np.concatenate((xyz, normals, rgb), axis=1) elements[:] = list(map(tuple, attributes)) # Create the PlyData object and write to file vertex_element = PlyElement.describe(elements, 'vertex') ply_data = PlyData([vertex_element]) ply_data.write(path) def get_inside_mask(pts, trans, scale): pts = normalize_pts(pts, trans, scale) inside = np.all(np.abs(pts) < 1.5, axis=-1) return inside def filter_point_cloud(trans, scale, xyz, rgb, nb_points=5, radius=0.1): inside = get_inside_mask(xyz, trans, scale) xyz_inside = xyz[inside] rgb_inside = rgb[inside] xyz_outside = xyz[~inside] rgb_outside = rgb[~inside] pcd_inside = o3d.geometry.PointCloud() pcd_inside.points = o3d.utility.Vector3dVector(xyz_inside) pcd_inside.colors = o3d.utility.Vector3dVector(rgb_inside) pcd_inside_filter, ind = pcd_inside.remove_radius_outlier(nb_points, radius) xyz_inside = np.asarray(pcd_inside_filter.points) rgb_inside = np.asarray(pcd_inside_filter.colors) xyz = np.concatenate((xyz_inside, xyz_outside), axis=0) rgb = np.concatenate((rgb_inside, rgb_outside), axis=0) return xyz, rgb def readColmapSceneInfo(path, images, eval, llffhold=8, ratio=0, split=False, load_depth=False, load_normal=False, load_mask=False, normal_folder='normals', depth_folder='depths'): colmap_dir = os.path.join(path, "sparse/0") if not os.path.exists(colmap_dir): colmap_dir = os.path.join(path, "sparse") try: cameras_extrinsic_file = os.path.join(colmap_dir, "images.bin") cameras_intrinsic_file = os.path.join(colmap_dir, "cameras.bin") cam_extrinsics = read_extrinsics_binary(cameras_extrinsic_file) cam_intrinsics = read_intrinsics_binary(cameras_intrinsic_file) except: cameras_extrinsic_file = os.path.join(colmap_dir, "images.txt") cameras_intrinsic_file = os.path.join(colmap_dir, "cameras.txt") cam_extrinsics = read_extrinsics_text(cameras_extrinsic_file) cam_intrinsics = read_intrinsics_text(cameras_intrinsic_file) ply_path = os.path.join(colmap_dir, "points3D.ply") bin_path = 
os.path.join(colmap_dir, "points3D.bin") txt_path = os.path.join(colmap_dir, "points3D.txt") reading_dir = "images" if images == None else images cam_infos_unsorted = readColmapCameras(cam_extrinsics=cam_extrinsics, cam_intrinsics=cam_intrinsics, images_folder=os.path.join(path, reading_dir), load_depth=load_depth, load_normal=load_normal, load_mask=load_mask, normal_folder=normal_folder, depth_folder=depth_folder) cam_infos = sorted(cam_infos_unsorted.copy(), key = lambda x : x.image_name) meta_fname = f"{path}/meta.json" if os.path.exists(meta_fname): with open(meta_fname) as file: meta = json.load(file) trans = np.array(meta["trans"], dtype=np.float32) scale = np.array(meta["scale"], dtype=np.float32) else: print("No meta.json file found, using default values.") if not os.path.exists(ply_path): print("Converting point3d.bin to .ply, will happen only the first time you open the scene.") try: xyz, rgb, _ = read_points3D_binary(bin_path) except: xyz, rgb, _ = read_points3D_text(txt_path) # xyz, rgb = filter_point_cloud(trans, scale, xyz, rgb) # storePly(ply_path, xyz, rgb) # try: # pcd = fetchPly(ply_path) # except: # pcd = None trans, scale, bounding_box = bound_by_points(xyz) meta = { 'trans': trans.tolist(), 'scale': scale.tolist() } with open(meta_fname, "w") as file: json.dump(meta, file, indent=4) if ratio > 0: len_train = int(len(cam_infos) * ratio) llffhold = len(cam_infos) // len_train train_idx = set([int(i * llffhold) for i in range(len_train)]) test_idx = set(range(len(cam_infos))) - train_idx train_cam_infos = [cam_infos[i] for i in train_idx] test_cam_infos = [cam_infos[i] for i in test_idx] elif eval: if split and "test" in meta: train_cam_infos = [c for c in cam_infos if c.image_name in meta["train"]] test_cam_infos = [c for c in cam_infos if c.image_name in meta["test"]] else: train_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold != 0] test_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold == 0] else: 
train_cam_infos = cam_infos test_cam_infos = [] print(f"Train: {len(train_cam_infos)}, Test: {len(test_cam_infos)}") first_name = test_cam_infos[0].image_name if eval else cam_infos[0].image_name nerf_normalization = getNerfppNorm(train_cam_infos) if not os.path.exists(ply_path): print("Converting point3d.bin to .ply, will happen only the first time you open the scene.") try: xyz, rgb, _ = read_points3D_binary(bin_path) except: xyz, rgb, _ = read_points3D_text(txt_path) xyz, rgb = filter_point_cloud(trans, scale, xyz, rgb) storePly(ply_path, xyz, rgb) try: pcd = fetchPly(ply_path) except: pcd = None scene_info = SceneInfo(point_cloud=pcd, train_cameras=train_cam_infos, test_cameras=test_cam_infos, nerf_normalization=nerf_normalization, ply_path=ply_path, trans=trans, scale=scale, first_name=first_name) return scene_info def readCamerasFromTransforms(path, transformsfile, white_background, extension=".png"): cam_infos = [] with open(os.path.join(path, transformsfile)) as json_file: contents = json.load(json_file) fovx = contents["camera_angle_x"] frames = contents["frames"] for idx, frame in enumerate(frames): cam_name = os.path.join(path, frame["file_path"] + extension) # NeRF 'transform_matrix' is a camera-to-world transform c2w = np.array(frame["transform_matrix"]) # change from OpenGL/Blender camera axes (Y up, Z back) to COLMAP (Y down, Z forward) c2w[:3, 1:3] *= -1 # get the world-to-camera transform and set R, T w2c = np.linalg.inv(c2w) R = np.transpose(w2c[:3,:3]) # R is stored transposed due to 'glm' in CUDA code T = w2c[:3, 3] image_path = os.path.join(path, cam_name) image_name = Path(cam_name).stem image = Image.open(image_path) im_data = np.array(image.convert("RGBA")) bg = np.array([1,1,1]) if white_background else np.array([0, 0, 0]) norm_data = im_data / 255.0 arr = norm_data[:,:,:3] * norm_data[:, :, 3:4] + bg * (1 - norm_data[:, :, 3:4]) image = Image.fromarray(np.array(arr*255.0, dtype=np.byte), "RGB") fovy = focal2fov(fov2focal(fovx, 
image.size[0]), image.size[1]) FovY = fovy FovX = fovx cam_infos.append(CameraInfo(uid=idx, R=R, T=T, FovY=FovY, FovX=FovX, image=image, image_path=image_path, image_name=image_name, width=image.size[0], height=image.size[1])) return cam_infos def readNerfSyntheticInfo(path, white_background, eval, extension=".png"): print("Reading Training Transforms") train_cam_infos = readCamerasFromTransforms(path, "transforms_train.json", white_background, extension) print("Reading Test Transforms") test_cam_infos = readCamerasFromTransforms(path, "transforms_test.json", white_background, extension) if not eval: train_cam_infos.extend(test_cam_infos) test_cam_infos = [] nerf_normalization = getNerfppNorm(train_cam_infos) ply_path = os.path.join(path, "points3d.ply") if not os.path.exists(ply_path): # Since this data set has no colmap data, we start with random points num_pts = 100_000 print(f"Generating random point cloud ({num_pts})...") # We create random points inside the bounds of the synthetic Blender scenes xyz = np.random.random((num_pts, 3)) * 2.6 - 1.3 shs = np.random.random((num_pts, 3)) / 255.0 pcd = BasicPointCloud(points=xyz, colors=SH2RGB(shs), normals=np.zeros((num_pts, 3))) storePly(ply_path, xyz, SH2RGB(shs) * 255) try: pcd = fetchPly(ply_path) except: pcd = None scene_info = SceneInfo(point_cloud=pcd, train_cameras=train_cam_infos, test_cameras=test_cam_infos, nerf_normalization=nerf_normalization, ply_path=ply_path) return scene_info sceneLoadTypeCallbacks = { "Colmap": readColmapSceneInfo, "Blender" : readNerfSyntheticInfo } ================================================ FILE: scene/gaussian_model.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. 
# # For inquiries contact george.drettakis@inria.fr # import os import torch import numpy as np from torch import nn from copy import deepcopy try: from simple_knn._C import distCUDA2 except ModuleNotFoundError: pass from plyfile import PlyData, PlyElement from io import BytesIO from tqdm import trange from tools.sh_utils import RGB2SH from tools.system_utils import mkdir_p from tools.graphics_utils import BasicPointCloud from tools.math_utils import normalize_pts, get_inside_normalized from tools.general_utils import strip_symmetric, build_scaling_rotation from tools.general_utils import inverse_sigmoid, get_expon_lr_func, build_rotation from tools.denoise_pcd import remove_radius_outlier from scene.appearance_network import AppearanceNetwork from tools.semantic_id import BACKGROUND class GaussianModel: def setup_functions(self): def build_covariance_from_scaling_rotation(scaling, scaling_modifier, rotation): L = build_scaling_rotation(scaling_modifier * scaling, rotation) actual_covariance = L @ L.transpose(1, 2) symm = strip_symmetric(actual_covariance) return symm self.scaling_activation = torch.exp self.scaling_inverse_activation = torch.log self.covariance_activation = build_covariance_from_scaling_rotation self.opacity_activation = torch.sigmoid self.inverse_opacity_activation = inverse_sigmoid self.rotation_activation = torch.nn.functional.normalize def __init__(self, cfg): self.active_sh_degree = 0 self.max_sh_degree = cfg.sh_degree self._xyz = torch.empty(0) self._features_dc = torch.empty(0) self._features_rest = torch.empty(0) self._scaling = torch.empty(0) self._rotation = torch.empty(0) self._opacity = torch.empty(0) self.max_radii2D = torch.empty(0) self.xyz_gradient_accum = torch.empty(0) self.denom = torch.empty(0) self.optimizer = None self.percent_dense = 0 self.spatial_lr_scale = 0 self.setup_functions() self.max_mem = cfg.max_mem self.use_decoupled_appearance = cfg.use_decoupled_appearance if self.use_decoupled_appearance: # appearance network 
and appearance embedding self.appearance_network = AppearanceNetwork(3+64, 3).cuda() std = 1e-4 num_embedding = len(os.listdir(os.path.join(cfg.source_path, 'images'))) self._appearance_embeddings = nn.Parameter(torch.empty(num_embedding, 64).cuda()) self._appearance_embeddings.data.normal_(0, std) self.enable_semantic = cfg.enable_semantic self._objects_dc = torch.empty(0) if self.enable_semantic: self.ch_sem_feat = cfg.ch_sem_feat self.num_cls = cfg.num_cls self.classifier = torch.nn.Conv2d(self.ch_sem_feat, self.num_cls, kernel_size=1).cuda() def capture(self): return ( self.active_sh_degree, self._xyz, self._features_dc, self._features_rest, self._scaling, self._rotation, self._opacity, self._objects_dc, self.max_radii2D, self.xyz_gradient_accum, self.denom, self.optimizer.state_dict(), self.spatial_lr_scale, ) def restore(self, model_args, training_args): (self.active_sh_degree, self._xyz, self._features_dc, self._features_rest, self._scaling, self._rotation, self._opacity, self._objects_dc, self.max_radii2D, xyz_gradient_accum, denom, opt_dict, self.spatial_lr_scale, ) = model_args self.training_setup(training_args) self.xyz_gradient_accum = xyz_gradient_accum self.denom = denom self.optimizer.load_state_dict(opt_dict) @property def get_scaling(self): scaling = self._scaling return self.scaling_activation(scaling) @property def get_rotation(self): return self.rotation_activation(self._rotation) @property def get_xyz(self): return self._xyz @property def get_features(self): features_dc = self._features_dc features_rest = self._features_rest return torch.cat((features_dc, features_rest), dim=1) @property def get_objects(self): return self._objects_dc def get_cls(self, idx=None): assert self.enable_semantic, "Semantic feature is not enabled" feats = self.get_objects.permute(0, 2, 1)[..., None] if idx is not None: feats = feats[idx] return self.classifier(feats).view(-1, self.num_cls).argmax(-1) def logits_2_label(self, logits): return 
def create_from_pcd(self, pcd : BasicPointCloud, spatial_lr_scale : float):
    """
    Initialise all per-Gaussian parameters from a point cloud.

    Positions come from ``pcd.points`` and the DC colour term from
    ``pcd.colors`` (converted with ``RGB2SH``). Scales are isotropic and
    derived from the output of ``distCUDA2`` (presumably squared
    nearest-neighbour distances - confirm against simple-knn), rotations
    start as the quaternion (1, 0, 0, 0) and opacities at
    ``inverse_sigmoid(0.1)``. When semantics are enabled, the object
    features are initialised from uniform random values.

    Args:
        pcd: point cloud providing ``points`` and ``colors``.
        spatial_lr_scale: stored on the model for later learning-rate scaling.
    """
    self.spatial_lr_scale = spatial_lr_scale

    points_np = np.asarray(pcd.points)
    xyz_init = torch.tensor(points_np).float().cuda()
    sh_dc_init = RGB2SH(torch.tensor(np.asarray(pcd.colors)).float().cuda())
    num_points = xyz_init.shape[0]

    # SH coefficient buffer: only the DC slot (index 0) is populated.
    sh_coeffs = torch.zeros((sh_dc_init.shape[0], 3, (self.max_sh_degree + 1) ** 2)).float().cuda()
    sh_coeffs[:, :3, 0] = sh_dc_init
    sh_coeffs[:, 3:, 1:] = 0.0

    print("Number of points at initialisation : ", num_points)

    # Clamp before sqrt/log so coincident points do not produce -inf scales.
    nn_dist2 = distCUDA2(torch.from_numpy(points_np).float().cuda())
    nn_dist2 = torch.clamp_min(nn_dist2, 0.0000001)
    log_scales = torch.log(torch.sqrt(nn_dist2))[..., None].repeat(1, 3)

    identity_quats = torch.zeros((num_points, 4), device="cuda")
    identity_quats[:, 0] = 1

    raw_opacity = inverse_sigmoid(0.1 * torch.ones((num_points, 1), dtype=torch.float, device="cuda"))

    self._xyz = nn.Parameter(xyz_init.requires_grad_(True))
    self._features_dc = nn.Parameter(
        sh_coeffs[:, :, 0:1].transpose(1, 2).contiguous().requires_grad_(True))
    self._features_rest = nn.Parameter(
        sh_coeffs[:, :, 1:].transpose(1, 2).contiguous().requires_grad_(True))
    self._scaling = nn.Parameter(log_scales.requires_grad_(True))
    self._rotation = nn.Parameter(identity_quats.requires_grad_(True))
    self._opacity = nn.Parameter(raw_opacity.requires_grad_(True))
    self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")

    if self.enable_semantic:
        # random init obj_id now
        rand_feats = torch.rand((num_points, self.ch_sem_feat), device="cuda")
        obj_feats = RGB2SH(rand_feats)[:, :, None]
        self._objects_dc = nn.Parameter(obj_feats.transpose(1, 2).contiguous().requires_grad_(True))
def construct_list_of_attributes(self):
    """
    Build the ordered list of per-vertex PLY property names.

    The order is: position and normal, flattened DC features, flattened
    remaining SH features, opacity, scales, rotation components and, when
    semantics are enabled, the flattened object features.

    Returns:
        list of str: property names in the order the columns are written.
    """
    def numbered(pattern, tensor, dims):
        # One name per element spanned by the given trailing dims.
        count = 1
        for d in dims:
            count *= tensor.shape[d]
        return [pattern.format(i) for i in range(count)]

    names = ['x', 'y', 'z', 'nx', 'ny', 'nz']
    names += numbered('f_dc_{}', self._features_dc, (1, 2))
    names += numbered('f_rest_{}', self._features_rest, (1, 2))
    names.append('opacity')
    names += numbered('scale_{}', self._scaling, (1,))
    names += numbered('rot_{}', self._rotation, (1,))
    if self.enable_semantic:
        names += numbered('obj_dc_{}', self._objects_dc, (1, 2))
    return names
self._features_rest.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy() opacities = self._opacity.detach().cpu().numpy() scale = self._scaling.detach().cpu().numpy() rotation = self._rotation.detach().cpu().numpy() if self.enable_semantic: obj_dc = self._objects_dc.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy() dtype_full = [(attribute, 'f4') for attribute in self.construct_list_of_attributes()] elements = np.empty(xyz.shape[0], dtype=dtype_full) attributes = np.concatenate((xyz, normals, f_dc, f_rest, opacities, scale, rotation), axis=1) if self.enable_semantic: attributes = np.concatenate((attributes, obj_dc), axis=1) elements[:] = list(map(tuple, attributes)) el = PlyElement.describe(elements, 'vertex') PlyData([el]).write(path) state_dict = {} if self.use_decoupled_appearance: state_dict["appearance_network"] = self.appearance_network.state_dict() state_dict["appearance_embeddings"] = self._appearance_embeddings if self.enable_semantic: state_dict["classifier"] = self.classifier.state_dict() if len(state_dict) > 0: torch.save(state_dict, os.path.join(os.path.dirname(path), 'model.pth')) @torch.no_grad() def save_inside_ply(self, path, inside=None): mkdir_p(os.path.dirname(path)) if inside is None: inside = self.get_inside_gaus_normalized()[0] xyz = self._xyz[inside].detach() _normals = self.get_normal(inside, refine_sign=True).detach() normals = _normals inside = inside.cpu().numpy() xyz = xyz.cpu().numpy() normals = normals.cpu().numpy() f_dc = self._features_dc[inside].detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy() f_rest = self._features_rest[inside].detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy() opacities = self._opacity[inside].detach().cpu().numpy() scale = self._scaling[inside].detach().cpu().numpy() rotation = self._rotation[inside].detach().cpu().numpy() if self.enable_semantic: obj_dc = self._objects_dc[inside].detach().transpose(1, 
def save_visi_ply(self, path, visi):
    """
    Save the Gaussians that are both inside the normalized region and visible.

    Args:
        path: destination PLY path, forwarded to ``save_inside_ply``.
        visi: per-Gaussian mask combined (``&``) with the inside mask.
    """
    in_bounds, _ = self.get_inside_gaus_normalized()
    self.save_inside_ply(path, in_bounds & visi)
plydata.elements[0].properties if p.name.startswith("scale_")] scale_names = sorted(scale_names, key = lambda x: int(x.split('_')[-1])) scales = np.zeros((xyz.shape[0], len(scale_names))) for idx, attr_name in enumerate(scale_names): scales[:, idx] = np.asarray(plydata.elements[0][attr_name]) rot_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("rot")] rot_names = sorted(rot_names, key = lambda x: int(x.split('_')[-1])) rots = np.zeros((xyz.shape[0], len(rot_names))) for idx, attr_name in enumerate(rot_names): rots[:, idx] = np.asarray(plydata.elements[0][attr_name]) if self.enable_semantic: objects_dc = np.zeros((xyz.shape[0], self.ch_sem_feat, 1)) for idx in range(self.ch_sem_feat): objects_dc[:,idx,0] = np.asarray(plydata.elements[0]["obj_dc_"+str(idx)]) self._xyz = nn.Parameter(torch.tensor(xyz, dtype=torch.float, device="cuda").requires_grad_(True)) self._features_dc = nn.Parameter(torch.tensor(features_dc, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True)) self._features_rest = nn.Parameter(torch.tensor(features_extra, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True)) self._opacity = nn.Parameter(torch.tensor(opacities, dtype=torch.float, device="cuda").requires_grad_(True)) self._scaling = nn.Parameter(torch.tensor(scales, dtype=torch.float, device="cuda").requires_grad_(True)) self._rotation = nn.Parameter(torch.tensor(rots, dtype=torch.float, device="cuda").requires_grad_(True)) if self.enable_semantic: self._objects_dc = nn.Parameter(torch.tensor(objects_dc, dtype=torch.float, device="cuda").transpose(1, 2).contiguous().requires_grad_(True)) self.active_sh_degree = self.max_sh_degree ckpt_path = os.path.join(os.path.dirname(path), 'model.pth') if os.path.exists(ckpt_path): state_dict = torch.load(ckpt_path) if self.enable_semantic: self.classifier.load_state_dict(state_dict["classifier"]) if self.use_decoupled_appearance: 
def replace_tensor_to_optimizer(self, tensor, name):
    """
    Swap the parameter of the optimizer group called ``name`` for ``tensor``.

    The Adam moments of the replaced parameter are reset to zero and moved
    to the new parameter. If the optimizer holds no state for the old
    parameter yet (no step has been taken), the parameter is simply
    replaced; this mirrors the ``stored_state is not None`` guard used by
    ``_prune_optimizer`` and ``cat_tensors_to_optimizer``.

    Args:
        tensor: new values for the parameter.
        name: ``"name"`` entry of the param group to replace.

    Returns:
        dict mapping ``name`` to the new ``nn.Parameter``; empty when no
        eligible group has that name.
    """
    # Groups that are not replaced through this method.
    skipped = ("appearance_embeddings", "appearance_network", "classifier")

    optimizable_tensors = {}
    for group in self.optimizer.param_groups:
        if group["name"] in skipped or group["name"] != name:
            continue

        old_param = group["params"][0]
        stored_state = self.optimizer.state.get(old_param, None)
        new_param = nn.Parameter(tensor.requires_grad_(True))

        if stored_state is not None:
            # Restart the moment estimates for the new values, keeping the
            # remaining state entries (e.g. the step counter) untouched.
            stored_state["exp_avg"] = torch.zeros_like(tensor)
            stored_state["exp_avg_sq"] = torch.zeros_like(tensor)
            del self.optimizer.state[old_param]
            self.optimizer.state[new_param] = stored_state

        group["params"][0] = new_param
        optimizable_tensors[group["name"]] = new_param
    return optimizable_tensors
def cat_tensors_to_optimizer(self, tensors_dict):
    """
    Append new rows to every per-Gaussian parameter held by the optimizer.

    For each eligible param group the matching tensor from ``tensors_dict``
    is concatenated along dim 0 to the current parameter. Existing Adam
    moments are extended with zeros for the new rows and moved to the new
    parameter object.

    Groups that do not hold per-Gaussian tensors are skipped. This includes
    the "neural_sdf" group that ``training_setup`` registers when
    ``neural_sdf_params`` is given: it carries several parameters and has no
    entry in ``tensors_dict``.

    Args:
        tensors_dict: maps group name to the tensor of rows to append.

    Returns:
        dict mapping group name to the new, extended ``nn.Parameter``.
    """
    non_gaussian_groups = ("appearance_embeddings", "appearance_network", "classifier", "neural_sdf")

    optimizable_tensors = {}
    for group in self.optimizer.param_groups:
        if group["name"] in non_gaussian_groups:
            continue
        assert len(group["params"]) == 1

        extension_tensor = tensors_dict[group["name"]]
        old_param = group["params"][0]
        stored_state = self.optimizer.state.get(old_param, None)
        new_param = nn.Parameter(torch.cat((old_param, extension_tensor), dim=0).requires_grad_(True))

        if stored_state is not None:
            # New rows start with zero first/second moments.
            stored_state["exp_avg"] = torch.cat(
                (stored_state["exp_avg"], torch.zeros_like(extension_tensor)), dim=0)
            stored_state["exp_avg_sq"] = torch.cat(
                (stored_state["exp_avg_sq"], torch.zeros_like(extension_tensor)), dim=0)
            del self.optimizer.state[old_param]
            self.optimizer.state[new_param] = stored_state

        group["params"][0] = new_param
        optimizable_tensors[group["name"]] = new_param
    return optimizable_tensors
self._rotation = optimizable_tensors["rotation"] if self.enable_semantic: self._objects_dc = optimizable_tensors["obj_dc"] if reset: self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda") else: self.xyz_gradient_accum = torch.cat((self.xyz_gradient_accum, torch.zeros((new_xyz.shape[0], 1), device="cuda")), dim=0) self.denom = torch.cat((self.denom, torch.zeros((new_xyz.shape[0], 1), device="cuda")), dim=0) self.max_radii2D = torch.cat((self.max_radii2D, torch.zeros((new_xyz.shape[0]), device="cuda")), dim=0) def densify_and_split(self, grads, grad_threshold, scene_extent, visi=None, N=2): n_init_points = self.get_xyz.shape[0] # Extract points that satisfy the gradient condition padded_grad = torch.zeros((n_init_points), device="cuda") padded_grad[:grads.shape[0]] = grads.squeeze() selected_pts_mask = torch.where(padded_grad >= grad_threshold, True, False) selected_pts_mask = torch.logical_and(selected_pts_mask, torch.max(self.get_scaling, dim=1).values > self.percent_dense*scene_extent) if self.large_percent_dense is not None: densify_pts_mask = torch.max(self.get_scaling, dim=1).values > self.large_percent_dense * scene_extent inside, _ = self.get_inside_gaus_normalized() densify_pts_mask = torch.logical_and(densify_pts_mask, inside) if visi is not None: padded_vis = torch.zeros((n_init_points), device="cuda").bool() padded_vis[:visi.shape[0]] = visi densify_pts_mask = torch.logical_and(densify_pts_mask, padded_vis) selected_pts_mask = torch.logical_or(selected_pts_mask, densify_pts_mask) stds = self.get_scaling[selected_pts_mask].repeat(N,1) means =torch.zeros((stds.size(0), 3),device="cuda") samples = torch.normal(mean=means, std=stds) rots = build_rotation(self._rotation[selected_pts_mask]).repeat(N,1,1) new_xyz = torch.bmm(rots, samples.unsqueeze(-1)).squeeze(-1) + 
def densify_and_split_along_maxscaling(self, grads, grad_threshold, scene_extent, visi=None, N=2, n_std=2):
    """
    Split selected Gaussians into two children placed along their longest axis.

    A Gaussian is selected when its accumulated gradient reaches
    ``grad_threshold`` and its largest scale exceeds
    ``percent_dense * scene_extent``. When ``large_percent_dense`` is set and
    allocated CUDA memory is below ``max_mem`` (GiB), Gaussians inside the
    normalized region whose largest scale exceeds
    ``large_percent_dense * scene_extent`` (and that are visible, if ``visi``
    is given) are selected as well.

    Each selected Gaussian is replaced by two copies displaced by
    ``+/- n_std * max_scale / 3`` along the direction of its largest scale;
    the largest scale of the children is divided by ``0.8 * N``.

    Args:
        grads: per-Gaussian gradient magnitudes; may be shorter than the
            current number of Gaussians (missing entries count as zero).
        grad_threshold: minimum gradient for selection.
        scene_extent: scene size used to scale the size thresholds.
        visi: optional boolean visibility mask, padded with False.
        N: number of children per split. Only 2 is supported because the
            children are placed at the two ends of the longest axis.
        n_std: scales the displacement of the children.

    Raises:
        ValueError: if ``N`` is not 2. Previously such a call produced
            parameter tensors of inconsistent length.
    """
    if N != 2:
        raise ValueError(
            "densify_and_split_along_maxscaling places exactly two children per Gaussian; got N={}".format(N))

    def twice(t):
        # Children are stored as [all "+" copies, all "-" copies].
        return torch.cat((t, t), dim=0)

    n_init_points = self.get_xyz.shape[0]
    # Extract points that satisfy the gradient condition
    padded_grad = torch.zeros((n_init_points), device="cuda")
    padded_grad[:grads.shape[0]] = grads.squeeze()
    selected_pts_mask = torch.where(padded_grad >= grad_threshold, True, False)
    selected_pts_mask = torch.logical_and(
        selected_pts_mask,
        torch.max(self.get_scaling, dim=1).values > self.percent_dense*scene_extent)

    # Optionally also split very large Gaussians, as long as memory allows.
    if self.large_percent_dense is not None and (torch.cuda.memory_allocated(0) / 1024**3 < self.max_mem):
        densify_pts_mask = torch.max(self.get_scaling, dim=1).values > self.large_percent_dense * scene_extent
        inside, _ = self.get_inside_gaus_normalized()
        densify_pts_mask = torch.logical_and(densify_pts_mask, inside)
        if visi is not None:
            padded_vis = torch.zeros((n_init_points), device="cuda").bool()
            padded_vis[:visi.shape[0]] = visi
            densify_pts_mask = torch.logical_and(densify_pts_mask, padded_vis)
        selected_pts_mask = torch.logical_or(selected_pts_mask, densify_pts_mask)

    scaling = self.get_scaling[selected_pts_mask]
    rots = build_rotation(self._rotation[selected_pts_mask])
    dirs, max_scaling, axis = self.get_dir_max_scaling(scaling, rots)

    # Offset of each child from the parent centre along the longest axis.
    radii = (n_std * max_scaling / 3.)[..., None]
    centers = self.get_xyz[selected_pts_mask]
    new_xyz = torch.cat((centers + dirs * radii, centers - dirs * radii), dim=0)

    # Shrink only the longest axis of the children.
    new_scaling = scaling.detach().clone()
    new_scaling[torch.arange(new_scaling.shape[0]), axis] = max_scaling / (0.8*N)
    new_scaling = twice(self.scaling_inverse_activation(new_scaling))

    new_rotation = twice(self._rotation[selected_pts_mask])
    new_features_dc = twice(self._features_dc[selected_pts_mask])
    new_features_rest = twice(self._features_rest[selected_pts_mask])
    new_opacity = twice(self._opacity[selected_pts_mask])
    new_objects_dc = twice(self._objects_dc[selected_pts_mask]) if self.enable_semantic else None

    self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacity,
                               new_scaling, new_rotation, new_objects_dc)

    # Remove the parents; the freshly appended children are kept.
    prune_filter = torch.cat(
        (selected_pts_mask, torch.zeros(N * selected_pts_mask.sum(), device="cuda", dtype=bool)))
    self.prune_points(prune_filter)
def densify_and_prune(self, max_grad, min_opacity, extent, max_screen_size, visi=None):
    """
    Run one densification round followed by pruning.

    The averaged gradients (accumulated gradient divided by the update
    count, NaNs replaced by zero) drive cloning and splitting. Afterwards,
    Gaussians with opacity below ``min_opacity`` are pruned; when
    ``max_screen_size`` is truthy, Gaussians whose recorded 2D radius
    exceeds it or whose largest scale exceeds ``0.1 * extent`` are pruned
    as well.

    Args:
        max_grad: gradient threshold passed to the densification routines.
        min_opacity: opacity below which a Gaussian is removed.
        extent: scene extent used for the size thresholds.
        max_screen_size: screen-space radius limit; falsy disables the
            size-based pruning.
        visi: optional visibility mask forwarded to the split routine.
    """
    avg_grads = self.xyz_gradient_accum / self.denom
    avg_grads[torch.isnan(avg_grads)] = 0.0  # 0/0 for never-updated Gaussians

    self.densify_and_clone(avg_grads, max_grad, extent)
    self.densify_and_split_along_maxscaling(avg_grads, max_grad, extent, visi=visi)

    to_remove = (self.get_opacity < min_opacity).squeeze()
    if max_screen_size:
        too_big_on_screen = self.max_radii2D > max_screen_size
        too_big_in_world = self.get_scaling.max(dim=1).values > 0.1 * extent
        to_remove = to_remove | too_big_on_screen | too_big_in_world
    self.prune_points(to_remove)

    torch.cuda.empty_cache()
def add_densification_stats(self, viewspace_point_tensor, update_filter):
    """
    Accumulate gradient statistics used by densification.

    For the Gaussians selected by ``update_filter``, the norm of the first
    two gradient components of ``viewspace_point_tensor`` is added to
    ``xyz_gradient_accum`` and the update counter ``denom`` is incremented.

    Args:
        viewspace_point_tensor: tensor whose ``.grad`` holds per-Gaussian gradients.
        update_filter: mask/index of the Gaussians to update.
    """
    screen_grad = viewspace_point_tensor.grad[update_filter, :2]
    grad_norm = screen_grad.norm(dim=-1, keepdim=True)
    self.xyz_gradient_accum[update_filter] += grad_norm
    self.denom[update_filter] += 1
class Pose():
    """
    A class of operations on camera poses (PyTorch tensors with shape [...,3,4]).
    Each [3,4] camera pose takes the form of [R|t].
    """

    def __call__(self, R=None, t=None):
        """Construct a camera pose from the given R and/or t.

        Args:
            R: rotation(s) [...,3,3] (tensor or nested list); identity if omitted.
            t: translation(s) [...,3] (tensor or nested list); zero if omitted.

        Returns:
            Float tensor [...,3,4] holding [R|t].

        Raises:
            AssertionError: if both arguments are None or the shapes disagree.
        """
        assert R is not None or t is not None
        if R is None:
            if not isinstance(t, torch.Tensor):
                t = torch.tensor(t)
            # One identity rotation per translation in the batch.
            R = torch.eye(3, device=t.device).repeat(*t.shape[:-1], 1, 1)
        elif t is None:
            if not isinstance(R, torch.Tensor):
                R = torch.tensor(R)
            t = torch.zeros(R.shape[:-1], device=R.device)
        else:
            if not isinstance(R, torch.Tensor):
                R = torch.tensor(R)
            if not isinstance(t, torch.Tensor):
                t = torch.tensor(t)
        assert R.shape[:-1] == t.shape and R.shape[-2:] == (3, 3)
        R = R.float()
        t = t.float()
        pose = torch.cat([R, t[..., None]], dim=-1)
        assert pose.shape[-2:] == (3, 4)
        return pose

    def invert(self, pose, use_inverse=False):
        """Invert a camera pose.

        Args:
            pose: tensor [...,3,4].
            use_inverse: invert R with a matrix inverse instead of a
                transpose (the transpose is only exact for orthonormal R).

        Returns:
            Inverse pose [...,3,4].
        """
        R, t = pose[..., :3], pose[..., 3:]
        R_inv = R.inverse() if use_inverse else R.transpose(-1, -2)
        t_inv = (-R_inv @ t)[..., 0]
        pose_inv = self(R=R_inv, t=t_inv)
        return pose_inv

    def compose(self, pose_list):
        """Compose a sequence of poses together.

        pose_new(x) = poseN o ... o pose2 o pose1(x)
        """
        pose_new = pose_list[0]
        for pose in pose_list[1:]:
            pose_new = self.compose_pair(pose_new, pose)
        return pose_new

    def compose_pair(self, pose_a, pose_b):
        """Return the pose that applies ``pose_a`` first and ``pose_b`` second."""
        R_a, t_a = pose_a[..., :3], pose_a[..., 3:]
        R_b, t_b = pose_b[..., :3], pose_b[..., 3:]
        R_new = R_b @ R_a
        t_new = (R_b @ t_a + t_b)[..., 0]
        pose_new = self(R=R_new, t=t_new)
        return pose_new

    def scale_center(self, pose, scale):
        """Scale the camera center from the origin.
        0 = R@c+t --> c = -R^T@t (camera center in world coordinates)
        0 = R@(sc)+t' --> t' = -R@(sc) = -R@(-R^T@st) = st
        """
        R, t = pose[..., :3], pose[..., 3:]
        pose_new = torch.cat([R, t * scale], dim=-1)
        return pose_new

    def interpolate(self, pose_a, pose_b, alpha):
        """Interpolate between two poses with Slerp.
        Args:
            pose_a (tensor [...,3,4]): Pose at time t=0.
            pose_b (tensor [...,3,4]): Pose at time t=1.
            alpha (tensor [...,1]): Interpolation parameter.
        Returns:
            pose (tensor [...,3,4]): Pose at time t.
        """
        R_a, t_a = pose_a[..., :3], pose_a[..., 3:]
        R_b, t_b = pose_b[..., :3], pose_b[..., 3:]
        q_a = quaternion.R_to_q(R_a)  # [...,4]
        q_b = quaternion.R_to_q(R_b)  # [...,4]
        q_intp = quaternion.interpolate(q_a, q_b, alpha)  # [...,4]
        R_intp = quaternion.q_to_R(q_intp)  # [...,3,3]
        # The translation columns are [...,3,1], so alpha [...,1] needs one
        # more trailing axis; without it a batched alpha [B,1] is broadcast
        # against the component axis instead of the batch axis.
        if isinstance(alpha, torch.Tensor) and alpha.dim() > 0:
            alpha_t = alpha[..., None]  # [...,1,1]
        else:
            alpha_t = alpha
        t_intp = (1 - alpha_t) * t_a + alpha_t * t_b  # [...,3,1]
        pose_intp = torch.cat([R_intp, t_intp], dim=-1)  # [...,3,4]
        return pose_intp
torch.cat([R, (V @ u[..., None])], dim=-1) return Rt def SE3_to_se3(self, Rt, eps=1e-8): # [...,3,4] R, t = Rt.split([3, 1], dim=-1) w = self.SO3_to_so3(R) wx = self.skew_symmetric(w) theta = w.norm(dim=-1)[..., None, None] eye = torch.eye(3, device=w.device, dtype=torch.float32) A = self.taylor_A(theta) B = self.taylor_B(theta) invV = eye - 0.5 * wx + (1 - A / (2 * B)) / (theta ** 2 + eps) * wx @ wx u = (invV @ t)[..., 0] wu = torch.cat([w, u], dim=-1) return wu def skew_symmetric(self, w): w0, w1, w2 = w.unbind(dim=-1) zero = torch.zeros_like(w0) wx = torch.stack([torch.stack([zero, -w2, w1], dim=-1), torch.stack([w2, zero, -w0], dim=-1), torch.stack([-w1, w0, zero], dim=-1)], dim=-2) return wx def taylor_A(self, x, nth=10): # Taylor expansion of sin(x)/x. ans = torch.zeros_like(x) denom = 1. for i in range(nth + 1): if i > 0: denom *= (2 * i) * (2 * i + 1) ans = ans + (-1) ** i * x ** (2 * i) / denom return ans def taylor_B(self, x, nth=10): # Taylor expansion of (1-cos(x))/x**2. ans = torch.zeros_like(x) denom = 1. for i in range(nth + 1): denom *= (2 * i + 1) * (2 * i + 2) ans = ans + (-1) ** i * x ** (2 * i) / denom return ans def taylor_C(self, x, nth=10): # Taylor expansion of (x-sin(x))/x**3. ans = torch.zeros_like(x) denom = 1. 
class Quaternion():
    """
    Quaternion operations on PyTorch tensors of shape [...,4], stored as
    (qa, qb, qc, qd) with qa the scalar part.
    """

    def q_to_R(self, q):  # [...,4]
        """Convert quaternions [...,4] to rotation matrices [...,3,3]."""
        # https://en.wikipedia.org/wiki/Rotation_matrix#Quaternion
        qa, qb, qc, qd = q.unbind(dim=-1)
        R = torch.stack(
            [torch.stack([1 - 2 * (qc ** 2 + qd ** 2), 2 * (qb * qc - qa * qd), 2 * (qa * qc + qb * qd)], dim=-1),
             torch.stack([2 * (qb * qc + qa * qd), 1 - 2 * (qb ** 2 + qd ** 2), 2 * (qc * qd - qa * qb)], dim=-1),
             torch.stack([2 * (qb * qd - qa * qc), 2 * (qa * qb + qc * qd), 1 - 2 * (qb ** 2 + qc ** 2)], dim=-1)],
            dim=-2)
        return R

    def R_to_q(self, R, eps=1e-6):  # [...,3,3]
        """Convert rotation matrices [...,3,3] to quaternions [...,4].

        ``eps`` is added under each square root to keep it away from zero.
        """
        # https://en.wikipedia.org/wiki/Rotation_matrix#Quaternion
        row0, row1, row2 = R.unbind(dim=-2)
        R00, R01, R02 = row0.unbind(dim=-1)
        R10, R11, R12 = row1.unbind(dim=-1)
        R20, R21, R22 = row2.unbind(dim=-1)
        t = R[..., 0, 0] + R[..., 1, 1] + R[..., 2, 2]
        r = (1 + t + eps).sqrt()
        qa = 0.5 * r
        # The sign of each vector component is taken from the matching
        # antisymmetric part of R.
        qb = (R21 - R12).sign() * 0.5 * (1 + R00 - R11 - R22 + eps).sqrt()
        qc = (R02 - R20).sign() * 0.5 * (1 - R00 + R11 - R22 + eps).sqrt()
        qd = (R10 - R01).sign() * 0.5 * (1 - R00 - R11 + R22 + eps).sqrt()
        q = torch.stack([qa, qb, qc, qd], dim=-1)
        return q

    def invert(self, q):  # [...,4]
        """Return the inverse quaternion (conjugate divided by squared norm)."""
        qa, qb, qc, qd = q.unbind(dim=-1)
        norm = q.norm(dim=-1, keepdim=True)
        q_inv = torch.stack([qa, -qb, -qc, -qd], dim=-1) / norm ** 2
        return q_inv

    def product(self, q1, q2):  # [...,4]
        """Return the Hamilton product q1 * q2."""
        q1a, q1b, q1c, q1d = q1.unbind(dim=-1)
        q2a, q2b, q2c, q2d = q2.unbind(dim=-1)
        hamil_prod = torch.stack([q1a * q2a - q1b * q2b - q1c * q2c - q1d * q2d,
                                  q1a * q2b + q1b * q2a + q1c * q2d - q1d * q2c,
                                  q1a * q2c - q1b * q2d + q1c * q2a + q1d * q2b,
                                  q1a * q2d + q1b * q2c - q1c * q2b + q1d * q2a], dim=-1)
        return hamil_prod

    def interpolate(self, q1, q2, alpha, eps=1e-6):  # [...,4],[...,4],[...,1]
        """Spherical linear interpolation between q1 (alpha=0) and q2 (alpha=1).

        Args:
            q1: quaternions [...,4].
            q2: quaternions [...,4].
            alpha: interpolation parameter [...,1].
            eps: when ``sin(theta)`` is below this value the two inputs are
                treated as the same rotation and linear interpolation is
                used, which avoids the 0/0 of the Slerp formula.

        Returns:
            Interpolated quaternions [...,4].
        """
        # https://en.wikipedia.org/wiki/Slerp
        cos_angle = (q1 * q2).sum(dim=-1, keepdim=True)  # [...,1]
        # q and -q encode the same rotation; flip q1 to take the shorter arc.
        flip = cos_angle < 0
        q1 = torch.where(flip, -q1, q1)  # [...,4]
        # Clamp so rounding above 1 cannot turn acos into NaN.
        theta = cos_angle.abs().clamp(max=1.0).acos()  # [...,1]
        sin_theta = theta.sin()
        degenerate = sin_theta.abs() < eps
        safe_sin = torch.where(degenerate, torch.ones_like(sin_theta), sin_theta)
        slerp = (((1 - alpha) * theta).sin() * q1 + (alpha * theta).sin() * q2) / safe_sin  # [...,4]
        lerp = (1 - alpha) * q1 + alpha * q2  # [...,4]
        return torch.where(degenerate, lerp, slerp)
def convert_NDC2(center, ray, intr):
    """Convert rays to NDC, anchoring each origin on its own image plane.

    Similar to convert_NDC() but the ray origin is shifted by the ray itself
    (center + ray) instead of being moved to a global near plane.

    Args:
        center: Ray origins [...,R,3].
        ray: Ray directions [...,R,3].
        intr: Camera intrinsics [...,3,3].

    Returns:
        image_plane: NDC ray origins [...,R,3].
        ndc_ray: NDC ray directions [...,R,3], assuming t in [0,1].
    """
    scale_x = intr[..., 0, 0] / intr[..., 0, 2]  # [...]
    scale_y = intr[..., 1, 1] / intr[..., 1, 2]  # [...]
    # Get the metric image plane (i.e. new "center"): (sx*cx/cz, sy*cy/cz, 1-2/cz).
    center = center + ray  # This is the key difference.
    cx, cy, cz = center.unbind(dim=-1)  # [...,R]
    # The y coordinate must use the vertical scale (it previously used scale_x,
    # which is only correct when both scales happen to be equal).
    image_plane = torch.stack([scale_x[..., None] * cx / cz,
                               scale_y[..., None] * cy / cz,
                               1 - 2 / cz], dim=-1)
    # Get the infinity plane: (sx*rx/rz, sy*ry/rz, 1).
    rx, ry, rz = ray.unbind(dim=-1)  # [...,R]
    inf_plane = torch.stack([scale_x[..., None] * rx / rz,
                             scale_y[..., None] * ry / rz,
                             torch.ones_like(rz)], dim=-1)
    # The NDC ray is the difference between the two planes, assuming t \in [0,1].
    ndc_ray = inf_plane - image_plane
    return image_plane, ndc_ray
def get_ray_depth_plane_intersection(center, ray, depths):
    """Intersect every ray with a stack of fronto-parallel planes z = depth.

    Args:
        center (tensor [B,HW,3]): Ray origins (camera centers).
        ray (tensor [B,HW,3]): Ray directions.
        depths (tensor [L]): z value of each plane (e.g. the MPI planes).
    Returns:
        intsc_points (tensor [B,HW,L,3]): Intersection of each ray with each plane.
    """
    # A point on the ray is x = c + t*v. With plane normal n = (0,0,1) and
    # offset d, n@x = d gives t = (d - c_z) / v_z.
    origin = center[:, :, None]  # [B,HW,1,3]
    direction = ray[:, :, None]  # [B,HW,1,3]
    plane_z = depths[None, None, :, None]  # [1,1,L,1]
    t = (plane_z - origin[..., 2:]) / direction[..., 2:]  # [B,HW,L,1]
    return origin + t * direction  # [B,HW,L,3]
euler_angles = torch.stack([torch.ones_like(alpha) * np.pi / 2, -beta, alpha], dim=-1) # [...,3] rot2 = angle_to_rotation_matrix(euler_angles[..., 2], axes[2]) # [...,3,3] rot1 = angle_to_rotation_matrix(euler_angles[..., 1], axes[1]) # [...,3,3] rot0 = angle_to_rotation_matrix(euler_angles[..., 0], axes[0]) # [...,3,3] rot = rot2 @ rot1 @ rot0 # [...,3,3] return rot.transpose(-2, -1) def sample_on_spherical_cap(anchor, N, max_angle): """Sample n points on the view hemisphere within the angle to x. Args: anchor (tensor [...,3]): Reference 3-D unit vector on the view hemisphere. N (int): Number of sampled points. max_angle (float): Sampled points should have max angle to x. Returns: sampled_points (tensor [...,N,3]): Sampled points on the spherical caps. """ batch_shape = anchor.shape[:-1] # First, sample uniformly on a unit 2D disk. radius = torch.rand(*batch_shape, N, device=anchor.device) # [...,N] theta = torch.rand(*batch_shape, N, device=anchor.device) * 2 * np.pi # [...,N] x = radius.sqrt() * theta.cos() # [...,N] y = radius.sqrt() * theta.sin() # [...,N] # Reparametrize to a unit spherical cap with height h. # http://marc-b-reynolds.github.io/distribution/2016/11/28/Uniform.html h = 1 - np.cos(max_angle) # spherical cap height k = h * radius # [...,N] s = (h * (2 - k)).sqrt() # [...,N] points = torch.stack([s * x, s * y, 1 - k], dim=-1) # [...,N,3] # Transform to center around the anchor. 
def sample_on_spherical_cap_northern(anchor, N, max_angle, away_from=None, max_reject_count=None):
    """Sample N points on the spherical cap around `anchor`, restricted to z >= 0.

    Samples are drawn with sample_on_spherical_cap() and rejected/resampled
    while any of them lies in the southern hemisphere or (when `away_from` is
    given) is closer in angle to `away_from` than the anchor itself is.

    Args:
        anchor (tensor [...,3]): Unit vectors with positive z.
        N (int): Number of samples per anchor.
        max_angle (float): Maximum angle between a sample and its anchor.
        away_from (tensor [...,3], optional): Direction the samples must not lean towards.
        max_reject_count (int, optional): Maximum number of resampling rounds.
            Once exceeded, every sample falls back to the anchor itself.
            None or 0 means resample until all samples are valid.
    Returns:
        points (tensor [...,N,3]): Sampled points.
    """

    def find_invalid_points(points):
        southern = points[..., 2] < 0  # [...,N]
        if away_from is not None:
            cosine_ab = (away_from * anchor).sum(dim=-1, keepdim=True)  # [...,1]
            cosine_ac = (away_from[..., None, :] * points).sum(dim=-1)  # [...,N]
            not_outwards = cosine_ab < cosine_ac  # [...,N]
            invalid = southern | not_outwards
        else:
            invalid = southern
        return invalid

    assert (anchor[..., 2] > 0).all()
    assert anchor.norm(dim=-1).allclose(torch.ones_like(anchor[..., 0]))
    points = sample_on_spherical_cap(anchor, N, max_angle)  # [...,N,3]
    invalid = find_invalid_points(points)
    count = 0
    while invalid.any():
        # Reject and resample.
        points_resample = sample_on_spherical_cap(anchor, N, max_angle)
        points[invalid] = points_resample[invalid]
        invalid = find_invalid_points(points)
        count += 1
        if max_reject_count and count > max_reject_count:
            # Give up and return the anchor for every sample. Expanding along a
            # new N axis keeps the [...,N,3] layout for batched anchors too, and
            # the break makes the fallback final: without it the loop kept
            # resampling and the reject limit never bounded the iteration count.
            points = anchor[..., None, :].expand(*anchor.shape[:-1], N, anchor.shape[-1]).clone()
            break
    return points
# # For inquiries contact george.drettakis@inria.fr # import math import torch import numpy as np from tqdm import tqdm from scipy.spatial.transform import Rotation as R try: from scene.cameras import Camera except ImportError: pass from tools.general_utils import PILtoTorch, NumpytoTorch from tools.graphics_utils import fov2focal from tools.math_utils import inv_normalize_pts from scene.cameras import SampleCam WARNED = False def loadCam(args, id, cam_info, resolution_scale): orig_w, orig_h = cam_info.image.size if args.resolution in [1, 2, 4, 8]: resolution = round(orig_w/(resolution_scale * args.resolution)), round(orig_h/(resolution_scale * args.resolution)) else: # should be a type that converts to float if args.resolution == -1: if orig_w > 1600: global WARNED if not WARNED: print("[ INFO ] Encountered quite large input images (>1.6K pixels width), rescaling to 1.6K.\n " "If this is not desired, please explicitly specify '--resolution/-r' as 1") WARNED = True global_down = orig_w / 1600 else: global_down = 1 else: global_down = orig_w / args.resolution scale = float(global_down) * float(resolution_scale) resolution = (int(orig_w / scale), int(orig_h / scale)) resized_image_rgb = PILtoTorch(cam_info.image, resolution) / 255. gt_image = resized_image_rgb[:3, ...] loaded_mask = None if resized_image_rgb.shape[0] == 4: loaded_mask = resized_image_rgb[3:4, ...] 
def find_up_axis(R):
    '''
    Find which bounding-box axis the world "up" direction maps to.

    R: world to bounding box coordinate system
    return: (index of the axis with the largest absolute component, sign of that component)
    '''
    # World up is taken to be -Y (noted as the colmap world frame in the original code).
    world_up = torch.tensor([0, -1, 0], dtype=torch.float32, device=R.device)
    up_in_box = torch.matmul(R, world_up)  # expressed in the bounding box frame
    dominant = up_in_box.abs().argmax()
    return dominant, up_in_box[dominant].sign()
def dot(x, y):
    """Dot product along the last axis, keeping that axis with size 1.

    Dispatches on the type of `x`: numpy arrays are reduced with numpy,
    anything else is reduced with torch.
    """
    product = x * y
    if not isinstance(x, np.ndarray):
        return torch.sum(product, -1, keepdim=True)
    return np.sum(product, -1, keepdims=True)
def look_at(campos, target, opengl=True, up_vector=None):
    """Build camera rotation matrices that look from `campos` towards `target`.

    Args:
        campos: [N, 3] (or [3]) camera/eye position. Tensors are used as-is;
            other array-likes are converted to float32 tensors.
        target: [N, 3] (or broadcastable) point to look at.
        opengl: If True the forward axis is `campos - target` (pointing away
            from the target); otherwise it is `target - campos`.
        up_vector: Optional reference up direction ([3] or [N, 3]).
            Defaults to [0, 1, 0].

    Returns:
        Rotation matrices stacked as [right, up, forward] along dim 1. For
        batched [N, 3] inputs this yields [N, 3, 3] with the axes as rows;
        for 1-D [3] inputs the axes end up as columns.
    """
    if not torch.is_tensor(campos):
        campos = torch.as_tensor(campos, dtype=torch.float32)
    if not torch.is_tensor(target):
        target = torch.as_tensor(target, dtype=campos.dtype, device=campos.device)

    up_dtype = campos.dtype if campos.is_floating_point() else torch.float32
    if up_vector is None:
        up_vector = [0, 1, 0]
    up_vector = torch.as_tensor(up_vector, dtype=up_dtype, device=campos.device)
    if campos.dim() == 2 and up_vector.dim() == 1:
        up_vector = up_vector[None, :]

    # dim=-1 is passed explicitly: without it torch.cross picks the FIRST
    # dimension of size 3, which is the batch dimension when N == 3.
    if not opengl:
        forward_vector = safe_normalize(target - campos)
        right_vector = safe_normalize(
            torch.cross(forward_vector, up_vector.expand_as(forward_vector), dim=-1))  # z x up
        up_vector = safe_normalize(torch.cross(right_vector, forward_vector, dim=-1))
    else:
        forward_vector = safe_normalize(campos - target)
        right_vector = safe_normalize(
            torch.cross(up_vector.expand_as(forward_vector), forward_vector, dim=-1))  # up x z
        up_vector = safe_normalize(torch.cross(forward_vector, right_vector, dim=-1))
    # dim=1: for batched inputs the three axes are stacked as rows (w2c rotation).
    R = torch.stack([right_vector, up_vector, forward_vector], dim=1)
    return R
def around_camera(n, trans, scale, height=None, target=None, opengl=False):
    """Sample `n` world-to-camera poses around the normalized bounding box.

    Positions are drawn uniformly in [-1, 1]^3, some of them are snapped onto
    the +/-1 faces of the two non-up axes, then mapped to world space with
    inv_normalize_pts() and placed at a common height.

    Args:
        n: Number of cameras.
        trans: Normalization transform (tensor or numpy array); its upper-left
            3x3 block is used as the world-to-box rotation.
        scale: Normalization scale (tensor or numpy array).
        height: Value written to world coordinate 1 of every camera position.
            Defaults to coordinate 1 of the (de-normalized) target.
        target: [1, 3] look-at point in normalized space (tensor or numpy
            array). Defaults to the origin.
        opengl: Forwarded to look_at().

    Returns:
        [n, 4, 4] tensor of world-to-camera matrices.
    """
    trans = check_tensor(trans)
    scale = check_tensor(scale)
    device = trans.device

    up_axis, up_sign = find_up_axis(trans[:3, :3])
    v_axis = [i for i in [0, 1, 2] if i != up_axis]

    xyz = torch.rand(n, 3).to(device) * 2 - 1
    for i in v_axis:
        # Snap the selected rows onto the +/-1 face of axis i.
        # NOTE(review): for i == 0 the slice start is -1, i.e. only the last
        # row is selected; kept as in the original sampling pattern.
        xyz[i-1::2, i] = xyz[i-1::2, i] / torch.abs(xyz[i-1::2, i])

    if target is None:
        target = torch.zeros([1, 3], dtype=torch.float32, device=device)
    else:
        # A caller-supplied target may be a numpy array: convert it like
        # trans/scale (previously the conversion only ran on the None path,
        # where it had no effect).
        target = check_tensor(target).to(device)

    xyz = inv_normalize_pts(xyz, trans, scale)
    target = inv_normalize_pts(target, trans, scale)

    if height is None:
        height = target[0, 1]
    xyz[:, 1] = height

    T = torch.zeros((xyz.shape[0], 4, 4), device=device)  # w2c
    R = look_at(xyz, target, opengl)  # w2c
    T[:, :3, :3] = R
    T[:, :3, 3] = - (R @ xyz[..., None]).squeeze(-1)  # w2c translation: -R @ camera position
    T[:, 3, 3] = 1
    return T
def around_grid_posi(num_points, scale, right_axis, up_axis, front_axis, sign=1, up_sign=1):
    """Lay out a regular grid of points on the four side faces of the [-1, 1]^3 cube.

    The number of grid steps along each edge is proportional to the box extent
    along that edge (taken from `scale`), so that roughly `num_points` points
    cover the four faces in total.

    Args:
        num_points: Approximate total number of points.
        scale: Tensor holding the box extent per axis; also defines the device.
        right_axis, up_axis, front_axis: Indices of the three box axes.
        sign: With 1 every horizontal sweep starts on the cube edge; any other
            value shifts the start by one grid step.
        up_sign: Multiplier applied to the coordinates along the up axis.

    Returns:
        [M, 3] float32 tensor: the faces front=+1, front=-1, right=+1 and
        right=-1, concatenated in that order.
    """
    device = scale.device
    height = scale[up_axis]
    length_ = scale[right_axis]
    width = scale[front_axis]

    # Points per unit length such that the side area holds ~num_points points.
    density = (num_points / (2 * (length_ * height + height * width))).sqrt()
    n_h = torch.round(height * density).int()
    n_l = torch.round(length_ * density).int()
    n_w = torch.round(width * density).int()

    def sweep(begin, finish, count):
        # Half-open range from begin towards finish, split into `count` steps.
        step = (finish - begin) / count
        first = begin if sign == 1 else begin + step
        return torch.arange(start=first, end=finish, step=step, device=device)

    def face(coord_a, coord_b, axes, fill):
        # Cartesian grid over two axes; the remaining axis is pinned to `fill`.
        grid_a, grid_b = torch.meshgrid([coord_a, coord_b], indexing='xy')
        plane = torch.stack([grid_a.flatten(), grid_b.flatten()], dim=1)
        pts = torch.full([plane.shape[0], 3], float(fill), dtype=torch.float32, device=device)
        pts[:, axes] = plane
        return pts

    h_coord = torch.arange(start=-1, end=1, step=2 / n_h, device=device) * up_sign

    faces = [
        face(sweep(-1, 1, n_l), h_coord, [right_axis, up_axis], 1),
        face(sweep(1, -1, n_l), h_coord, [right_axis, up_axis], -1),   # back
        face(h_coord, sweep(1, -1, n_w), [up_axis, front_axis], 1),    # right
        face(h_coord, sweep(-1, 1, n_w), [up_axis, front_axis], -1),   # left
    ]
    return torch.cat(faces, 0)
def grid_camera(trans, scale, opengl=False, target=None):
    """Place one camera on each of the 8 corners of the normalized cube.

    Args:
        trans: Normalization transform (tensor or numpy array).
        scale: Normalization scale (tensor or numpy array).
        opengl: Forwarded to look_at().
        target: Optional [1, 3] look-at point in normalized space (tensor or
            numpy array). Defaults to the origin. The parameter is new: the
            function previously read `target` before it was ever assigned,
            so every call raised UnboundLocalError.

    Returns:
        [8, 4, 4] tensor of world-to-camera matrices.
    """
    trans = check_tensor(trans)
    scale = check_tensor(scale)
    device = trans.device

    # The 8 corners of [-1, 1]^3.
    xyz = torch.tensor(
        [
            [-1, -1, -1],
            [1, 1, 1],
            [-1, 1, 1],
            [1, -1, -1],
            [-1, 1, -1],
            [1, -1, 1],
            [1, 1, -1],
            [-1, -1, 1],
        ],
        dtype=torch.float32,
        device=device,
    )

    if target is None:
        target = torch.zeros([1, 3], dtype=torch.float32, device=device)
    else:
        target = check_tensor(target).to(device)

    xyz = inv_normalize_pts(xyz, trans, scale)
    target = inv_normalize_pts(target, trans, scale)

    T = torch.zeros((xyz.shape[0], 4, 4), device=device)  # w2c
    R = look_at(xyz, target, opengl)  # w2c
    T[:, :3, :3] = R
    T[:, :3, 3] = - (R @ xyz[..., None]).squeeze(-1)  # w2c translation: -R @ camera position
    T[:, 3, 3] = 1
    return T
@property def fovx(self): return 2 * np.arctan(np.tan(self.fovy / 2) * self.W / self.H) @property def campos(self): return self.pose[:3, 3] # pose (c2w) @property def pose(self): # first move camera to radius res = np.eye(4, dtype=np.float32) res[2, 3] = self.radius # opengl convention... # rotate rot = np.eye(4, dtype=np.float32) rot[:3, :3] = self.rot.as_matrix() res = rot @ res # translate res[:3, 3] -= self.center return res # view (w2c) @property def view(self): return np.linalg.inv(self.pose) # projection (perspective) @property def perspective(self): y = np.tan(self.fovy / 2) aspect = self.W / self.H return np.array( [ [1 / (y * aspect), 0, 0, 0], [0, -1 / y, 0, 0], [ 0, 0, -(self.far + self.near) / (self.far - self.near), -(2 * self.far * self.near) / (self.far - self.near), ], [0, 0, -1, 0], ], dtype=np.float32, ) # intrinsics @property def intrinsics(self): focal = self.H / (2 * np.tan(self.fovy / 2)) return np.array([focal, focal, self.W // 2, self.H // 2], dtype=np.float32) @property def mvp(self): return self.perspective @ np.linalg.inv(self.pose) # [4, 4] def orbit(self, dx, dy): # rotate along camera up/side axis! side = self.rot.as_matrix()[:3, 0] rotvec_x = self.up * np.radians(-0.05 * dx) rotvec_y = side * np.radians(-0.05 * dy) self.rot = R.from_rotvec(rotvec_x) * R.from_rotvec(rotvec_y) * self.rot def scale(self, delta): self.radius *= 1.1 ** (-delta) def pan(self, dx, dy, dz=0): # pan in camera coordinate system (careful on the sensitivity!) 
def align_gt_with_cam(pts, trans):
    """Apply the inverse of the 4x4 rigid transform `trans` to the points `pts`.

    Args:
        pts: [N, 3] array of points.
        trans: [4, 4] homogeneous transform.

    Returns:
        [N, 3] array: R_inv @ p + t_inv for every point p.
    """
    inverse = np.linalg.inv(trans)
    rotation = inverse[:3, :3]
    shift = inverse[:3, -1]
    # Row-vector form of R @ p: multiply by the transposed rotation.
    return pts @ np.swapaxes(rotation, -1, -2) + shift
def remove_statistical_outlier(xyz, nb_points=20, std_ratio=20.):
    """Return a boolean mask over the points of `xyz` based on their k-NN distances.

    Args:
        xyz: Point tensor; a 2-D input gets a leading batch dimension added.
        nb_points: Number of neighbours `K` requested from knn_points().
        std_ratio: Multiplier of the per-point standard deviation in the threshold.

    Returns:
        Boolean tensor with one entry per point (True = keep).
    """
    batched = xyz[None] if xyz.dim() == 2 else xyz
    knn_result = knn_points(batched, batched, K=nb_points, return_sorted=False)

    # First element of the result: distances to the neighbours, (N, nb_points)
    # once the batch dimension is squeezed out.
    neighbor_dists = knn_result[0].squeeze(0)

    # Per-point threshold: mean + std_ratio * std of that point's own distances.
    cutoff = neighbor_dists.mean(dim=-1) + std_ratio * neighbor_dists.std(dim=-1)

    # NOTE(review): a point is kept as soon as ANY of its neighbour distances is
    # below its own threshold; since the smallest distance cannot exceed the
    # mean, this looks like it keeps (almost) every point -- confirm the intent.
    return (neighbor_dists <= cutoff.unsqueeze(1)).any(dim=1)
@torch.no_grad()
def tsdf_fusion(args, cfg, model, cameras, dirs, bg, outdir, mesh_name='fused_mesh.ply', max_depth=5.0):
    """Render every camera, fuse the depth/color maps into an Open3D voxel block grid and save the mesh.

    Args:
        args: Options object; reads voxel_size, depth_mode ('mean' or 'median'),
            alpha_thres, prob_thres and clean.
        cfg: Configuration forwarded to render().
        model: Gaussian model; reads trans, scale and logits2prob().
        cameras: Iterable of views; reads gt_alpha_mask, intr,
            world_view_transform, image_width, image_height, FoVx and FoVy.
        dirs: Forwarded to render() as `dirs`.
        bg: Background color forwarded to render().
        outdir: Directory receiving the mesh and 'voxel_size.txt'.
        mesh_name: File name of the exported mesh.
        max_depth: Maximum depth passed to the Open3D integration calls.
    """
    # The grid is hard-wired to the first CUDA device.
    o3d_device = o3d.core.Device("CUDA:0")
    vbg = o3d.t.geometry.VoxelBlockGrid(
        attr_names=('tsdf', 'weight', 'color'),
        attr_dtypes=(o3c.float32, o3c.float32, o3c.float32),
        attr_channels=((1), (1), (3)),
        voxel_size=args.voxel_size,
        block_resolution=16,
        block_count=60000,
        device=o3d_device)

    with torch.no_grad():
        for _, view in enumerate(cameras):
            render_pkg = render(view, model, cfg, bg, dirs=dirs)
            # NOTE(review): any other depth_mode leaves `depth` unassigned and
            # fails at its first use below.
            if args.depth_mode == 'mean':
                depth = render_pkg["depth"]
            elif args.depth_mode == 'median':
                depth = render_pkg["median_depth"]
            rgb = render_pkg["render"]
            alpha = render_pkg["alpha"]

            # Zeroed depth pixels mark "no measurement": masked-out foreground,
            # low opacity, points outside the normalized box, background class.
            if view.gt_alpha_mask is not None:
                depth[(view.gt_alpha_mask < 0.5)] = 0
            depth[(alpha < args.alpha_thres)] = 0

            # depth[0] suggests a leading channel dimension -- TODO confirm the layout.
            rendered_pcd_world = depth2point(depth[0], view.intr, view.world_view_transform.transpose(0, 1))[1]
            inside = get_inside_normalized(rendered_pcd_world.view(-1, 3), model.trans, model.scale)[0]
            depth.view(-1)[~inside] = 0

            if 'render_sem' in render_pkg:
                semantic = render_pkg["render_sem"]
                prob = model.logits2prob(semantic)
                mask = (prob[..., BACKGROUND] > args.prob_thres)[None]
                depth[mask] = 0

            # Pinhole intrinsics from the field of view, principal point at the image center.
            intrinsic=o3d.camera.PinholeCameraIntrinsic(width=view.image_width,
                    height=view.image_height,
                    cx = view.image_width/2,
                    cy = view.image_height/2,
                    fx = view.image_width / (2 * math.tan(view.FoVx / 2.)),
                    fy = view.image_height / (2 * math.tan(view.FoVy / 2.)))
            # The transposed world_view_transform is used as the extrinsic
            # (presumably the matrix is stored transposed -- verify against the camera class).
            extrinsic = np.asarray((view.world_view_transform.T).cpu().numpy())

            rgb = rgb.clamp(0, 1)
            # Channel-first tensors are permuted to H x W x C, C-contiguous arrays.
            o3d_color = o3d.t.geometry.Image(np.asarray(rgb.permute(1,2,0).cpu().numpy(), order="C"))
            o3d_depth = o3d.t.geometry.Image(np.asarray(depth.permute(1,2,0).cpu().numpy(), order="C"))
            o3d_color = o3d_color.to(o3d_device)
            o3d_depth = o3d_depth.to(o3d_device)

            intrinsic = o3d.core.Tensor(intrinsic.intrinsic_matrix, o3d.core.Dtype.Float64)#.to(o3d_device)
            extrinsic = o3d.core.Tensor(extrinsic, o3d.core.Dtype.Float64)#.to(o3d_device)

            # Depth scale 1.0: the depth values are passed unscaled.
            frustum_block_coords = vbg.compute_unique_block_coordinates(
                o3d_depth, intrinsic, extrinsic, 1.0, max_depth)

            # The same intrinsic matrix is passed twice, for the depth and the color image.
            vbg.integrate(frustum_block_coords, o3d_depth, o3d_color, intrinsic,
                          intrinsic, extrinsic, 1.0, max_depth)

    mesh = vbg.extract_triangle_mesh().to_legacy()

    # write mesh
    o3d.io.write_triangle_mesh(os.path.join(outdir, mesh_name), mesh)

    # Clean Mesh
    if args.clean:
        # Imported lazily so pymeshlab is only needed when cleaning is requested.
        import pymeshlab
        ms = pymeshlab.MeshSet()
        ms.load_new_mesh(os.path.join(outdir, mesh_name))
        ms.meshing_remove_unreferenced_vertices()
        ms.meshing_remove_duplicate_faces()
        ms.meshing_remove_null_faces()
        ms.meshing_remove_connected_component_by_face_number(mincomponentsize=20000)
        # The cleaned mesh overwrites the file written above.
        ms.save_current_mesh(os.path.join(outdir, mesh_name))

    # Record the voxel size next to the mesh.
    with open(os.path.join(outdir, 'voxel_size.txt'), 'w') as f:
        f.write(f'voxel_size: {args.voxel_size}')
args.unbounded: mesh = gaussExtractor.extract_mesh_unbounded(resolution=args.mesh_res) else: mesh = gaussExtractor.extract_mesh_bounded(voxel_size=args.voxel_size, sdf_trunc=5*args.voxel_size, depth_trunc=max_depth) o3d.io.write_triangle_mesh(os.path.join(outdir, mesh_name), mesh) print("mesh saved at {}".format(os.path.join(outdir, mesh_name))) # post-process the mesh and save, saving the largest N clusters mesh_post = post_process_mesh(mesh, cluster_to_keep=args.num_cluster) o3d.io.write_triangle_mesh(os.path.join(outdir, mesh_name), mesh_post) return def main(args): cfg = Config(args.cfg_path) cfg.model.data_device = 'cpu' cfg.model.load_normal = False cfg.model.load_mask = False args.voxel_size = cfg.model.mesh.voxel_size if args.voxel_size == 0 else args.voxel_size set_random_seed(cfg.seed) model = GaussianModel(cfg.model) scene = Scene(cfg.model, model, load_iteration=-1, shuffle=False) model.trans = torch.from_numpy(scene.trans).cuda() model.scale = torch.from_numpy(scene.scale).cuda() * 1.1 model.extent = scene.cameras_extent cameras = scene.getTrainCameras().copy()[::args.split] model.training_setup(cfg.optim) model.max_radii2D = torch.zeros((model.get_xyz.shape[0]), device="cuda") model.scale = torch.from_numpy(scene.scale).cuda() model.prune_outliers() bg_color = [1, 1, 1] if cfg.model.white_background else [0, 0, 0] background = torch.tensor(bg_color, dtype=torch.float32, device="cuda") print(f'Fusing into {args.mesh_name} vs: {args.voxel_size}...') if args.method == 'tsdf': dirs = scene.dirs max_depth = (model.scale ** 2).sum().sqrt().item() max_depth = args.max_depth tsdf_fusion(args, cfg, model, cameras, dirs, background, cfg.logdir, args.mesh_name, max_depth) elif args.method == 'tsdf_cpu': dirs = scene.dirs max_depth = args.max_depth tsdf_cpu(args, cfg, model, cameras, dirs, background, cfg.logdir, args.mesh_name, max_depth) return if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--input', type=str, default='Barn') 
parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--mesh_name', type=str, default='vcr_gaus.ply') parser.add_argument('--scene', type=str, default='Barn') parser.add_argument('--data_path', type=str, default='Barn') parser.add_argument('--method', type=str, default='tsdf', choices=['tsdf', 'point2mesh', 'tsdf_cpu']) parser.add_argument('--depth_mode', type=str, default='mean', choices=['mean', 'median']) parser.add_argument('--rec_method', type=str, default='poisson', choices=['nksr', 'poisson']) parser.add_argument('--split', type=int, default=3) parser.add_argument('--resolution', type=float, default=1.0) parser.add_argument('--detail_level', type=float, default=1.0) parser.add_argument('--voxel_size', type=float, default=5e-3) parser.add_argument('--sdf_trunc', type=float, default=0.08) parser.add_argument('--alpha_thres', type=float, default=0.5) parser.add_argument('--prob_thres', type=float, default=0.15) parser.add_argument('--mise_iter', type=int, default=1) parser.add_argument('--depth', type=int, default=9) parser.add_argument('--max_depth', type=float, default=6.0) parser.add_argument('--est_normal', action='store_true') parser.add_argument('--cfg_path', type=str, default='configs/config_base.yaml') parser.add_argument('--clean', action='store_true', help='perform a clean operation') parser.add_argument("--unbounded", action="store_true", help='Mesh: using unbounded mode for meshing') parser.add_argument("--num_cluster", default=1000, type=int, help='Mesh: number of connected clusters to export') args = parser.parse_args() main(args) ================================================ FILE: tools/distributed.py ================================================ ''' ----------------------------------------------------------------------------- Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
NVIDIA CORPORATION and its licensors retain all intellectual property and proprietary rights in and to this software, related documentation and any modifications thereto. Any use, reproduction, disclosure or distribution of this software and related documentation without an express license agreement from NVIDIA CORPORATION is strictly prohibited. ----------------------------------------------------------------------------- ''' import functools import ctypes import torch import torch.distributed as dist from contextlib import contextmanager def init_dist(local_rank, backend='nccl', **kwargs): r"""Initialize distributed training""" if dist.is_available(): if dist.is_initialized(): return torch.cuda.current_device() torch.cuda.set_device(local_rank) dist.init_process_group(backend=backend, init_method='env://', **kwargs) # Increase the L2 fetch granularity for faster speed. _libcudart = ctypes.CDLL('libcudart.so') # Set device limit on the current device # cudaLimitMaxL2FetchGranularity = 0x05 pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int)) _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128)) _libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05)) def get_rank(): r"""Get rank of the thread.""" rank = 0 if dist.is_available(): if dist.is_initialized(): rank = dist.get_rank() return rank def get_world_size(): r"""Get world size. How many GPUs are available in this job.""" world_size = 1 if dist.is_available(): if dist.is_initialized(): world_size = dist.get_world_size() return world_size def broadcast_object_list(message, src=0): r"""Broadcast object list from the master to the others""" # Send logdir from master to all workers. 
if dist.is_available(): if dist.is_initialized(): torch.distributed.broadcast_object_list(message, src=src) return message def master_only(func): r"""Apply this function only to the master GPU.""" @functools.wraps(func) def wrapper(*args, **kwargs): r"""Simple function wrapper for the master function""" if get_rank() == 0: return func(*args, **kwargs) else: return None return wrapper def is_master(): r"""check if current process is the master""" return get_rank() == 0 def is_dist(): return dist.is_initialized() def barrier(): if is_dist(): dist.barrier() @contextmanager def master_first(): if not is_master(): barrier() yield if dist.is_initialized() and is_master(): barrier() def is_local_master(): return torch.cuda.current_device() == 0 @master_only def master_only_print(*args): r"""master-only print""" print(*args) def dist_reduce_tensor(tensor, rank=0, reduce='mean'): r""" Reduce to rank 0 """ world_size = get_world_size() if world_size < 2: return tensor with torch.no_grad(): dist.reduce(tensor, dst=rank) if get_rank() == rank: if reduce == 'mean': tensor /= world_size elif reduce == 'sum': pass else: raise NotImplementedError return tensor def dist_all_reduce_tensor(tensor, reduce='mean'): r""" Reduce to all ranks """ world_size = get_world_size() if world_size < 2: return tensor with torch.no_grad(): dist.all_reduce(tensor) if reduce == 'mean': tensor /= world_size elif reduce == 'sum': pass else: raise NotImplementedError return tensor def dist_all_gather_tensor(tensor): r""" gather to all ranks """ world_size = get_world_size() if world_size < 2: return [tensor] tensor_list = [ torch.ones_like(tensor) for _ in range(dist.get_world_size())] with torch.no_grad(): dist.all_gather(tensor_list, tensor) return tensor_list ================================================ FILE: tools/general_utils.py ================================================ # # Copyright (C) 2023, Inria # GRAPHDECO research group, https://team.inria.fr/graphdeco # All rights reserved. 
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact george.drettakis@inria.fr
#

import torch
import sys
from datetime import datetime
import numpy as np
import random
import torchvision.transforms.functional as torchvision_F
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


def inverse_sigmoid(x):
    """Return the logit ``log(x / (1 - x))`` of ``x``."""
    return torch.log(x/(1-x))


def PILtoTorch(pil_image, resolution):
    """Resize a PIL image to ``resolution`` and return it as a channel-first tensor."""
    resized_image_PIL = pil_image.resize(resolution)
    resized_image = torch.from_numpy(np.array(resized_image_PIL))
    if len(resized_image.shape) == 3:
        return resized_image.permute(2, 0, 1)
    else:
        # Single-channel image: add the channel axis before permuting.
        return resized_image.unsqueeze(dim=-1).permute(2, 0, 1)


def NumpytoTorch(image, resolution):
    """Convert a numpy image to a channel-first tensor resized to ``resolution``.

    A leading batch axis of a 4-D input is squeezed, and a trailing channel
    axis of size 1 or 3 is moved to the front. No resize happens when
    ``resolution`` equals ``[height, width]`` of the converted image.
    """
    image = torch.from_numpy(image)
    if image.ndim == 4:
        image = image.squeeze(0)
    if image.shape[-1] == 3 or image.shape[-1] == 1:
        image = image.permute(2, 0, 1)
    _, orig_h, orig_w = image.shape
    if resolution == [orig_h, orig_w]:
        resized_image = image
    else:
        resized_image = torchvision_F.resize(image, resolution, antialias=True)
    return resized_image


def get_expon_lr_func(
    lr_init, lr_final, lr_delay_steps=0, lr_delay_mult=1.0, max_steps=1000000
):
    """
    Copied from Plenoxels

    Continuous learning rate decay function. Adapted from JaxNeRF
    The returned rate is lr_init when step=0 and lr_final when step=max_steps, and
    is log-linearly interpolated elsewhere (equivalent to exponential decay).
    If lr_delay_steps>0 then the learning rate will be scaled by some smooth
    function of lr_delay_mult, such that the initial learning rate is
    lr_init*lr_delay_mult at the beginning of optimization but will be eased back
    to the normal learning rate when steps>lr_delay_steps.
    :param lr_init: learning rate at step 0.
    :param lr_final: learning rate at step max_steps.
    :param lr_delay_steps: number of warm-up steps (0 disables the delay).
    :param lr_delay_mult: multiplier applied at the start of the warm-up.
    :param max_steps: int, the number of steps during optimization.
    :return HoF which takes step as input
    """

    def helper(step):
        if step < 0 or (lr_init == 0.0 and lr_final == 0.0):
            # Disable this parameter
            return 0.0
        if lr_delay_steps > 0:
            # A kind of reverse cosine decay.
            delay_rate = lr_delay_mult + (1 - lr_delay_mult) * np.sin(
                0.5 * np.pi * np.clip(step / lr_delay_steps, 0, 1)
            )
        else:
            delay_rate = 1.0
        t = np.clip(step / max_steps, 0, 1)
        log_lerp = np.exp(np.log(lr_init) * (1 - t) + np.log(lr_final) * t)
        return delay_rate * log_lerp

    return helper


def strip_lowerdiag(L):
    """Pack entries (0,0), (0,1), (0,2), (1,1), (1,2), (2,2) of each 3x3 matrix into 6 values.

    The result is float32 and lives on the same device as ``L``.
    """
    uncertainty = torch.zeros((L.shape[0], 6), dtype=torch.float, device=L.device)

    uncertainty[:, 0] = L[:, 0, 0]
    uncertainty[:, 1] = L[:, 0, 1]
    uncertainty[:, 2] = L[:, 0, 2]
    uncertainty[:, 3] = L[:, 1, 1]
    uncertainty[:, 4] = L[:, 1, 2]
    uncertainty[:, 5] = L[:, 2, 2]
    return uncertainty


def strip_symmetric(sym):
    """Alias of :func:`strip_lowerdiag` for symmetric matrices."""
    return strip_lowerdiag(sym)


def build_rotation(r):
    """Convert quaternions ``r`` of shape (N, 4), ordered (w, x, y, z), to rotation matrices.

    The quaternions are normalized first; the result is created on the
    device of ``r``.
    """
    norm = torch.sqrt(r[:,0]*r[:,0] + r[:,1]*r[:,1] + r[:,2]*r[:,2] + r[:,3]*r[:,3])

    q = r / norm[:, None]

    R = torch.zeros((q.size(0), 3, 3), device=q.device)

    r = q[:, 0]
    x = q[:, 1]
    y = q[:, 2]
    z = q[:, 3]

    R[:, 0, 0] = 1 - 2 * (y*y + z*z)
    R[:, 0, 1] = 2 * (x*y - r*z)
    R[:, 0, 2] = 2 * (x*z + r*y)
    R[:, 1, 0] = 2 * (x*y + r*z)
    R[:, 1, 1] = 1 - 2 * (x*x + z*z)
    R[:, 1, 2] = 2 * (y*z - r*x)
    R[:, 2, 0] = 2 * (x*z - r*y)
    R[:, 2, 1] = 2 * (y*z + r*x)
    R[:, 2, 2] = 1 - 2 * (x*x + y*y)
    return R


def build_scaling_rotation(s, r):
    """Return ``R @ diag(s)`` for per-row scales ``s`` (N, 3) and quaternions ``r`` (N, 4)."""
    L = torch.zeros((s.shape[0], 3, 3), dtype=torch.float, device=s.device)
    R = build_rotation(r)

    L[:,0,0] = s[:,0]
    L[:,1,1] = s[:,1]
    L[:,2,2] = s[:,2]

    L = R @ L
    return L


def safe_state(silent):
    """Replace ``sys.stdout`` with a wrapper that timestamps line ends or silences output."""
    old_f = sys.stdout

    class F:
        def __init__(self, silent):
            self.silent = silent

        def write(self, x):
            if not self.silent:
                if x.endswith("\n"):
                    # Append a "[dd/mm HH:MM:SS]" stamp before every newline.
                    old_f.write(x.replace("\n", " [{}]\n".format(str(datetime.now().strftime("%d/%m %H:%M:%S")))))
                else:
                    old_f.write(x)

        def flush(self):
            old_f.flush()

    sys.stdout = F(silent)


def set_random_seed(seed):
    r"""Set random seeds for everything, including random, numpy, torch.manual_seed, torch.cuda_manual_seed.
    torch.cuda.manual_seed_all is not necessary (included in torch.manual_seed)

    Args:
        seed (int): Random seed.
    """
    print(f"Using random seed {seed}")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)  # sets seed on the current CPU & all GPUs
    torch.cuda.manual_seed(seed)  # sets seed on current GPU
    # torch.cuda.manual_seed_all(seed)  # included in torch.manual_seed
    if torch.cuda.is_available():
        # Selecting a CUDA device raises on CPU-only machines.
        torch.cuda.set_device(torch.device("cuda:0"))


# ================================================
# FILE: tools/graphics_utils.py
# ================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact george.drettakis@inria.fr
#

import torch
import math
import numpy as np
from typing import NamedTuple


class BasicPointCloud(NamedTuple):
    """Point cloud with per-point positions, colors and normals."""
    points : np.array
    colors : np.array
    normals : np.array


def geom_transform_points(points, transf_matrix):
    """Apply a 4x4 transform to row-vector points (P, 3) and divide by the homogeneous coordinate."""
    P, _ = points.shape
    ones = torch.ones(P, 1, dtype=points.dtype, device=points.device)
    points_hom = torch.cat([points, ones], dim=1)
    points_out = torch.matmul(points_hom, transf_matrix.unsqueeze(0))

    # Small epsilon keeps the perspective divide finite.
    denom = points_out[..., 3:] + 0.0000001
    return (points_out[..., :3] / denom).squeeze(dim=0)


def getWorld2View(R, t):
    """Build a float32 4x4 matrix with rotation block ``R.T`` and translation ``t``."""
    Rt = np.zeros((4, 4))
    Rt[:3, :3] = R.transpose()
    Rt[:3, 3] = t
    Rt[3, 3] = 1.0
    return np.float32(Rt)


def getWorld2View2(R, t, translate=np.array([.0, .0, .0]), scale=1.0):
    """Like :func:`getWorld2View`, but the camera center is shifted by ``translate`` and scaled by ``scale``."""
    Rt = np.zeros((4, 4))  # w2c
    Rt[:3, :3] = R.transpose()  # w2c
    Rt[:3, 3] = t  # w2c
    Rt[3, 3] = 1.0

    C2W = np.linalg.inv(Rt)  # c2w
    cam_center = C2W[:3, 3]
    cam_center = (cam_center + translate) * scale
    C2W[:3, 3] = cam_center
    Rt = np.linalg.inv(C2W)  # w2c
    return np.float32(Rt)


def getView2World(R, t):
    '''
    Build the 4x4 matrix ``[[R.T, -R.T @ t], [0, 1]]``.

    R: w2c
    t: w2c
    '''
    Rt = np.zeros((4, 4))
    Rt[:3, :3] = R.transpose()  # c2w
    Rt[:3, 3] = -R.transpose() @ t  # c2w
    Rt[3, 3] = 1.0
    return Rt


def getProjectionMatrix(znear, zfar, fovX, fovY):
    '''
    normalized intrinsics

    Returns a 4x4 perspective projection tensor for the given near/far planes
    and fields of view (radians).
    '''
    tanHalfFovY = math.tan((fovY / 2))
    tanHalfFovX = math.tan((fovX / 2))

    top = tanHalfFovY * znear
    bottom = -top
    right = tanHalfFovX * znear
    left = -right

    P = torch.zeros(4, 4)

    z_sign = 1.0

    P[0, 0] = 2.0 * znear / (right - left)
    P[1, 1] = 2.0 * znear / (top - bottom)
    P[0, 2] = (right + left) / (right - left)
    P[1, 2] = (top + bottom) / (top - bottom)
    P[3, 2] = z_sign
    P[2, 2] = z_sign * zfar / (zfar - znear)
    P[2, 3] = -(zfar * znear) / (zfar - znear)
    return P


def getIntrinsic(fovX, fovY, h, w):
    """Return a 3x3 float32 pinhole intrinsic tensor with the principal point at the image center."""
    focal_length_y = fov2focal(fovY, h)
    focal_length_x = fov2focal(fovX, w)

    intrinsic = torch.eye(3, dtype=torch.float32)
    intrinsic[0, 0] = focal_length_x  # FovX
    intrinsic[1, 1] = focal_length_y  # FovY
    intrinsic[0, 2] = w / 2
    intrinsic[1, 2] = h / 2

    return intrinsic


def fov2focal(fov, pixels):
    """Convert a field of view (radians) over ``pixels`` to a focal length in pixels."""
    return pixels / (2 * math.tan(fov / 2))


def focal2fov(focal, pixels):
    """Convert a focal length in pixels over ``pixels`` to a field of view (radians)."""
    return 2*math.atan(pixels/(2*focal))


def ndc_2_cam(ndc_xyz, intrinsic, W, H):
    """Lift normalized pixel coordinates plus depth to camera space using ``intrinsic[0]``."""
    inv_scale = torch.tensor([[W - 1, H - 1]], device=ndc_xyz.device)
    cam_z = ndc_xyz[..., 2:3]
    # Scale normalized xy back to pixels and multiply by depth before un-projecting.
    cam_xy = ndc_xyz[..., :2] * inv_scale * cam_z
    cam_xyz = torch.cat([cam_xy, cam_z], dim=-1)
    cam_xyz = cam_xyz @ torch.inverse(intrinsic[0, ...].t())
    return cam_xyz


def depth2point_cam(sampled_depth, ref_intrinsic):
    """Back-project a (B, N, C, H, W) depth tensor to camera-space points.

    Returns ``(ndc_xyz, cam_xyz)``, both shaped (B, N, C, H, W, 3).
    """
    B, N, C, H, W = sampled_depth.shape
    valid_z = sampled_depth
    valid_x = torch.arange(W, dtype=torch.float32, device=sampled_depth.device).add_(0.5) / (W - 1)
    valid_y = torch.arange(H, dtype=torch.float32, device=sampled_depth.device).add_(0.5) / (H - 1)
    valid_y, valid_x = torch.meshgrid(valid_y, valid_x, indexing='ij')
    # B,N,H,W
    valid_x = valid_x[None, None, None, ...].expand(B, N, C, -1, -1)
    valid_y = valid_y[None, None, None, ...].expand(B, N, C, -1, -1)
    ndc_xyz = torch.stack([valid_x, valid_y, valid_z], dim=-1).view(B, N, C, H, W, 3)  # 1, 1, 5, 512, 640, 3
    cam_xyz = ndc_2_cam(ndc_xyz, ref_intrinsic, W, H)  # 1, 1, 5, 512, 640, 3
    return ndc_xyz, cam_xyz
def depth2point(depth_image, intrinsic_matrix, extrinsic_matrix):
    """Back-project a depth image to camera-space and world-space points.

    Args:
        depth_image: depth tensor; the outputs have its shape plus a trailing 3.
        intrinsic_matrix: camera intrinsics passed to ``depth2point_cam``.
        extrinsic_matrix: 4x4 matrix whose inverse maps camera to world.

    Returns:
        Tuple ``(xyz_cam, xyz_world)``.
    """
    _, xyz_cam = depth2point_cam(depth_image[None,None,None,...], intrinsic_matrix[None,...])
    xyz_cam = xyz_cam.reshape(-1,3)
    # Homogeneous row vectors times the transposed inverse extrinsics.
    xyz_world = torch.cat([xyz_cam, torch.ones_like(xyz_cam[...,0:1])], axis=-1) @ torch.inverse(extrinsic_matrix).transpose(0,1)
    xyz_world = xyz_world[...,:3]

    return xyz_cam.reshape(*depth_image.shape, 3), xyz_world.reshape(*depth_image.shape, 3)


@torch.no_grad()
def get_all_px_dir(intrinsics, height, width):
    """
    # Calculate the view direction for all pixels/rays in the image.
    # This is used for intersection calculation between ray and voxel textures.
    # Returns a (3, height, width) tensor of unit directions on the device of ``intrinsics``.
    """
    # Unit depth everywhere; created on the intrinsics' device instead of a
    # hard-coded CUDA device so CPU intrinsics work as well.
    unit_depth = torch.ones(1, 1, 1, height, width, device=intrinsics.device)
    a, ray_dir = depth2point_cam(unit_depth, intrinsics[None])
    a, ray_dir = a.squeeze(), ray_dir.squeeze()
    ray_dir = torch.nn.functional.normalize(ray_dir, dim=-1)
    ray_dir = ray_dir.permute(2, 0, 1)  # 3, H, W
    return ray_dir


# ================================================
# FILE: tools/image_utils.py
# ================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact george.drettakis@inria.fr
#

import torch


def mse(img1, img2):
    """Return the per-sample mean squared error, shaped (batch, 1)."""
    return (((img1 - img2)) ** 2).view(img1.shape[0], -1).mean(1, keepdim=True)


def psnr(img1, img2):
    """Return the per-sample PSNR in dB for images with peak value 1.0, shaped (batch, 1)."""
    # Reuse mse() rather than shadowing it with a local of the same name.
    return 20 * torch.log10(1.0 / torch.sqrt(mse(img1, img2)))


# ================================================
# FILE: tools/loss_utils.py
# ================================================
#
# Copyright (C) 2023, Inria
# GRAPHDECO research group, https://team.inria.fr/graphdeco
# All rights reserved.
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact george.drettakis@inria.fr
#

"""
[1] Feature Preserving Point Set Surfaces based on Non-Linear Kernel Regression
Cengiz Oztireli, Gaël Guennebaud, Markus Gross

[2] Consolidation of Unorganized Point Clouds for Surface Reconstruction
Hui Huang, Dan Li, Hao Zhang, Uri Ascher Daniel Cohen-Or

[3] Differentiable Surface Splatting for Point-based Geometry Processing
Wang Yifan, Felice Serena, Shihao Wu, Cengiz Oeztireli, Olga Sorkine-Hornung

[4] 3D Gaussian Splatting for Real-Time Radiance Field Rendering
Bernhard Kerbl, Georgios Kopanas, Thomas Leimkühler, George Drettakis
"""

from typing import Optional
from math import exp

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


def entropy_loss(opacity):
    """Mean binary entropy of ``opacity`` (with a 1e-6 offset inside the logs)."""
    loss = (- opacity * torch.log(opacity + 1e-6) - \
            (1 - opacity) * torch.log(1 - opacity + 1e-6)).mean()
    return loss


def l1_loss(network_output, gt):
    """Mean absolute error."""
    return torch.abs((network_output - gt)).mean()


def log_l1_loss(network_output, gt):
    """Mean of ``log(1 + |network_output - gt|)``."""
    loss = torch.log(1 + torch.abs((network_output - gt))).mean()
    return loss


def l2_loss(network_output, gt):
    """Mean squared error."""
    return ((network_output - gt) ** 2).mean()


def gaussian(window_size, sigma):
    """Return a normalized 1-D Gaussian kernel of length ``window_size``."""
    gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)])
    return gauss / gauss.sum()


def create_window(window_size, channel):
    """Return a (channel, 1, window_size, window_size) Gaussian window with sigma 1.5."""
    _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
    _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
    window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
    return window


def ssim(img1, img2, window_size=11, size_average=True):
    """Structural similarity between two images whose channel axis is ``-3``."""
    channel = img1.size(-3)
    window = create_window(window_size, channel)

    if img1.is_cuda:
        window = window.cuda(img1.get_device())
    window = window.type_as(img1)

    return _ssim(img1, img2, window, window_size, channel, size_average)


def _ssim(img1, img2, window, window_size, channel, size_average=True):
    """Compute SSIM with a prepared window; returns a scalar when ``size_average`` is set."""
    mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
    mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
    sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
    sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2

    C1 = 0.01 ** 2
    C2 = 0.03 ** 2

    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

    if size_average:
        return ssim_map.mean()
    else:
        return ssim_map.mean(1).mean(1).mean(1)


def eikonal_loss(gradients):
    """Mean squared deviation of the gradient norm from 1; non-finite entries count as 0."""
    gradient_error = (gradients.norm(dim=-1) - 1.0) ** 2  # [B,R,N]
    gradient_error = gradient_error.nan_to_num(nan=0.0, posinf=0.0, neginf=0.0)  # [B,R,N]
    return gradient_error.mean()


def curvature_loss(hessian):
    """Mean absolute Laplacian (sum over the last axis); non-finite entries count as 0."""
    laplacian = hessian.sum(dim=-1).abs()  # [B,R,N]
    laplacian = laplacian.nan_to_num(nan=0.0, posinf=0.0, neginf=0.0)  # [B,R,N]
    return laplacian.mean()


def compute_normal_loss(normal_pred, normal_gt, weight=None):
    """Mean of ``1 - |sum(normal_pred * normal_gt * weight)|`` over flattened normals."""
    if weight is not None:
        weight = weight.view(-1, 1)
    else:
        weight = 1.0
    normal_pred = normal_pred.view(-1, 3)
    normal_gt = normal_gt.view(-1, 3)
    # The weight is applied inside the dot product, before the absolute value.
    cos = (1.0 - torch.sum(normal_pred * normal_gt * weight, dim=-1).abs()).mean()
    return cos


def monosdf_normal_loss(normal_pred: torch.Tensor, normal_gt: torch.Tensor, weight: Optional[torch.Tensor] = None):
    """normal consistency loss as monosdf

    Args:
        normal_pred (torch.Tensor): volume rendered normal
        normal_gt (torch.Tensor): monocular normal
        weight (Optional[torch.Tensor]): per-element weight; defaults to 1.

    Returns:
        Sum of the weighted L1 term and the weighted ``1 - cos`` term.
    """
    if weight is None:
        weight = 1.0
    l1 = (weight * torch.abs(normal_pred - normal_gt).sum(dim=-1)).mean()
    cos = (weight * (1.0 - torch.sum(normal_pred * normal_gt, dim=-1))).mean()
    return l1 + cos


def cos_weight(render_normal, gt_normal, exp_t=1.0):
    """Detached weight ``exp((cos - 1) / exp_t)``; all ones when ``exp_t <= 0``."""
    cos = torch.sum(render_normal * gt_normal, dim=-1)
    if exp_t > 0:
        cos = torch.exp((cos - 1) / exp_t)
    else:
        cos = torch.ones_like(cos)
    return cos.detach()


# copy from MiDaS
def compute_scale_and_shift(prediction, target, mask):
    """Least-squares scale and shift aligning ``prediction`` to ``target`` per image.

    All inputs are summed over axes (1, 2). Images whose system is singular
    keep scale 0 and shift 0.
    """
    # system matrix: A = [[a_00, a_01], [a_10, a_11]]
    a_00 = torch.sum(mask * prediction * prediction, (1, 2))
    a_01 = torch.sum(mask * prediction, (1, 2))
    a_11 = torch.sum(mask, (1, 2))

    # right hand side: b = [b_0, b_1]
    b_0 = torch.sum(mask * prediction * target, (1, 2))
    b_1 = torch.sum(mask * target, (1, 2))

    # solution: x = A^-1 . b = [[a_11, -a_01], [-a_10, a_00]] / (a_00 * a_11 - a_01 * a_10) . b
    x_0 = torch.zeros_like(b_0)
    x_1 = torch.zeros_like(b_1)

    det = a_00 * a_11 - a_01 * a_01
    valid = det.nonzero()

    x_0[valid] = (a_11[valid] * b_0[valid] - a_01[valid] * b_1[valid]) / det[valid]
    x_1[valid] = (-a_01[valid] * b_0[valid] + a_00[valid] * b_1[valid]) / det[valid]

    return x_0, x_1


def reduction_batch_based(image_loss, M):
    """Average over all valid pixels of the batch; returns 0 when there are none."""
    # average of all valid pixels of the batch

    # avoid division by 0 (if sum(M) = sum(sum(mask)) = 0: sum(image_loss) = 0)
    divisor = torch.sum(M)

    if divisor == 0:
        return 0
    else:
        return torch.sum(image_loss) / divisor


def reduction_image_based(image_loss, M):
    """Mean over images of the per-image average; ``image_loss`` is modified in place."""
    # mean of average of valid pixels of an image

    # avoid division by 0 (if M = sum(mask) = 0: image_loss = 0)
    valid = M.nonzero()

    image_loss[valid] = image_loss[valid] / M[valid]

    return torch.mean(image_loss)


def mse_loss(prediction, target, mask, reduction=reduction_batch_based):
    """Masked squared error reduced with ``reduction`` using ``2 * M`` as divisor."""
    M = torch.sum(mask, (1, 2))
    res = prediction - target
    image_loss = torch.sum(mask * res * res, (1, 2))

    return reduction(image_loss, 2 * M)


def gradient_loss(prediction, target, mask, reduction=reduction_batch_based):
    """Masked L1 loss on horizontal and vertical differences of the residual."""
    M = torch.sum(mask, (1, 2))

    diff = prediction - target
    diff = torch.mul(mask, diff)

    grad_x = torch.abs(diff[:, :, 1:] - diff[:, :, :-1])
    mask_x = torch.mul(mask[:, :, 1:], mask[:, :, :-1])
    grad_x = torch.mul(mask_x, grad_x)

    grad_y = torch.abs(diff[:, 1:, :] - diff[:, :-1, :])
    mask_y = torch.mul(mask[:, 1:, :], mask[:, :-1, :])
    grad_y = torch.mul(mask_y, grad_y)

    image_loss = torch.sum(grad_x, (1, 2)) + torch.sum(grad_y, (1, 2))

    return reduction(image_loss, M)


class MSELoss(nn.Module):
    """Masked MSE with batch-based or image-based reduction."""

    def __init__(self, reduction='batch-based'):
        super().__init__()

        if reduction == 'batch-based':
            self.__reduction = reduction_batch_based
        else:
            self.__reduction = reduction_image_based

    def forward(self, prediction, target, mask):
        return mse_loss(prediction, target, mask, reduction=self.__reduction)


class GradientLoss(nn.Module):
    """Multi-scale masked gradient loss; scale ``k`` subsamples with stride ``2 ** k``."""

    def __init__(self, scales=4, reduction='batch-based'):
        super().__init__()

        if reduction == 'batch-based':
            self.__reduction = reduction_batch_based
        else:
            self.__reduction = reduction_image_based

        self.__scales = scales

    def forward(self, prediction, target, mask):
        total = 0

        for scale in range(self.__scales):
            step = pow(2, scale)

            total += gradient_loss(prediction[:, ::step, ::step], target[:, ::step, ::step],
                                   mask[:, ::step, ::step], reduction=self.__reduction)

        return total


class ScaleAndShiftInvariantLoss(nn.Module):
    """Scale-and-shift invariant loss: aligned MSE plus ``alpha`` times the gradient term."""

    def __init__(self, alpha=0.5, scales=1, reduction='batch-based'):
        super().__init__()

        self.__data_loss = MSELoss(reduction=reduction)
        self.__regularization_loss = GradientLoss(scales=scales, reduction=reduction)
        self.__alpha = alpha

        self.__prediction_ssi = None

    def forward(self, prediction, target, mask=None):
        # The target is remapped as ``target * 50 + 0.5`` before alignment.
        target = target * 50 + 0.5
        if mask is None:
            mask = torch.ones_like(target)

        scale, shift = compute_scale_and_shift(prediction, target, mask)
        self.__prediction_ssi = scale.view(-1, 1, 1) * prediction + shift.view(-1, 1, 1)

        total = self.__data_loss(self.__prediction_ssi, target, mask)
        if self.__alpha > 0:
            total += self.__alpha * self.__regularization_loss(self.__prediction_ssi, target, mask)

        return total

    def __get_prediction_ssi(self):
        return self.__prediction_ssi

    # Aligned prediction of the most recent forward() call.
    prediction_ssi = property(__get_prediction_ssi)
# end copy


def normal2curv(normal, mask = None):
    """L1 norm of the 4-neighbour Laplacian of a normal map.

    Args:
        normal: normal map with the channel on the last axis; the two axes
            before it are padded by one pixel (replicate) on each side.
        mask: validity mask broadcastable against ``normal`` with a trailing
            axis, e.g. (H, W, 1). ``None`` treats every pixel as valid.

    Returns:
        Curvature map with a trailing axis of size 1.
    """
    if mask is None:
        # The signature advertises an optional mask; treat a missing one as all-valid.
        mask = torch.ones_like(normal[..., :1], dtype=torch.bool)
    n = normal
    m = mask
    n = torch.nn.functional.pad(n[None], [0, 0, 1, 1, 1, 1], mode='replicate')
    # Replicate padding needs a float tensor, so round-trip the mask through float32.
    m = torch.nn.functional.pad(m[None].to(torch.float32), [0, 0, 1, 1, 1, 1], mode='replicate').to(torch.bool)
    n_c = (n[:, 1:-1, 1:-1, :]      ) * m[:, 1:-1, 1:-1, :]
    n_u = (n[:,  :-2, 1:-1, :] - n_c) * m[:,  :-2, 1:-1, :]
    n_l = (n[:, 1:-1,  :-2, :] - n_c) * m[:, 1:-1,  :-2, :]
    n_b = (n[:, 2:  , 1:-1, :] - n_c) * m[:, 2:  , 1:-1, :]
    n_r = (n[:, 1:-1, 2:  , :] - n_c) * m[:, 1:-1, 2:  , :]
    curv = (n_u + n_l + n_b + n_r)[0]
    curv = curv * mask
    curv = curv.norm(1, -1, True)
    return curv


def L1_loss_appearance(image, gt_image, gaussians, view_idx, return_transformed_image=False):
    """L1 loss after a per-view appearance mapping predicted from a 32x-downsampled crop.

    Returns the loss, or the mapped image resized to the original size when
    ``return_transformed_image`` is set.
    """
    appearance_embedding = gaussians.get_apperance_embedding(view_idx)
    # center crop the image
    origH, origW = image.shape[1:]
    H = origH // 32 * 32
    W = origW // 32 * 32
    left = origW // 2 - W // 2
    top = origH // 2 - H // 2
    crop_image = image[:, top:top+H, left:left+W]
    crop_gt_image = gt_image[:, top:top+H, left:left+W]

    # down sample the image
    crop_image_down = torch.nn.functional.interpolate(crop_image[None], size=(H//32, W//32), mode="bilinear", align_corners=True)[0]

    # Append the embedding as extra channels at every low-resolution pixel.
    crop_image_down = torch.cat([crop_image_down, appearance_embedding[None].repeat(H//32, W//32, 1).permute(2, 0, 1)], dim=0)[None]
    mapping_image = gaussians.appearance_network(crop_image_down)
    transformed_image = mapping_image * crop_image
    if not return_transformed_image:
        return l1_loss(transformed_image, crop_gt_image)
    else:
        transformed_image = torch.nn.functional.interpolate(transformed_image, size=(origH, origW), mode="bilinear", align_corners=True)[0]
        return transformed_image


# ================================================
# FILE: tools/math_utils.py
# ================================================
import torch


def eps_sqrt(squared, eps=1e-17):
    """
    Prepare for the input for sqrt, make sure the input positive and larger than eps
    """
    return torch.clamp(squared.abs(), eps)


def ndc_to_pix(p, resolution):
    """
    Reverse of pytorch3d pix_to_ndc function
    Args:
        p (float tensor): (..., 3)
        resolution (scalar): image resolution (for now, supports only aspectratio = 1)
    Returns:
        pix (float tensor): (..., 2); the result is not rounded or cast to long
    """
    pix = resolution - ((p[..., :2] + 1.0) * resolution - 1.0) / 2
    return pix
def decompose_to_R_and_t(transform_mat, row_major=True): """ decompose a 4x4 transform matrix to R (3,3) and t (1,3)""" assert(transform_mat.shape[-2:] == (4, 4)), \ "Expecting batches of 4x4 matrice" # ... 3x3 if not row_major: transform_mat = transform_mat.transpose(-2, -1) R = transform_mat[..., :3, :3] t = transform_mat[..., -1, :3] return R, t def to_homogen(x, dim=-1): """ append one to the specified dimension """ if dim < 0: dim = x.ndim + dim shp = x.shape new_shp = shp[:dim] + (1, ) + shp[dim + 1:] x_homogen = x.new_ones(new_shp) x_homogen = torch.cat([x, x_homogen], dim=dim) return x_homogen def normalize_pts(pts, trans, scale): ''' trans: (4, 4), world to ''' if trans.ndim == 1: pts = (pts - trans) / scale else: pts = ((trans[:3, :3] @ pts.T + trans[:3, 3:]).T) / scale return pts def inv_normalize_pts(pts, trans, scale): if trans.ndim == 1: pts = pts * scale + trans else: pts = (pts * scale[None] - trans[:3, 3:].T) @ trans[:3, :3] return pts def get_inside_normalized(xyz, trans, scale): pts = normalize_pts(xyz, trans, scale) with torch.no_grad(): inside = torch.all(torch.abs(pts) < 1, dim=-1) return inside, pts ================================================ FILE: tools/mcube_utils.py ================================================ # # Copyright (C) 2024, ShanghaiTech # SVIP research group, https://github.com/svip-lab # All rights reserved. # # This software is free for non-commercial, research and evaluation use # under the terms of the LICENSE.md file. 
# # For inquiries contact huangbb@shanghaitech.edu.cn # import numpy as np import torch import trimesh from skimage import measure # modified from here https://github.com/autonomousvision/sdfstudio/blob/370902a10dbef08cb3fe4391bd3ed1e227b5c165/nerfstudio/utils/marching_cubes.py#L201 def marching_cubes_with_contraction( sdf, resolution=512, bounding_box_min=(-1.0, -1.0, -1.0), bounding_box_max=(1.0, 1.0, 1.0), return_mesh=False, level=0, simplify_mesh=True, inv_contraction=None, max_range=32.0, ): assert resolution % 512 == 0 resN = resolution cropN = 512 level = 0 N = resN // cropN grid_min = bounding_box_min grid_max = bounding_box_max xs = np.linspace(grid_min[0], grid_max[0], N + 1) ys = np.linspace(grid_min[1], grid_max[1], N + 1) zs = np.linspace(grid_min[2], grid_max[2], N + 1) meshes = [] for i in range(N): for j in range(N): for k in range(N): print(i, j, k) x_min, x_max = xs[i], xs[i + 1] y_min, y_max = ys[j], ys[j + 1] z_min, z_max = zs[k], zs[k + 1] x = np.linspace(x_min, x_max, cropN) y = np.linspace(y_min, y_max, cropN) z = np.linspace(z_min, z_max, cropN) xx, yy, zz = np.meshgrid(x, y, z, indexing="ij") points = torch.tensor(np.vstack([xx.ravel(), yy.ravel(), zz.ravel()]).T, dtype=torch.float).cuda() @torch.no_grad() def evaluate(points): z = [] for _, pnts in enumerate(torch.split(points, 256**3, dim=0)): z.append(sdf(pnts)) z = torch.cat(z, axis=0) return z # construct point pyramids points = points.reshape(cropN, cropN, cropN, 3) points = points.reshape(-1, 3) pts_sdf = evaluate(points.contiguous()) z = pts_sdf.detach().cpu().numpy() if not (np.min(z) > level or np.max(z) < level): z = z.astype(np.float32) verts, faces, normals, _ = measure.marching_cubes( volume=z.reshape(cropN, cropN, cropN), level=level, spacing=( (x_max - x_min) / (cropN - 1), (y_max - y_min) / (cropN - 1), (z_max - z_min) / (cropN - 1), ), ) verts = verts + np.array([x_min, y_min, z_min]) meshcrop = trimesh.Trimesh(verts, faces, normals) meshes.append(meshcrop) print("finished 
one block") combined = trimesh.util.concatenate(meshes) combined.merge_vertices(digits_vertex=6) # inverse contraction and clipping the points range if inv_contraction is not None: combined.vertices = inv_contraction(torch.from_numpy(combined.vertices).float().cuda()).cpu().numpy() combined.vertices = np.clip(combined.vertices, -max_range, max_range) return combined ================================================ FILE: tools/mesh_utils.py ================================================ import torch import numpy as np import os import math from tqdm import tqdm from functools import partial import open3d as o3d from tools.render_utils import save_img_f32, save_img_u8 from tools.semantic_id import BACKGROUND from tools.graphics_utils import depth2point from tools.math_utils import get_inside_normalized def post_process_mesh(mesh, cluster_to_keep=1000): """ Post-process a mesh to filter out floaters and disconnected parts """ import copy print("post processing the mesh to have {} clusterscluster_to_kep".format(cluster_to_keep)) mesh_0 = copy.deepcopy(mesh) with o3d.utility.VerbosityContextManager(o3d.utility.VerbosityLevel.Debug) as cm: triangle_clusters, cluster_n_triangles, cluster_area = (mesh_0.cluster_connected_triangles()) triangle_clusters = np.asarray(triangle_clusters) cluster_n_triangles = np.asarray(cluster_n_triangles) cluster_area = np.asarray(cluster_area) n_cluster = np.sort(cluster_n_triangles.copy())[-cluster_to_keep] n_cluster = max(n_cluster, 50) # filter meshes smaller than 50 triangles_to_remove = cluster_n_triangles[triangle_clusters] < n_cluster mesh_0.remove_triangles_by_mask(triangles_to_remove) mesh_0.remove_unreferenced_vertices() mesh_0.remove_degenerate_triangles() print("num vertices raw {}".format(len(mesh.vertices))) print("num vertices post {}".format(len(mesh_0.vertices))) return mesh_0 def to_cam_open3d(viewpoint_stack): camera_traj = [] for i, viewpoint_cam in enumerate(viewpoint_stack): 
intrinsic=o3d.camera.PinholeCameraIntrinsic(width=viewpoint_cam.image_width, height=viewpoint_cam.image_height, cx = viewpoint_cam.image_width/2, cy = viewpoint_cam.image_height/2, fx = viewpoint_cam.image_width / (2 * math.tan(viewpoint_cam.FoVx / 2.)), fy = viewpoint_cam.image_height / (2 * math.tan(viewpoint_cam.FoVy / 2.))) extrinsic=np.asarray((viewpoint_cam.world_view_transform.T).cpu().numpy()) camera = o3d.camera.PinholeCameraParameters() camera.extrinsic = extrinsic camera.intrinsic = intrinsic camera_traj.append(camera) return camera_traj class GaussianExtractor(object): def __init__(self, gaussians, render, cfg, bg_color=None, dirs=None, prob_thres=0.2, alpha_thres=0.5): """ a class that extracts attributes a scene presented by 2DGS Usage example: >>> gaussExtrator = GaussianExtractor(gaussians, render, pipe) >>> gaussExtrator.reconstruction(view_points) >>> mesh = gaussExtractor.export_mesh_bounded(...) """ if bg_color is None: bg_color = [0, 0, 0] if isinstance(bg_color, torch.Tensor): background = bg_color.clone().detach() else: background = torch.tensor(bg_color, dtype=torch.float32, device="cuda") self.gaussians = gaussians self.render = partial(render, cfg=cfg, bg_color=background, dirs=dirs) self.prob_thres = prob_thres self.alpha_thres = alpha_thres self.clean() @torch.no_grad() def clean(self): self.depthmaps = [] self.alphamaps = [] self.rgbmaps = [] self.normals = [] self.depth_normals = [] self.viewpoint_stack = [] @torch.no_grad() def reconstruction(self, viewpoint_stack): """ reconstruct radiance field given cameras """ self.clean() self.viewpoint_stack = viewpoint_stack for i, viewpoint_cam in tqdm(enumerate(self.viewpoint_stack), desc="reconstruct radiance fields", total=len(self.viewpoint_stack)): render_pkg = self.render(viewpoint_cam, self.gaussians) rgb = render_pkg['render'] alpha = render_pkg['alpha'] normal = torch.nn.functional.normalize(render_pkg['normal'], dim=0) normal = render_pkg['normal'].permute(1, 2, 0) depth = 
render_pkg['depth'] if 'render_sem' in render_pkg: semantic = render_pkg["render_sem"] prob = self.gaussians.logits2prob(semantic) mask = (prob[..., BACKGROUND] > self.prob_thres)[None] depth[mask] = 0 rendered_pcd_world = depth2point(depth[0], viewpoint_cam.intr, viewpoint_cam.world_view_transform.transpose(0, 1))[1] inside = get_inside_normalized(rendered_pcd_world.view(-1, 3), self.gaussians.trans, self.gaussians.scale)[0] depth.view(-1)[~inside] = 0 depth_normal = render_pkg['est_normal'].permute(1, 2, 0) self.rgbmaps.append(rgb.cpu()) self.depthmaps.append(depth.cpu()) self.alphamaps.append(alpha.cpu()) self.normals.append(normal.cpu()) self.depth_normals.append(depth_normal.cpu()) self.rgbmaps = torch.stack(self.rgbmaps, dim=0) self.depthmaps = torch.stack(self.depthmaps, dim=0) self.alphamaps = torch.stack(self.alphamaps, dim=0) self.depth_normals = torch.stack(self.depth_normals, dim=0) @torch.no_grad() def extract_mesh_bounded(self, voxel_size=0.004, sdf_trunc=0.02, depth_trunc=3, mask_backgrond=True): """ Perform TSDF fusion given a fixed depth range, used in the paper. 
voxel_size: the voxel size of the volume sdf_trunc: truncation value depth_trunc: maximum depth range, should depended on the scene's scales mask_backgrond: whether to mask backgroud, only works when the dataset have masks return o3d.mesh """ print("Running tsdf volume integration ...") print(f'voxel_size: {voxel_size}') print(f'sdf_trunc: {sdf_trunc}') print(f'depth_truc: {depth_trunc}') volume = o3d.pipelines.integration.ScalableTSDFVolume( voxel_length= voxel_size, sdf_trunc=sdf_trunc, color_type=o3d.pipelines.integration.TSDFVolumeColorType.RGB8 ) for i, cam_o3d in tqdm(enumerate(to_cam_open3d(self.viewpoint_stack)), desc="TSDF integration progress", total=len(self.viewpoint_stack)): rgb = self.rgbmaps[i] depth = self.depthmaps[i] # if we have mask provided, use it if mask_backgrond and (self.viewpoint_stack[i].gt_alpha_mask is not None): depth[(self.viewpoint_stack[i].gt_alpha_mask < 0.5)] = 0 # make open3d rgbd rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth( o3d.geometry.Image(np.asarray(rgb.permute(1,2,0).cpu().numpy() * 255, order="C", dtype=np.uint8)), o3d.geometry.Image(np.asarray(depth.permute(1,2,0).cpu().numpy(), order="C")), depth_trunc = depth_trunc, convert_rgb_to_intensity=False, depth_scale = 1.0 ) volume.integrate(rgbd, intrinsic=cam_o3d.intrinsic, extrinsic=cam_o3d.extrinsic) mesh = volume.extract_triangle_mesh() return mesh @torch.no_grad() def extract_mesh_unbounded(self, resolution=1024): """ Experimental features, extracting meshes from unbounded scenes, not fully test across datasets. 
#TODO: support color mesh exporting sdf_trunc: truncation value return o3d.mesh """ def contract(x): mag = torch.linalg.norm(x, ord=2, dim=-1)[..., None] return torch.where(mag < 1, x, (2 - (1 / mag)) * (x / mag)) def uncontract(y): mag = torch.linalg.norm(y, ord=2, dim=-1)[..., None] return torch.where(mag < 1, y, (1 / (2-mag) * (y/mag))) def compute_sdf_perframe(i, points, depthmap, rgbmap, normalmap, viewpoint_cam): """ compute per frame sdf """ new_points = torch.cat([points, torch.ones_like(points[...,:1])], dim=-1) @ viewpoint_cam.full_proj_transform z = new_points[..., -1:] pix_coords = (new_points[..., :2] / new_points[..., -1:]) mask_proj = ((pix_coords > -1. ) & (pix_coords < 1.) & (z > 0)).all(dim=-1) sampled_depth = torch.nn.functional.grid_sample(depthmap.cuda()[None], pix_coords[None, None], mode='bilinear', padding_mode='border', align_corners=True).reshape(-1, 1) sampled_rgb = torch.nn.functional.grid_sample(rgbmap.cuda()[None], pix_coords[None, None], mode='bilinear', padding_mode='border', align_corners=True).reshape(3,-1).T sampled_normal = torch.nn.functional.grid_sample(normalmap.cuda()[None], pix_coords[None, None], mode='bilinear', padding_mode='border', align_corners=True).reshape(3,-1).T sdf = (sampled_depth-z) return sdf, sampled_rgb, sampled_normal, mask_proj def compute_unbounded_tsdf(samples, inv_contraction, voxel_size, return_rgb=False): """ Fusion all frames, perform adaptive sdf_funcation on the contract spaces. 
""" if inv_contraction is not None: samples = inv_contraction(samples) mask = torch.linalg.norm(samples, dim=-1) > 1 # adaptive sdf_truncation sdf_trunc = 5 * voxel_size * torch.ones_like(samples[:, 0]) sdf_trunc[mask] *= 1/(2-torch.linalg.norm(samples, dim=-1)[mask].clamp(max=1.9)) else: sdf_trunc = 5 * voxel_size tsdfs = torch.ones_like(samples[:,0]) * 1 rgbs = torch.zeros((samples.shape[0], 3)).cuda() weights = torch.ones_like(samples[:,0]) for i, viewpoint_cam in tqdm(enumerate(self.viewpoint_stack), desc="TSDF integration progress"): sdf, rgb, normal, mask_proj = compute_sdf_perframe(i, samples, depthmap = self.depthmaps[i], rgbmap = self.rgbmaps[i], normalmap = self.depth_normals[i], viewpoint_cam=self.viewpoint_stack[i], ) # volume integration sdf = sdf.flatten() mask_proj = mask_proj & (sdf > -sdf_trunc) sdf = torch.clamp(sdf / sdf_trunc, min=-1.0, max=1.0)[mask_proj] w = weights[mask_proj] wp = w + 1 tsdfs[mask_proj] = (tsdfs[mask_proj] * w + sdf) / wp rgbs[mask_proj] = (rgbs[mask_proj] * w[:,None] + rgb[mask_proj]) / wp[:,None] # update weight weights[mask_proj] = wp if return_rgb: return tsdfs, rgbs return tsdfs from tools.render_utils import focus_point_fn torch.cuda.empty_cache() c2ws = np.array([np.linalg.inv(np.asarray((cam.world_view_transform.T).cpu().numpy())) for cam in self.viewpoint_stack]) poses = c2ws[:,:3,:] @ np.diag([1, -1, -1, 1]) center = (focus_point_fn(poses)) radius = np.linalg.norm(c2ws[:,:3,3] - center, axis=-1).min() center = torch.from_numpy(center).float().cuda() normalize = lambda x: (x - center) / radius unnormalize = lambda x: (x * radius) + center inv_contraction = lambda x: unnormalize(uncontract(x)) N = resolution voxel_size = (radius * 2 / N) print(f"Computing sdf gird resolution {N} x {N} x {N}") print(f"Define the voxel_size as {voxel_size}") sdf_function = lambda x: compute_unbounded_tsdf(x, inv_contraction, voxel_size) from tools.mcube_utils import marching_cubes_with_contraction R = 
contract(normalize(self.gaussians.get_xyz)).norm(dim=-1).cpu().numpy() R = np.quantile(R, q=0.95) R = min(R+0.01, 1.9) mesh = marching_cubes_with_contraction( sdf=sdf_function, bounding_box_min=(-R, -R, -R), bounding_box_max=(R, R, R), level=0, resolution=N, inv_contraction=inv_contraction, ) # coloring the mesh torch.cuda.empty_cache() mesh = mesh.as_open3d print("texturing mesh ... ") _, rgbs = compute_unbounded_tsdf(torch.tensor(np.asarray(mesh.vertices)).float().cuda(), inv_contraction=None, voxel_size=voxel_size, return_rgb=True) mesh.vertex_colors = o3d.utility.Vector3dVector(rgbs.cpu().numpy()) return mesh @torch.no_grad() def export_image(self, path): render_path = os.path.join(path, "renders") gts_path = os.path.join(path, "gt") vis_path = os.path.join(path, "vis") os.makedirs(render_path, exist_ok=True) os.makedirs(vis_path, exist_ok=True) os.makedirs(gts_path, exist_ok=True) for idx, viewpoint_cam in tqdm(enumerate(self.viewpoint_stack), desc="export images"): gt = viewpoint_cam.original_image[0:3, :, :] save_img_u8(gt.permute(1,2,0).cpu().numpy(), os.path.join(gts_path, '{0:05d}'.format(idx) + ".png")) save_img_u8(self.rgbmaps[idx].permute(1,2,0).cpu().numpy(), os.path.join(render_path, '{0:05d}'.format(idx) + ".png")) save_img_f32(self.depthmaps[idx][0].cpu().numpy(), os.path.join(vis_path, 'depth_{0:05d}'.format(idx) + ".tiff")) save_img_u8(self.normals[idx].permute(1,2,0).cpu().numpy() * 0.5 + 0.5, os.path.join(vis_path, 'normal_{0:05d}'.format(idx) + ".png")) save_img_u8(self.depth_normals[idx].permute(1,2,0).cpu().numpy() * 0.5 + 0.5, os.path.join(vis_path, 'depth_normal_{0:05d}'.format(idx) + ".png")) ================================================ FILE: tools/normal_utils.py ================================================ import torch import torch.nn.functional as F from tools.graphics_utils import depth2point_cam def get_normal_sign(normals, begin=None, end=None, trans=None, mode='origin', vec=None): if mode == 'origin': if vec is None: if 
begin is None: # center if trans is not None: begin = - trans[:3, :3].T @ trans[:3, 3] \ if trans.ndim != 1 else trans else: begin = end.mean(0) begin[1] += 1 vec = end - begin cos = (normals * vec).sum(-1, keepdim=True) return cos def compute_gradient(img): dy = torch.gradient(img, dim=0)[0] dx = torch.gradient(img, dim=1)[0] return dx, dy def compute_normals(depth_map, K): # Assuming depth_map is a PyTorch tensor of shape [H, W] # K_inv is the inverse of the intrinsic matrix _, cam_coords = depth2point_cam(depth_map[None, None], K[None]) cam_coords = cam_coords.squeeze(0).squeeze(0).squeeze(0) # [H, W, 3] dx, dy = compute_gradient(cam_coords) # Cross product of gradients gives normal normals = torch.cross(dx, dy, dim=-1) normals = F.normalize(normals, p=2, dim=-1) return normals def compute_edge(image, k=11, thr=0.01): dx, dy = compute_gradient(image) edge = torch.sqrt(dx**2 + dy**2) edge = edge / edge.max() p = (k - 1) // 2 edge = F.max_pool2d(edge[None], kernel_size=k, stride=1, padding=p)[0] edge[edge>thr] = 1 return edge def get_edge_aware_distortion_map(gt_image, distortion_map): grad_img_left = torch.mean(torch.abs(gt_image[:, 1:-1, 1:-1] - gt_image[:, 1:-1, :-2]), 0) grad_img_right = torch.mean(torch.abs(gt_image[:, 1:-1, 1:-1] - gt_image[:, 1:-1, 2:]), 0) grad_img_top = torch.mean(torch.abs(gt_image[:, 1:-1, 1:-1] - gt_image[:, :-2, 1:-1]), 0) grad_img_bottom = torch.mean(torch.abs(gt_image[:, 1:-1, 1:-1] - gt_image[:, 2:, 1:-1]), 0) max_grad = torch.max(torch.stack([grad_img_left, grad_img_right, grad_img_top, grad_img_bottom], dim=-1), dim=-1)[0] # pad max_grad = torch.exp(-max_grad) max_grad = torch.nn.functional.pad(max_grad, (1, 1, 1, 1), mode="constant", value=0) return distortion_map * max_grad ================================================ FILE: tools/prune.py ================================================ import torch from gaussian_renderer import count_render, visi_acc_render def calculate_v_imp_score(gaussians, imp_list, v_pow): """ :param 
gaussians: A data structure containing Gaussian components with a get_scaling method. :param imp_list: The importance scores for each Gaussian component. :param v_pow: The power to which the volume ratios are raised. :return: A list of adjusted values (v_list) used for pruning. """ # Calculate the volume of each Gaussian component volume = torch.prod(gaussians.get_scaling, dim=1) # Determine the kth_percent_largest value index = int(len(volume) * 0.9) sorted_volume, _ = torch.sort(volume, descending=True) kth_percent_largest = sorted_volume[index] # Calculate v_list v_list = torch.pow(volume / kth_percent_largest, v_pow) v_list = v_list * imp_list return v_list def prune_list(gaussians, viewpoint_stack, pipe, background): gaussian_list, imp_list = None, None viewpoint_cam = viewpoint_stack.pop() render_pkg = count_render(viewpoint_cam, gaussians, pipe, background) gaussian_list, imp_list = ( render_pkg["gaussians_count"], render_pkg["important_score"], ) for iteration in range(len(viewpoint_stack)): # Pick a random Camera # prunning viewpoint_cam = viewpoint_stack.pop() render_pkg = count_render(viewpoint_cam, gaussians, pipe, background) gaussians_count, important_score = ( render_pkg["gaussians_count"].detach(), render_pkg["important_score"].detach(), ) gaussian_list += gaussians_count imp_list += important_score return gaussian_list, imp_list v_render = visi_acc_render def get_visi_list(gaussians, viewpoint_stack, pipe, background): out = {} gaussian_list = None viewpoint_cam = viewpoint_stack.pop() render_pkg = v_render(viewpoint_cam, gaussians, pipe, background) gaussian_list = render_pkg["countlist"] for i in range(len(viewpoint_stack)): # Pick a random Camera # prunning viewpoint_cam = viewpoint_stack.pop() render_pkg = v_render(viewpoint_cam, gaussians, pipe, background) gaussians_count = render_pkg["countlist"].detach() gaussian_list += gaussians_count visi = gaussian_list > 0 out["visi"] = visi return out ================================================ 
FILE: tools/render_utils.py ================================================ # Copyright 2022 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import os from typing import Tuple import copy from PIL import Image import mediapy as media from matplotlib import cm from tqdm import tqdm import torch def normalize(x: np.ndarray) -> np.ndarray: """Normalization helper function.""" return x / np.linalg.norm(x) def pad_poses(p: np.ndarray) -> np.ndarray: """Pad [..., 3, 4] pose matrices with a homogeneous bottom row [0,0,0,1].""" bottom = np.broadcast_to([0, 0, 0, 1.], p[..., :1, :4].shape) return np.concatenate([p[..., :3, :4], bottom], axis=-2) def unpad_poses(p: np.ndarray) -> np.ndarray: """Remove the homogeneous bottom row from [..., 4, 4] pose matrices.""" return p[..., :3, :4] def recenter_poses(poses: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Recenter poses around the origin.""" cam2world = average_pose(poses) transform = np.linalg.inv(pad_poses(cam2world)) poses = transform @ pad_poses(poses) return unpad_poses(poses), transform def average_pose(poses: np.ndarray) -> np.ndarray: """New pose using average position, z-axis, and up vector of input poses.""" position = poses[:, :3, 3].mean(0) z_axis = poses[:, :3, 2].mean(0) up = poses[:, :3, 1].mean(0) cam2world = viewmatrix(z_axis, up, position) return cam2world def viewmatrix(lookdir: np.ndarray, up: np.ndarray, position: np.ndarray) -> np.ndarray: """Construct lookat view matrix.""" vec2 = 
normalize(lookdir) vec0 = normalize(np.cross(up, vec2)) vec1 = normalize(np.cross(vec2, vec0)) m = np.stack([vec0, vec1, vec2, position], axis=1) return m def focus_point_fn(poses: np.ndarray) -> np.ndarray: """Calculate nearest point to all focal axes in poses.""" directions, origins = poses[:, :3, 2:3], poses[:, :3, 3:4] m = np.eye(3) - directions * np.transpose(directions, [0, 2, 1]) mt_m = np.transpose(m, [0, 2, 1]) @ m focus_pt = np.linalg.inv(mt_m.mean(0)) @ (mt_m @ origins).mean(0)[:, 0] return focus_pt def transform_poses_pca(poses: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Transforms poses so principal components lie on XYZ axes. Args: poses: a (N, 3, 4) array containing the cameras' camera to world transforms. Returns: A tuple (poses, transform), with the transformed poses and the applied camera_to_world transforms. """ t = poses[:, :3, 3] t_mean = t.mean(axis=0) t = t - t_mean eigval, eigvec = np.linalg.eig(t.T @ t) # Sort eigenvectors in order of largest to smallest eigenvalue. inds = np.argsort(eigval)[::-1] eigvec = eigvec[:, inds] rot = eigvec.T if np.linalg.det(rot) < 0: rot = np.diag(np.array([1, 1, -1])) @ rot transform = np.concatenate([rot, rot @ -t_mean[:, None]], -1) poses_recentered = unpad_poses(transform @ pad_poses(poses)) transform = np.concatenate([transform, np.eye(4)[3:]], axis=0) # Flip coordinate system if z component of y-axis is negative if poses_recentered.mean(axis=0)[2, 1] < 0: poses_recentered = np.diag(np.array([1, -1, -1])) @ poses_recentered transform = np.diag(np.array([1, -1, -1, 1])) @ transform return poses_recentered, transform def generate_ellipse_path(poses: np.ndarray, n_frames: int = 120, const_speed: bool = True, z_variation: float = 0., z_phase: float = 0.) -> np.ndarray: """Generate an elliptical render path based on the given poses.""" # Calculate the focal point for the path (cameras point toward this). center = focus_point_fn(poses) # Path height sits at z=0 (in middle of zero-mean capture pattern). 
offset = np.array([center[0], center[1], 0]) # Calculate scaling for ellipse axes based on input camera positions. sc = np.percentile(np.abs(poses[:, :3, 3] - offset), 90, axis=0) # Use ellipse that is symmetric about the focal point in xy. low = -sc + offset high = sc + offset # Optional height variation need not be symmetric z_low = np.percentile((poses[:, :3, 3]), 10, axis=0) z_high = np.percentile((poses[:, :3, 3]), 90, axis=0) def get_positions(theta): # Interpolate between bounds with trig functions to get ellipse in x-y. # Optionally also interpolate in z to change camera height along path. return np.stack([ low[0] + (high - low)[0] * (np.cos(theta) * .5 + .5), low[1] + (high - low)[1] * (np.sin(theta) * .5 + .5), z_variation * (z_low[2] + (z_high - z_low)[2] * (np.cos(theta + 2 * np.pi * z_phase) * .5 + .5)), ], -1) theta = np.linspace(0, 2. * np.pi, n_frames + 1, endpoint=True) positions = get_positions(theta) # Throw away duplicated last position. positions = positions[:-1] # Set path's up vector to axis closest to average of input pose up vectors. 
avg_up = poses[:, :3, 1].mean(0) avg_up = avg_up / np.linalg.norm(avg_up) ind_up = np.argmax(np.abs(avg_up)) up = np.eye(3)[ind_up] * np.sign(avg_up[ind_up]) return np.stack([viewmatrix(p - center, up, p) for p in positions]) def generate_path(viewpoint_cameras, n_frames=480): c2ws = np.array([np.linalg.inv(np.asarray((cam.world_view_transform.T).cpu().numpy())) for cam in viewpoint_cameras]) pose = c2ws[:,:3,:] @ np.diag([1, -1, -1, 1]) pose_recenter, colmap_to_world_transform = transform_poses_pca(pose) # generate new poses new_poses = generate_ellipse_path(poses=pose_recenter, n_frames=n_frames) # warp back to orignal scale new_poses = np.linalg.inv(colmap_to_world_transform) @ pad_poses(new_poses) traj = [] for c2w in new_poses: c2w = c2w @ np.diag([1, -1, -1, 1]) cam = copy.deepcopy(viewpoint_cameras[0]) cam.image_height = int(cam.image_height / 2) * 2 cam.image_width = int(cam.image_width / 2) * 2 cam.world_view_transform = torch.from_numpy(np.linalg.inv(c2w).T).float().cuda() cam.full_proj_transform = (cam.world_view_transform.unsqueeze(0).bmm(cam.projection_matrix.unsqueeze(0))).squeeze(0) cam.camera_center = cam.world_view_transform.inverse()[3, :3] traj.append(cam) return traj def load_img(pth: str) -> np.ndarray: """Load an image and cast to float32.""" with open(pth, 'rb') as f: image = np.array(Image.open(f), dtype=np.float32) return image def create_videos(base_dir, input_dir, out_name, num_frames=480): """Creates videos out of the images saved to disk.""" # Last two parts of checkpoint path are experiment name and scene name. video_prefix = f'{out_name}' zpad = max(5, len(str(num_frames - 1))) idx_to_str = lambda idx: str(idx).zfill(zpad) os.makedirs(base_dir, exist_ok=True) render_dist_curve_fn = np.log # Load one example frame to get image shape and depth range. 
depth_file = os.path.join(input_dir, 'vis', f'depth_{idx_to_str(0)}.tiff') depth_frame = load_img(depth_file) shape = depth_frame.shape p = 3 distance_limits = np.percentile(depth_frame.flatten(), [p, 100 - p]) lo, hi = [render_dist_curve_fn(x) for x in distance_limits] print(f'Video shape is {shape[:2]}') video_kwargs = { 'shape': shape[:2], 'codec': 'h264', 'fps': 60, 'crf': 18, } for k in ['depth', 'normal', 'color']: video_file = os.path.join(base_dir, f'{video_prefix}_{k}.mp4') input_format = 'gray' if k == 'alpha' else 'rgb' file_ext = 'png' if k in ['color', 'normal'] else 'tiff' idx = 0 if k == 'color': file0 = os.path.join(input_dir, 'renders', f'{idx_to_str(0)}.{file_ext}') else: file0 = os.path.join(input_dir, 'vis', f'{k}_{idx_to_str(0)}.{file_ext}') if not os.path.exists(file0): print(f'Images missing for tag {k}') continue print(f'Making video {video_file}...') with media.VideoWriter( video_file, **video_kwargs, input_format=input_format) as writer: for idx in tqdm(range(num_frames)): if k == 'color': img_file = os.path.join(input_dir, 'renders', f'{idx_to_str(idx)}.{file_ext}') else: img_file = os.path.join(input_dir, 'vis', f'{k}_{idx_to_str(idx)}.{file_ext}') if not os.path.exists(img_file): ValueError(f'Image file {img_file} does not exist.') img = load_img(img_file) if k in ['color', 'normal']: img = img / 255. elif k.startswith('depth'): img = render_dist_curve_fn(img) img = np.clip((img - np.minimum(lo, hi)) / np.abs(hi - lo), 0, 1) img = cm.get_cmap('turbo')(img)[..., :3] frame = (np.clip(np.nan_to_num(img), 0., 1.) * 255.).astype(np.uint8) writer.add_image(frame) idx += 1 def save_img_u8(img, pth): """Save an image (probably RGB) in [0, 1] to disk as a uint8 PNG.""" with open(pth, 'wb') as f: Image.fromarray( (np.clip(np.nan_to_num(img), 0., 1.) 
* 255.).astype(np.uint8)).save( f, 'PNG') def save_img_f32(depthmap, pth): """Save an image (probably a depthmap) to disk as a float32 TIFF.""" with open(pth, 'wb') as f: Image.fromarray(np.nan_to_num(depthmap).astype(np.float32)).save(f, 'TIFF') ================================================ FILE: tools/semantic_id.py ================================================ BACKGROUND = 0 text_label_dict = { 'window': BACKGROUND, 'sky': BACKGROUND, 'sky window': BACKGROUND, 'window sky': BACKGROUND, 'floor': 2, } ================================================ FILE: tools/sh_utils.py ================================================ # Copyright 2021 The PlenOctree Authors. # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
# Real spherical-harmonics normalisation constants, grouped by band.
C0 = 0.28209479177387814
C1 = 0.4886025119029199
C2 = [
    1.0925484305920792,
    -1.0925484305920792,
    0.31539156525252005,
    -1.0925484305920792,
    0.5462742152960396
]
C3 = [
    -0.5900435899266435,
    2.890611442640554,
    -0.4570457994644658,
    0.3731763325901154,
    -0.4570457994644658,
    1.445305721320277,
    -0.5900435899266435
]
C4 = [
    2.5033429417967046,
    -1.7701307697799304,
    0.9461746957575601,
    -0.6690465435572892,
    0.10578554691520431,
    -0.6690465435572892,
    0.47308734787878004,
    -1.7701307697799304,
    0.6258357354491761,
]


def eval_sh(deg, sh, dirs):
    """Evaluate a spherical-harmonics expansion along the given directions.

    Only ``...`` indexing, broadcasting and arithmetic are used, so any array
    type offering those works.

    Args:
        deg: SH degree to evaluate; must satisfy ``0 <= deg <= 4``.
        sh: coefficients, shape ``[..., C, K]`` with ``K >= (deg + 1) ** 2``.
        dirs: directions, shape ``[..., 3]`` (x, y, z in the last axis).

    Returns:
        Array of shape ``[..., C]``.

    Raises:
        AssertionError: if ``deg`` is out of range or ``sh`` carries too few
            coefficients for ``deg``.
    """
    assert deg <= 4 and deg >= 0
    coeff = (deg + 1) ** 2
    assert sh.shape[-1] >= coeff

    # Band 0: constant term.
    result = C0 * sh[..., 0]
    if deg <= 0:
        return result

    # Band 1: linear in the direction components.
    x, y, z = dirs[..., 0:1], dirs[..., 1:2], dirs[..., 2:3]
    result = result - C1 * y * sh[..., 1]
    result = result + C1 * z * sh[..., 2]
    result = result - C1 * x * sh[..., 3]
    if deg <= 1:
        return result

    # Band 2: quadratic monomials.
    xx, yy, zz = x * x, y * y, z * z
    xy, yz, xz = x * y, y * z, x * z
    result = result + C2[0] * xy * sh[..., 4]
    result = result + C2[1] * yz * sh[..., 5]
    result = result + C2[2] * (2.0 * zz - xx - yy) * sh[..., 6]
    result = result + C2[3] * xz * sh[..., 7]
    result = result + C2[4] * (xx - yy) * sh[..., 8]
    if deg <= 2:
        return result

    # Band 3.
    result = result + C3[0] * y * (3 * xx - yy) * sh[..., 9]
    result = result + C3[1] * xy * z * sh[..., 10]
    result = result + C3[2] * y * (4 * zz - xx - yy) * sh[..., 11]
    result = result + C3[3] * z * (2 * zz - 3 * xx - 3 * yy) * sh[..., 12]
    result = result + C3[4] * x * (4 * zz - xx - yy) * sh[..., 13]
    result = result + C3[5] * z * (xx - yy) * sh[..., 14]
    result = result + C3[6] * x * (xx - 3 * yy) * sh[..., 15]
    if deg <= 3:
        return result

    # Band 4.
    result = result + C4[0] * xy * (xx - yy) * sh[..., 16]
    result = result + C4[1] * yz * (3 * xx - yy) * sh[..., 17]
    result = result + C4[2] * xy * (7 * zz - 1) * sh[..., 18]
    result = result + C4[3] * yz * (7 * zz - 3) * sh[..., 19]
    result = result + C4[4] * (zz * (35 * zz - 30) + 3) * sh[..., 20]
    result = result + C4[5] * xz * (7 * zz - 3) * sh[..., 21]
    result = result + C4[6] * (xx - yy) * (7 * zz - 1) * sh[..., 22]
    result = result + C4[7] * xz * (xx - 3 * yy) * sh[..., 23]
    result = result + C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)) * sh[..., 24]
    return result


def RGB2SH(rgb):
    """Map a color value to its band-0 SH coefficient: ``(rgb - 0.5) / C0``."""
    centered = rgb - 0.5
    return centered / C0


def SH2RGB(sh):
    """Inverse of ``RGB2SH``: ``sh * C0 + 0.5``."""
    scaled = sh * C0
    return scaled + 0.5
def _colored(x, name):
    """Return ``str(x)`` wrapped by ``termcolor.colored`` in color ``name``."""
    return termcolor.colored(str(x), color=name)


def red(x):
    """Return ``x`` rendered as red terminal text."""
    return _colored(x, "red")


def green(x):
    """Return ``x`` rendered as green terminal text."""
    return _colored(x, "green")


def blue(x):
    """Return ``x`` rendered as blue terminal text."""
    return _colored(x, "blue")


def cyan(x):
    """Return ``x`` rendered as cyan terminal text."""
    return _colored(x, "cyan")


def yellow(x):
    """Return ``x`` rendered as yellow terminal text."""
    return _colored(x, "yellow")


def magenta(x):
    """Return ``x`` rendered as magenta terminal text."""
    return _colored(x, "magenta")


def grey(x):
    """Return ``x`` rendered as grey terminal text."""
    return _colored(x, "grey")


# Lookup from color name to the matching helper above; used by ``alert``.
COLORS = {
    'red': red,
    'green': green,
    'blue': blue,
    'cyan': cyan,
    'yellow': yellow,
    'magenta': magenta,
    'grey': grey
}


def PP(x):
    """Pretty-format ``x`` with ``pprint`` (indent 2).

    For dicts the outer braces are moved onto their own lines so the entries
    sit between a leading ``{`` line and a trailing ``}`` line.
    """
    formatted = pprint.pformat(x, indent=2)
    if not isinstance(x, dict):
        return formatted
    inner = formatted[1:-1]
    return '{\n ' + inner + '\n}'


def alert(x, color='red'):
    """Print ``x`` between two 32-dash rules, all in the requested color.

    Raises:
        KeyError: if ``color`` is not a key of ``COLORS``.
    """
    paint = COLORS[color]
    rule = '-' * 32
    for line in (rule, f'* {x}', rule):
        print(paint(line))
def wandb_sem(image, palette=PALETTE):
    """Colorize an integer label map and wrap it as a ``wandb.Image``.

    Args:
        image: 2-D tensor of class ids; every id must be a valid row index
            into ``palette``.
        palette: color table indexed by class id, one RGB row (0-255) per
            class. Defaults to the module-level ``PALETTE``.

    Returns:
        A ``wandb.Image`` holding the colorized map.
    """
    labels = image.detach().long().cpu()
    # Index the caller-supplied table; the global PALETTE used to be
    # hard-coded here, which silently ignored the ``palette`` argument.
    colors = palette[labels].float()
    # ``to_pil_image`` rescales floating-point input by 255, so the 0-255
    # palette entries have to be brought into [0, 1] first or they wrap.
    colors = colors / 255.0
    colored = colors.permute(2, 0, 1)[None]  # [1,3,H,W]
    return wandb.Image(tensor2pil(colored))


def tensor2pil(images):
    """Stack a batch of images vertically (one per row) into one PIL image."""
    image_grid = torchvision.utils.make_grid(images, nrow=1, pad_value=1)
    return torchvision_F.to_pil_image(image_grid)


def get_heatmap(gray, cmap):  # [N,H,W]
    """Map scalar images through the matplotlib colormap named ``cmap``.

    Returns the RGB channels (alpha dropped) as a float tensor [N,3,H,W].
    """
    rgba = plt.get_cmap(cmap)(gray.numpy())
    rgb = torch.from_numpy(rgba[..., :3])
    return rgb.permute(0, 3, 1, 2).float()  # [N,3,H,W]


def save_render(render, path):
    """Clamp a channel-first render to [0, 1] and save it as uint8 at ``path``."""
    image = torch.clamp(render, 0.0, 1.0).detach().cpu()
    # channel-first -> channel-last for imageio
    image = (image.permute(1, 2, 0).numpy() * 255).astype('uint8')
    imageio.imsave(path, image)
def get_camera_mesh(pose, depth=1):
    """Build a pyramid-shaped camera frustum for every pose.

    Args:
        pose: camera poses handed to ``camera.cam2world``.
        depth: scale applied to the unit frustum.

    Returns:
        ``(vertices, faces, wireframe)``: the transformed frustum vertices
        (4 image-plane corners followed by the camera center), the 6 triangle
        index rows of a single frustum, and the vertices re-ordered into a
        10-point polyline that traces every frustum edge.
    """
    vertices = torch.tensor([[-0.5, -0.5, 1],
                             [0.5, -0.5, 1],
                             [0.5, 0.5, 1],
                             [-0.5, 0.5, 1],
                             [0, 0, 0]]) * depth  # [5,3]
    faces = torch.tensor([[0, 1, 2],
                          [0, 2, 3],
                          [0, 1, 4],
                          [1, 2, 4],
                          [2, 3, 4],
                          [3, 0, 4]])  # [6,3]
    vertices = camera.cam2world(vertices[None], pose)  # [N,5,3]
    wireframe = vertices[:, [0, 1, 2, 3, 0, 4, 1, 2, 4, 3]]  # [N,10,3]
    return vertices, faces, wireframe


def merge_meshes(vertices, faces):
    """Flatten per-camera meshes into one vertex list and one face list.

    Face indices of the i-th mesh are offset by ``i * vertex_N`` so they keep
    pointing at their own vertices after flattening.
    """
    mesh_N, vertex_N = vertices.shape[:2]
    faces_merged = torch.cat([faces + i * vertex_N for i in range(mesh_N)], dim=0)
    vertices_merged = vertices.view(-1, vertices.shape[-1])
    return vertices_merged, faces_merged


def merge_wireframes_k3d(wireframe):
    """Pad each polyline for K3D: repeat first and last point, then add a NaN row."""
    wf_first, wf_last, wf_dummy = wireframe[:, :1], wireframe[:, -1:], wireframe[:, :1] * np.nan
    wireframe_merged = torch.cat([wf_first, wireframe, wf_last, wf_dummy], dim=1)
    return wireframe_merged


def merge_wireframes_plotly(wireframe):
    """Append a NaN row to each polyline and flatten to ``[-1, 3]`` for plotly."""
    wf_dummy = wireframe[:, :1] * np.nan
    wireframe_merged = torch.cat([wireframe, wf_dummy], dim=1).view(-1, 3)
    return wireframe_merged


def get_xyz_indicators(pose, length=0.1):
    """Return axis-tip points plus the origin, transformed by ``camera.cam2world``.

    The camera-frame points are the rows of ``torch.eye(4, 3) * length``:
    the x, y and z tips followed by the origin.
    """
    xyz = torch.eye(4, 3)[None] * length
    xyz = camera.cam2world(xyz, pose)
    return xyz


def merge_xyz_indicators_k3d(xyz):  # [N,4,3]
    """Turn axis indicators into K3D segments (doubled end points + NaN row)."""
    xyz = xyz[:, [[-1, 0], [-1, 1], [-1, 2]]]  # [N,3,2,3]
    xyz_0, xyz_1 = xyz.unbind(dim=2)  # [N,3,3]
    xyz_dummy = xyz_0 * np.nan
    xyz_merged = torch.stack([xyz_0, xyz_0, xyz_1, xyz_1, xyz_dummy], dim=2)  # [N,3,5,3]
    return xyz_merged


def merge_xyz_indicators_plotly(xyz):  # [N,4,3]
    """Turn axis indicators into flat plotly segments (origin, tip, NaN row)."""
    xyz = xyz[:, [[-1, 0], [-1, 1], [-1, 2]]]  # [N,3,2,3]
    xyz_0, xyz_1 = xyz.unbind(dim=2)  # [N,3,3]
    xyz_dummy = xyz_0 * np.nan
    xyz_merged = torch.stack([xyz_0, xyz_1, xyz_dummy], dim=2)  # [N,3,3,3]
    xyz_merged = xyz_merged.view(-1, 3)
    return xyz_merged


def _trajectory_fraction(i, N):
    """Position of pose ``i`` along the trajectory in [0, 1]; 0 for a single pose."""
    # Guard the N == 1 case, where i / (N - 1) would divide by zero.
    return i / (N - 1) if N > 1 else 0.0


def k3d_visualize_pose(poses, vis_depth=0.5, xyz_length=0.1, center_size=0.1, xyz_width=0.02, mesh_opacity=0.05):
    """Create K3D objects showing camera centers, frustums and axis indicators.

    Args:
        poses: camera poses, [N,3,4], potentially in sequential order.
        vis_depth: frustum size.
        xyz_length: length of the per-camera axis indicators.
        center_size: point size of the camera centers.
        xyz_width: line width of the axis indicators.
        mesh_opacity: opacity of the frustum faces.

    Returns:
        List of K3D objects: center points, frustum mesh, frustum wireframe
        and axis lines.
    """
    N = len(poses)
    centers_cam = torch.zeros(N, 1, 3)
    centers_world = camera.cam2world(centers_cam, poses)
    centers_world = centers_world[:, 0]
    # Get the camera wireframes.
    vertices, faces, wireframe = get_camera_mesh(poses, depth=vis_depth)
    xyz = get_xyz_indicators(poses, length=xyz_length)
    vertices_merged, faces_merged = merge_meshes(vertices, faces)
    wireframe_merged = merge_wireframes_k3d(wireframe)
    xyz_merged = merge_xyz_indicators_k3d(xyz)
    # Set the color map for the camera trajectory and the xyz indicators.
    color_map = plt.get_cmap("gist_rainbow")
    center_color = []
    vertices_merged_color = []
    wireframe_color = []
    xyz_color = []
    x_hex, y_hex, z_hex = int(255) << 16, int(255) << 8, int(255)
    for i in range(N):
        # Set the camera pose colors (with a smooth gradient color map).
        r, g, b, _ = color_map(_trajectory_fraction(i, N))
        r, g, b = r * 0.8, g * 0.8, b * 0.8
        pose_rgb_hex = (int(r * 255) << 16) + (int(g * 255) << 8) + int(b * 255)
        # One color entry per merged point: 5 frustum vertices, 13 polyline points.
        center_color += [pose_rgb_hex]
        vertices_merged_color += [pose_rgb_hex] * 5
        wireframe_color += [pose_rgb_hex] * 13
        # Set the xyz indicator colors.
        xyz_color += [x_hex] * 5 + [y_hex] * 5 + [z_hex] * 5
    # Plot in K3D.
    k3d_objects = [
        k3d.points(centers_world, colors=center_color, point_size=center_size, shader="3d"),
        k3d.mesh(vertices_merged, faces_merged, colors=vertices_merged_color, side="double", opacity=mesh_opacity),
        k3d.line(wireframe_merged, colors=wireframe_color, shader="simple"),
        k3d.line(xyz_merged, colors=xyz_color, shader="thick", width=xyz_width),
    ]
    return k3d_objects


def plotly_visualize_pose(poses, vis_depth=0.5, xyz_length=0.5, center_size=2, xyz_width=5, mesh_opacity=0.05):
    """Create plotly traces showing camera centers, frustums and axis indicators.

    Args:
        poses: camera poses, [N,3,4], potentially in sequential order.
        vis_depth: frustum size.
        xyz_length: length of the per-camera axis indicators.
        center_size: marker size of the camera centers.
        xyz_width: line width of the axis indicators.
        mesh_opacity: opacity of the frustum faces.

    Returns:
        List of plotly traces: frustum wireframe, axis lines, center markers
        and frustum mesh.
    """
    N = len(poses)
    centers_cam = torch.zeros(N, 1, 3)
    centers_world = camera.cam2world(centers_cam, poses)
    centers_world = centers_world[:, 0]
    # Get the camera wireframes.
    vertices, faces, wireframe = get_camera_mesh(poses, depth=vis_depth)
    xyz = get_xyz_indicators(poses, length=xyz_length)
    vertices_merged, faces_merged = merge_meshes(vertices, faces)
    wireframe_merged = merge_wireframes_plotly(wireframe)
    xyz_merged = merge_xyz_indicators_plotly(xyz)
    # Break up (x,y,z) coordinates.
    wireframe_x, wireframe_y, wireframe_z = wireframe_merged.unbind(dim=-1)
    xyz_x, xyz_y, xyz_z = xyz_merged.unbind(dim=-1)
    centers_x, centers_y, centers_z = centers_world.unbind(dim=-1)
    vertices_x, vertices_y, vertices_z = vertices_merged.unbind(dim=-1)
    # Set the color map for the camera trajectory and the xyz indicators.
    color_map = plt.get_cmap("gist_rainbow")
    center_color = []
    faces_merged_color = []
    wireframe_color = []
    xyz_color = []
    x_color, y_color, z_color = *np.eye(3).T,
    for i in range(N):
        # Set the camera pose colors (with a smooth gradient color map).
        r, g, b, _ = color_map(_trajectory_fraction(i, N))
        rgb = np.array([r, g, b]) * 0.8
        # One color entry per merged point/face: 11 polyline points, 6 faces.
        wireframe_color += [rgb] * 11
        center_color += [rgb]
        faces_merged_color += [rgb] * 6
        xyz_color += [x_color] * 3 + [y_color] * 3 + [z_color] * 3
    # Plot in plotly.
    plotly_traces = [
        go.Scatter3d(x=wireframe_x, y=wireframe_y, z=wireframe_z, mode="lines",
                     line=dict(color=wireframe_color, width=1)),
        go.Scatter3d(x=xyz_x, y=xyz_y, z=xyz_z, mode="lines",
                     line=dict(color=xyz_color, width=xyz_width)),
        go.Scatter3d(x=centers_x, y=centers_y, z=centers_z, mode="markers",
                     marker=dict(color=center_color, size=center_size, opacity=1)),
        go.Mesh3d(x=vertices_x, y=vertices_y, z=vertices_z,
                  i=[f[0] for f in faces_merged],
                  j=[f[1] for f in faces_merged],
                  k=[f[2] for f in faces_merged],
                  facecolor=faces_merged_color, opacity=mesh_opacity),
    ]
    return plotly_traces
def __init__(self, cfg):
    """Derive data-loading flags from the loss weights and build all components.

    The flags written onto ``cfg.model`` are set before the config is printed
    and before model, dataset and optimizer are created from it.

    Args:
        cfg: global configuration; ``cfg.model`` is modified in place.
    """
    self.cfg = cfg
    set_random_seed(cfg.seed)

    model_cfg = cfg.model
    model_cfg.model_path = cfg.logdir
    self.sphere = getattr(model_cfg, 'sphere', False)

    # Only load auxiliary supervision that an enabled loss actually consumes.
    needs_normal = cfg.optim.loss_weight.mono_normal > 0 \
        or cfg.optim.loss_weight.depth_normal > 0
    model_cfg.load_normal = needs_normal
    model_cfg.load_depth = cfg.optim.loss_weight.mono_depth > 0

    semantic_weight = getattr(cfg.optim.loss_weight, 'semantic', 0)
    self.enable_semantic = semantic_weight > 0
    model_cfg.enable_semantic = self.enable_semantic
    # Semantic supervision forces mask loading regardless of the config value.
    model_cfg.load_mask = self.enable_semantic or model_cfg.load_mask

    cfg.print_config()
    safe_state(cfg.silent)

    self.setup_model(cfg.model)
    self.setup_dataset(cfg.model)
    self.setup_optimizer(cfg.optim)
    self.init_attributes()
    self.init_losses()

    # The GUI server is only started for a positive port.
    gui_enabled = cfg.port > 0
    if gui_enabled:
        network_gui.init(cfg.ip, cfg.port)
    torch.autograd.set_detect_anomaly(cfg.detect_anomaly)
def setup_dataset(self, cfg):
    """Create the output folder and the scene, then copy scene metadata to the model.

    Args:
        cfg: model configuration; ``cfg.model_path`` is the output folder.
    """
    os.makedirs(cfg.model_path, exist_ok=True)
    scene = Scene(cfg, self.model)
    self.scene = scene
    model = self.model
    model.trans = torch.from_numpy(scene.trans).cuda()
    model.scale = torch.from_numpy(scene.scale).cuda()
    model.extent = scene.cameras_extent


def init_writer(self, cfg):
    """Prepare the output folder, dump the config and open a Tensorboard writer.

    If ``cfg.model.model_path`` is empty, a folder under ``./output/`` is
    chosen from the first 10 characters of ``$OAR_JOB_ID`` or, failing that,
    of a random UUID.

    Args:
        cfg: global configuration; ``cfg.model.model_path`` may be filled in.
    """
    model_cfg = cfg.model
    if not model_cfg.model_path:
        unique_str = os.getenv('OAR_JOB_ID') or str(uuid.uuid4())
        model_cfg.model_path = os.path.join("./output/", unique_str[0:10])

    # Set up output folder
    out_dir = model_cfg.model_path
    print("Output folder: {}".format(out_dir))
    os.makedirs(out_dir, exist_ok=True)
    cfg_dump = os.path.join(out_dir, "cfg_args")
    with open(cfg_dump, 'w') as cfg_log_f:
        cfg_log_f.write(str(Namespace(**vars(cfg))))

    # Create Tensorboard writer
    if not TENSORBOARD_FOUND:
        print("Tensorboard not available: not logging progress")
        return
    self.writer = SummaryWriter(out_dir)
def init_losses(self):
    r"""Reset the loss dict and collect the enabled loss weights.

    A loss counts as enabled when its configured weight is truthy. The
    depth criterion is only built when ``mono_depth`` is enabled.
    """
    self.losses = dict()
    enabled = {}
    for name, weight in self.cfg.optim.loss_weight.items():
        if weight:
            enabled[name] = weight
    self.weights = enabled
    if 'mono_depth' in enabled:
        self.depth_loss = ScaleAndShiftInvariantLoss(alpha=0.5, scales=1)


def setup_optimizer(self, cfg):
    """Delegate optimizer construction to ``self.model.training_setup``."""
    model = self.model
    model.training_setup(cfg)
def train(self):
    """Run the optimisation loop from the (possibly restored) start iteration.

    Each pass advances ``self.current_iteration`` by one and calls
    ``start_of_iteration``, ``train_step`` and ``end_of_iteration`` in order.
    """
    progress_bar = tqdm(range(self.first_iter, self.max_iters), desc="Training progress")
    # Resume counting where the checkpoint left off, then step past it.
    self.current_iteration += self.first_iter
    self.first_iter += 1
    last_iter = self.max_iters + 1
    for _ in range(self.first_iter, last_iter):
        self.current_iteration += 1
        self.start_of_iteration()
        step_output = self.train_step(mode='train')
        self.end_of_iteration(step_output, render, progress_bar)


def get_center_scale(self):
    """Load ``trans`` and ``scale`` from ``meta.json`` onto the model.

    Both are stored as float32 CUDA parameters with ``requires_grad=False``.
    """
    meta_fname = f"{self.cfg.model.source_path}/meta.json"
    with open(meta_fname) as file:
        meta = json.load(file)

    def as_frozen_param(values):
        array = np.array(values, dtype=np.float32)
        tensor = torch.from_numpy(array.astype(np.float32)).to("cuda")
        return torch.nn.parameter.Parameter(tensor, requires_grad=False)

    # center scene
    self.model.trans = as_frozen_param(meta["trans"])
    # scale scene
    self.model.scale = as_frozen_param(meta["scale"])
def _compute_loss(self, data, mode=None):
    """Fill ``self.losses`` with the individual loss terms for one rendered view.

    Args:
        data: dict holding the sampled camera under ``'viewpoint_cam'`` plus
            the renderer outputs read below (``'render'``, ``'depth'``,
            ``'normal'``, ``'est_normal'``, ``'mask'``, ``'distortion'``,
            ``'depth_var'``, ``'render_sem'``), depending on which losses are
            enabled.
        mode: terms are computed when this equals ``'train'``.

    Nothing is returned; results are stored in ``self.losses`` keyed by loss
    name. Optional terms are gated on their name being present in
    ``self.weights`` and, for several of them, on ``self.current_iteration``
    exceeding a start iteration taken from ``self.cfg.optim``.
    """
    if mode == 'train':
        gt_image = data['viewpoint_cam'].original_image.cuda()
        # Photometric terms: plain L1, or the appearance-aware L1 when
        # decoupled appearance is enabled, plus (1 - SSIM).
        self.losses['l1'] = l1_loss(data['render'], gt_image) if not self.cfg.model.use_decoupled_appearance \
            else L1_loss_appearance(data['render'], gt_image, self.model, data['viewpoint_cam'].idx)
        self.losses['ssim'] = 1.0 - ssim(data['render'], gt_image)

        # Gaussian selection shared by the per-Gaussian regularisers below.
        if 'l1_scale' in self.weights or 'entropy' in self.weights or 'proj' in self.weights or 'repul' in self.weights:
            mask, _ = self.model.get_inside_gaus_normalized()

        if 'l1_scale' in self.weights and not self.sphere:
            # L1 penalty on the smallest scale component of each selected Gaussian.
            scaling = self.model.get_scaling[mask].min(-1)[0]
            self.losses['l1_scale'] = l1_loss(scaling, torch.zeros_like(scaling))

        if 'entropy' in self.weights:
            opacity = self.model.get_opacity[mask]
            self.losses['entropy'] = entropy_loss(opacity)

        if 'mono_depth' in self.weights:
            render_depth = data['depth']
            gt_depth = data['viewpoint_cam'].depth.cuda().float()
            mask = None
            if self.cfg.model.load_mask:
                mask = data['viewpoint_cam'].mask
            # NOTE(review): this assignment replaces any mask chosen above, so
            # the camera mask never reaches the depth loss - confirm whether
            # the two were meant to be combined.
            mask = render_depth > 0
            self.losses['mono_depth'] = self.depth_loss(render_depth, gt_depth, mask)

        if 'mono_normal' in self.weights and self.current_iteration > self.cfg.optim.normal_from_iter:
            render_normal = data['normal']
            gt_normal = data['viewpoint_cam'].normal.cuda()
            self.losses['mono_normal'] = monosdf_normal_loss(render_normal, gt_normal)

        if 'depth_normal' in self.weights and self.current_iteration > self.cfg.optim.dnormal_from_iter:
            est_normal = data['est_normal']
            gt_normal = data['viewpoint_cam'].normal.cuda()
            render_normal = data['normal'].detach()
            mask = data['mask']
            # Per-pixel weights come from the detached rendered normal and the
            # reference normal; no gradient flows through them.
            with torch.no_grad():
                weights = cos_weight(render_normal, gt_normal, self.cfg.optim.exp_t)
            if mask.sum() != 0:
                est_normal, gt_normal = est_normal[mask], gt_normal[mask]
                # NOTE(review): the masked render_normal is not read again below.
                render_normal = render_normal[mask]
                weights = weights[mask]
                self.losses['depth_normal'] = monosdf_normal_loss(est_normal, gt_normal, weights)
            else:
                # Empty mask: contribute nothing (plain number, not a tensor).
                self.losses['depth_normal'] = 0

        if 'curv' in self.weights and self.current_iteration > self.cfg.optim.curv_from_iter:
            est_normal = data['est_normal']  # h, w, 3
            mask = data['mask'][..., None].clone()  # h, w, 1
            mask = mask.float()
            curv = normal2curv(est_normal, mask)
            self.losses['curv'] = l1_loss(curv, 0)

        if 'consistent_normal' in self.weights and self.current_iteration > self.cfg.optim.consistent_normal_from_iter:
            est_normal = data['est_normal']
            render_normal = data['normal']
            # NOTE(review): mask is fetched but not passed to the loss below.
            mask = data['mask']
            self.losses['consistent_normal'] = monosdf_normal_loss(est_normal, render_normal)

        # Both edge-aware terms below start at close_depth_from_iter.
        if 'distortion' in self.weights and self.current_iteration > self.cfg.optim.close_depth_from_iter:
            distortion_map = data['distortion']
            distortion_map = get_edge_aware_distortion_map(gt_image, distortion_map)
            self.losses['distortion'] = distortion_map.mean()

        if 'depth_var' in self.weights and self.current_iteration > self.cfg.optim.close_depth_from_iter:
            depth_var = data['depth_var']
            depth_var = get_edge_aware_distortion_map(gt_image, depth_var)
            self.losses['depth_var'] = depth_var.mean()

        if 'semantic' in self.weights:
            sem_logits = data['render_sem']
            sem_trg = data['viewpoint_cam'].mask.view(-1)
            self.losses['semantic'] = F.cross_entropy(sem_logits.view(-1, self.model.num_cls), sem_trg) / torch.log(torch.tensor(self.model.num_cls))  # normalize to (0,1)
def train_step(self, mode='train'):
    """Run one optimisation step on a randomly drawn training camera.

    The step renders the view, backpropagates the total loss, updates the
    densification statistics, optionally densifies / prunes / resets opacity,
    and finally steps and zeroes the optimizer.

    Args:
        mode: forwarded to ``model_forward``.

    Returns:
        The ``data`` dict with the render outputs; ``'viewspace_points'``,
        ``'visibility_filter'`` and ``'radii'`` have been removed from it.
    """
    optim = self.cfg.optim
    data = dict()

    # Pick a random camera; refill the stack once every view has been used.
    if not self.viewpoint_stack:
        self.viewpoint_stack = self.scene.getTrainCameras().copy()
    data['viewpoint_cam'] = self.viewpoint_stack.pop(randint(0, len(self.viewpoint_stack) - 1))

    # Render
    if (self.current_iteration - 1) == self.debug_from:
        self.cfg.pipline.debug = True
    data['bg'] = torch.rand((3), device="cuda") if optim.random_background else self.background
    loss = self.model_forward(data, mode)
    loss.backward()

    # Strip the densification bookkeeping tensors from the returned dict.
    data.pop("viewspace_points")
    visibility_filter = data.pop("visibility_filter")
    radii = data.pop("radii")

    with torch.no_grad():
        # Densification
        if self.current_iteration < optim.densify_until_iter:
            # Keep track of max radii in image-space for pruning
            self.model.max_radii2D[visibility_filter] = torch.max(
                self.model.max_radii2D[visibility_filter], radii[visibility_filter])
            viewspace_point_tensor_densify = data["viewspace_points_densify"]
            self.model.add_densification_stats(viewspace_point_tensor_densify, visibility_filter)

            # Accumulate per-Gaussian counts between two densification rounds.
            if self.current_iteration > optim.densify_from_iter \
                    and hasattr(optim, 'densify_large'):
                if 'countlist' in data:
                    visi_list_each = data['countlist']
                    self.visi_list = visi_list_each if self.visi_list is None \
                        else self.visi_list + visi_list_each

            if self.current_iteration > optim.densify_from_iter \
                    and self.current_iteration % optim.densification_interval == 0:
                size_threshold = 20 if self.current_iteration > optim.opacity_reset_interval else None
                visi = None
                if getattr(optim, 'densify_large', False) and optim.densify_large.sample_cams.num > 0 \
                        and getattr(optim.densify_large, 'percent_dense', 0):
                    sample_cams = optim.densify_large.sample_cams
                    visi = self.get_visi_mask_acc(sample_cams.num, sample_cams.up,
                                                  sample_cams.around, sample_mode='random')
                    if self.visi_list is not None:
                        # ``&`` binds tighter than ``>``: without the
                        # parentheses this evaluated (visi & counts) > 0,
                        # which for integer counts only keeps odd counts.
                        visi = visi & (self.visi_list > 0)
                self.model.densify_and_prune(optim.densify_grad_threshold, 0.005,
                                             self.scene.cameras_extent, size_threshold, visi)
                self.visi_list = None

            if self.current_iteration % optim.opacity_reset_interval == 0 or \
                    (self.cfg.model.white_background and self.current_iteration == optim.densify_from_iter):
                self.model.reset_opacity()

        if self.current_iteration in optim.prune.iterations:
            # TODO Add prunning types
            viewpoint_stack = self.scene.getFullCameras().copy()
            _, imp_list = prune_list(self.model, viewpoint_stack, self.cfg.pipline, self.background)
            # The prune ratio decays with the index of the current prune round.
            i = optim.prune.iterations.index(self.current_iteration)
            v_list = calculate_v_imp_score(self.model, imp_list, optim.prune.v_pow)
            self.model.prune_gaussians(
                (optim.prune.decay ** i) * optim.prune.percent, v_list
            )

        # Optimizer step
        self.model.optimizer.step()
        self.model.optimizer.zero_grad(set_to_none=True)
    return data
def log_wandb_scalars(self, output, mode=None):
    """Log learning rates, iteration time, losses and numeric outputs to wandb.

    Args:
        output: dict of extra values; only ``int``/``float`` entries are logged.
        mode: prefix for the loss keys; learning rates are only logged for
            ``"train"``.
    """
    scalars = dict()
    if mode == "train":
        for param_group in self.model.optimizer.param_groups:
            scalars.update({"optim/lr_{}".format(param_group["name"]): param_group['lr']})
    scalars.update({"time/iteration": self.iter_start.elapsed_time(self.iter_end)})
    scalars.update({f"loss/{mode}_{key}": value for key, value in self.losses.items()})
    scalars.update(iteration=self.current_iteration)
    scalars.update({k: v for k, v in output.items() if isinstance(v, (int, float))})
    wandb.log(scalars, step=self.current_iteration)


def log_wandb_images(self, data, mode=None):
    """Log the render, depth and whichever optional maps ``data`` contains.

    Args:
        data: dict with at least ``"rgb_map"``, ``"image"`` and
            ``"depth_map"``; every other key handled below is optional.
        mode: tag used in the wandb image keys (``vis/{mode}...``).
    """
    # Render and reference image concatenated along dim 1.
    image = torch.cat([data["rgb_map"], data["image"]], dim=1)
    depth = data["depth_map"]
    inv_depth = depth.max() - depth
    images = {f'vis/{mode}': wandb_image(image),
              f'vis/{mode}_depth': wandb_image(depth, from_range=(depth.min(), depth.max())),
              f'vis/{mode}_inv_depth': wandb_image(inv_depth, from_range=(inv_depth.min(), inv_depth.max()))}
    # The depth_var image used to be built twice with identical content; once is enough.
    if 'depth_var' in data:
        depth_var = data['depth_var']
        images.update({f'vis/{mode}_depth_var': wandb_image(depth_var, from_range=(depth_var.min(), depth_var.max()))})
    if 'depth' in data:
        depth = data["depth"].detach().clone()
        images.update({f'vis/{mode}_depth_gt': wandb_image(depth, from_range=(depth.min(), depth.max()))})
    if 'mask' in data:
        mask = data['mask'].detach().clone().float()
        images.update({f'vis/{mode}_mask': wandb_image(mask)})
    if 'normal_map' in data:
        normal_map = data["normal_map"]
        images.update({f'vis/{mode}_normal': wandb_image(normal_map.permute(2, 0, 1), from_range=(-1, 1))})
    if 'normal' in data:
        normal = data["normal"].detach().clone()
        images.update({f'vis/{mode}_normal_gt': wandb_image(normal.permute(2, 0, 1), from_range=(-1, 1))})
        # The cosine map needs the rendered normal as well; skip it instead of
        # failing on an unbound name when only the reference normal is present.
        if 'normal_map' in data:
            cos = cos_weight(normal.cuda(), data["normal_map"], self.cfg.optim.exp_t)
            images.update({f'vis/{mode}_normal_cos': wandb_image(cos, from_range=(0, 1))})
    if 'est_normal' in data:
        est_normal = data["est_normal"].permute(2, 0, 1).detach().clone()
        images.update({f'vis/{mode}_est_normal': wandb_image(est_normal, from_range=(-1, 1))})
    if 'transformed_est_normal' in data:
        transformed_est_normal = data["transformed_est_normal"].permute(2, 0, 1).detach().clone()
        images.update({f'vis/{mode}_trans_est_normal': wandb_image(transformed_est_normal, from_range=(-1, 1))})
    if 'sem' in data:
        sem = data['sem']
        images.update({f'vis/{mode}_sem': wandb_sem(sem)})
    if 'distortion' in data:
        distortion = data['distortion']
        images.update({f'vis/{mode}_distortion': wandb_image(distortion, from_range=(distortion.min(), distortion.max()))})
    if 'trans_image' in data:
        trans_image = data['trans_image']
        images.update({f'vis/{mode}_trans': wandb_image(trans_image)})
    wandb.log(images, step=self.current_iteration)


def log_hist(self, tensor, name, num_bin=10):
    """Log a normalised histogram of ``tensor`` to wandb as ``statistic/{name}``.

    Args:
        tensor: values passed to ``np.histogram``.
        name: used in the plot title and the wandb key.
        num_bin: number of histogram bins.
    """
    counts, bins = np.histogram(tensor, bins=num_bin)
    # Normalise the bin counts so they sum to 1.
    density = counts / counts.sum()
    plt.stairs(density, bins)
    plt.title('Histogram {}'.format(name))
    wandb.log({f'statistic/{name}': wandb.Image(plt)}, step=self.current_iteration)
    plt.close()
= est_normal if 'render_sem' in out: pred = self.model.logits_2_label(out['render_sem']).to(torch.uint8) data['sem'] = torch.cat([pred, sem_mask], dim=0) self.log_wandb_images(data, mode=config['name']) if False: data = {"image": gt_image, "rgb_map": image, "depth_map": depth} if 'mask' in out: data['mask'] = out['mask'] if viewpoint.depth is not None: data['depth'] = viewpoint.depth if 'depth_var' in out: data['depth_var'] = out['depth_var'] if normal is not None: data["normal_map"] = normal if viewpoint.normal is not None: data['normal'] = viewpoint.normal if est_normal is not None: data['est_normal'] = est_normal cos = cos_weight(normal.cuda(), normal, self.cfg.optim.exp_t) data['normal_cos'] = cos self.save_vis(data, viewpoint.image_name, mode=config['name']) l1_test += l1_loss(image, gt_image).mean().double() psnr_test += psnr(image, gt_image).mean().double() psnr_test /= len(config['cameras']) l1_test /= len(config['cameras']) if self.enable_semantic: miou = self.calc_miou.compute() self.calc_miou.reset() output.update({ f'statistic/{config["name"]}_PSNR': psnr_test.item(), f'loss/{config["name"]}_l1': l1_test.item(), }) if self.enable_semantic: output[f'statistic/{config["name"]}_mIoU'] = miou.item() output.update({ 'statistic/total_points': self.scene.gaussians.get_xyz.shape[0], }) self.log_hist(self.model.get_opacity.cpu().numpy(), "opacity") torch.cuda.empty_cache() return output def finalize(self): # Finish the W&B logger. 
wandb.finish() def log_writer(self, mode=None): if self.writer: for key, value in self.losses.items(): self.writer.add_scalar(f"loss/{mode}_{key}", value, global_step=self.current_iteration) def save_vis(self, data, name, mode='train'): image = torch.clamp(data["rgb_map"], 0.0, 1.0).detach().cpu() image = (image.permute(1, 2, 0).numpy() * 255).astype('uint8') imageio.imsave(os.path.join(self.vis_color_path, mode, f"{name}.png"), image) normal = preprocess_image(data["normal_map"].permute(2, 0, 1), from_range=(-1, 1)) normal.save(os.path.join(self.vis_normal_path, mode, f"{name}.png")) if False: normal_gt = preprocess_image(data["normal"].permute(2, 0, 1), from_range=(-1, 1)) gt_normal_path = os.path.join(self.vis_normal_path+'_gt', mode) if not os.path.exists(gt_normal_path): os.makedirs(gt_normal_path, exist_ok=True) normal_gt.save(os.path.join(gt_normal_path, f"{name}.png")) dnormal = preprocess_image(data["est_normal"].permute(2, 0, 1), from_range=(-1, 1)) dnormal.save(os.path.join(self.vis_dnormal_path, mode, f"{name}.png")) cos = preprocess_image(data["normal_cos"], from_range=(0, 1)) cos.save(os.path.join(self.vis_cos_path, mode, f"{name}.png")) return def sample_cameras(self, n, up=False, around=True, look_mode='target', sample_mode='grid', bidirect=True): # direction target cam_height = None w2cs = bb_camera(n, self.model.trans, self.model.scale, cam_height, up=up, around=around, \ look_mode=look_mode, sample_mode=sample_mode, bidirect=bidirect) FoVx = FoVy = 2.5 width = height = 1500 cams = [] for i in range(w2cs.shape[0]): w2c = w2cs[i] cam = SampleCam(w2c, width, height, FoVx, FoVy) cams.append(cam) return cams @torch.no_grad() def get_visi_mask(self, n=500, up=False, around=True, denoise_after=False, \ denoise_before=True, nb_points=10, viewpoint_stack=None, sample_mode='grid', cat_cams=False): # direction target if viewpoint_stack is None: if self.cfg.optim.densify_large.sample_cams.random: viewpoint_stack = self.sample_cameras(n, up, around, 
sample_mode=sample_mode) if cat_cams: viewpoint_stack += self.scene.getTrainCameras().copy() else: viewpoint_stack = self.scene.getTrainCameras().copy() model = deepcopy(self.model) if denoise_before: mask = torch.ones(model.get_xyz.shape[0], dtype=torch.bool, device="cuda") valid = model.filter_points() mask[valid] = False model.prune_points(mask) else: mask = torch.zeros(model.get_xyz.shape[0], dtype=torch.bool, device="cuda") xyz = model.get_xyz[None] dist2 = knn_points(xyz, xyz, K=nb_points+1, return_sorted=True).dists # 1, N, K dist2 = dist2[0, :, 1:] dist2 = torch.clamp_min(dist2, 0.0000001) dist = (torch.sqrt(dist2)).mean(-1) scaling = dist scales = torch.log(scaling)[...,None].repeat(1, 3) idx = torch.argmin(model.get_scaling, dim=-1) scales[torch.arange(scales.shape[0]), idx] = math.log(1e-7) model._scaling = nn.Parameter(scales.requires_grad_(True)) out = get_visi_list(model, viewpoint_stack, self.cfg.pipline, self.background) visi = out['visi'] valid = ~mask if denoise_after: model.prune_points(~visi) filted = model.filter_points() visi[visi.clone()] = filted valid[~mask] = visi del model return valid @torch.no_grad() def get_visi_mask_acc(self, n=500, up=False, around=True, sample_mode='grid', viewpoint_stack=None): if viewpoint_stack is None: if self.cfg.optim.densify_large.sample_cams.random: viewpoint_stack = self.sample_cameras(n, up, around, sample_mode=sample_mode) else: fullcam = self.scene.getTrainCameras().copy() idx = torch.randint(0, len(fullcam), (n,)) viewpoint_stack = [fullcam[i] for i in idx] out = get_visi_list(self.model, viewpoint_stack, self.cfg.pipline, self.background) visi = out['visi'] inside = self.model.get_inside_gaus_normalized()[0] valid = visi & inside return valid @torch.no_grad() def save_gaussians(self): print("\n[ITER {}] Saving Gaussians".format(self.current_iteration)) surfmask = None visi = None self.scene.save(self.current_iteration, visi=visi, surf=surfmask, save_splat=self.cfg.train.save_splat) if __name__ == 
"__main__": from configs.config import Config import sys sys.path.append(os.getcwd()) cfg_path = 'projects/gaussain_splatting/configs/base.yaml' cfg = Config(cfg_path) trainer = Trainer(cfg) trainer.get_center_scale() for thr in np.linspace(0.9, 1., 11): trainer.save_pts_thr(thr)