Repository: kxhit/vMAP Branch: master Commit: ac4aefe9fd90 Files: 31 Total size: 158.6 KB Directory structure: gitextract_se2unowv/ ├── .gitignore ├── LICENSE ├── README.md ├── cfg.py ├── configs/ │ ├── Replica/ │ │ ├── config_replica_room0_iMAP.json │ │ └── config_replica_room0_vMAP.json │ └── ScanNet/ │ ├── config_scannet0000_iMAP.json │ ├── config_scannet0000_vMAP.json │ ├── config_scannet0024_iMAP.json │ └── config_scannet0024_vMAP.json ├── data_generation/ │ ├── README.md │ ├── extract_inst_obj.py │ ├── habitat_renderer.py │ ├── replica_render_config_vMAP.yaml │ ├── settings.py │ └── transformation.py ├── dataset.py ├── embedding.py ├── environment.yml ├── image_transforms.py ├── loss.py ├── metric/ │ ├── eval_3D_obj.py │ ├── eval_3D_scene.py │ └── metrics.py ├── model.py ├── render_rays.py ├── train.py ├── trainer.py ├── utils.py ├── vis.py └── vmap.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ /.anaconda3/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ HierarchicalPriors.sublime-project HierarchicalPriors.sublime-workspace scripts/res1/ scripts/results/ scripts/traj.txt vids_ims res_datasets/ results/ ScenePriors/train/examples/experiments/ *.pkl *.idea ================================================ FILE: LICENSE ================================================ vMAP SOFTWARE LICENCE AGREEMENT WE (Imperial College of Science, Technology and Medicine, (“Imperial College London”)) ARE WILLING TO LICENSE THIS SOFTWARE TO YOU (a licensee “You”) ONLY ON THE CONDITION THAT YOU ACCEPT ALL OF THE TERMS CONTAINED IN THE FOLLOWING AGREEMENT. PLEASE READ THE AGREEMENT CAREFULLY BEFORE DOWNLOADING THE SOFTWARE. BY EXERCISING THE OPTION TO DOWNLOAD THE SOFTWARE YOU AGREE TO BE BOUND BY THE TERMS OF THE AGREEMENT. 
SOFTWARE LICENCE AGREEMENT (EXCLUDING BSD COMPONENTS) 1.This Agreement pertains to a worldwide, non-exclusive, temporary, fully paid-up, royalty free, non-transferable, non-sub- licensable licence (the “Licence”) to use the elastic fusion source code, including any modification, part or derivative (the “Software”). Ownership and Licence. Your rights to use and download the Software onto your computer, and all other copies that You are authorised to make, are specified in this Agreement. However, we (or our licensors) retain all rights, including but not limited to all copyright and other intellectual property rights anywhere in the world, in the Software not expressly granted to You in this Agreement. 2. Permitted use of the Licence: (a) You may download and install the Software onto one computer or server for use in accordance with Clause 2(b) of this Agreement provided that You ensure that the Software is not accessible by other users unless they have themselves accepted the terms of this licence agreement. (b) You may use the Software solely for non-commercial, internal or academic research purposes and only in accordance with the terms of this Agreement. You may not use the Software for commercial purposes, including but not limited to (1) integration of all or part of the source code or the Software into a product for sale or licence by or on behalf of You to third parties or (2) use of the Software or any derivative of it for research to develop software products for sale or licence to a third party or (3) use of the Software or any derivative of it for research to develop non-software products for sale or licence to a third party, or (4) use of the Software to provide any service to an external organisation for which payment is received. Should You wish to use the Software for commercial purposes, You shall email researchcontracts.engineering@imperial.ac.uk . (c) Right to Copy. You may copy the Software for back-up and archival purposes, provided that each copy is kept in your possession and provided You reproduce our copyright notice (set out in Schedule 1) on each copy. (d) Transfer and sub-licensing. You may not rent, lend, or lease the Software and You may not transmit, transfer or sub-license this licence to use the Software or any of your rights or obligations under this Agreement to another party. (e) Identity of Licensee. The licence granted herein is personal to You. You shall not permit any third party to access, modify or otherwise use the Software nor shall You access modify or otherwise use the Software on behalf of any third party. If You wish to obtain a licence for mutiple users or a site licence for the Software please contact us at researchcontracts.engineering@imperial.ac.uk . (f) Publications and presentations. You may make public, results or data obtained from, dependent on or arising from research carried out using the Software, provided that any such presentation or publication identifies the Software as the source of the results or the data, including the Copyright Notice given in each element of the Software, and stating that the Software has been made available for use by You under licence from Imperial College London and You provide a copy of any such publication to Imperial College London. 3. Prohibited Uses. 
You may not, without written permission from us at researchcontracts.engineering@imperial.ac.uk : (a) Use, copy, modify, merge, or transfer copies of the Software or any documentation provided by us which relates to the Software except as provided in this Agreement; (b) Use any back-up or archival copies of the Software (or allow anyone else to use such copies) for any purpose other than to replace the original copy in the event it is destroyed or becomes defective; or (c) Disassemble, decompile or "unlock", reverse translate, or in any manner decode the Software for any reason. 4. Warranty Disclaimer (a) Disclaimer. The Software has been developed for research purposes only. You acknowledge that we are providing the Software to You under this licence agreement free of charge and on condition that the disclaimer set out below shall apply. We do not represent or warrant that the Software as to: (i) the quality, accuracy or reliability of the Software; (ii) the suitability of the Software for any particular use or for use under any specific conditions; and (iii) whether use of the Software will infringe third-party rights. You acknowledge that You have reviewed and evaluated the Software to determine that it meets your needs and that You assume all responsibility and liability for determining the suitability of the Software as fit for your particular purposes and requirements. Subject to Clause 4(b), we exclude and expressly disclaim all express and implied representations, warranties, conditions and terms not stated herein (including the implied conditions or warranties of satisfactory quality, merchantable quality, merchantability and fitness for purpose). (b) Savings. Some jurisdictions may imply warranties, conditions or terms or impose obligations upon us which cannot, in whole or in part, be excluded, restricted or modified or otherwise do not allow the exclusion of implied warranties, conditions or terms, in which case the above warranty disclaimer and exclusion will only apply to You to the extent permitted in the relevant jurisdiction and does not in any event exclude any implied warranties, conditions or terms which may not under applicable law be excluded. (c) Imperial College London disclaims all responsibility for the use which is made of the Software and any liability for the outcomes arising from using the Software. 5. Limitation of Liability (a) You acknowledge that we are providing the Software to You under this licence agreement free of charge and on condition that the limitation of liability set out below shall apply. Accordingly, subject to Clause 5(b), we exclude all liability whether in contract, tort, negligence or otherwise, in respect of the Software and/or any related documentation provided to You by us including, but not limited to, liability for loss or corruption of data, loss of contracts, loss of income, loss of profits, loss of cover and any consequential or indirect loss or damage of any kind arising out of or in connection with this licence agreement, however caused. This exclusion shall apply even if we have been advised of the possibility of such loss or damage. (b) You agree to indemnify Imperial College London and hold it harmless from and against any and all claims, damages and liabilities asserted by third parties (including claims for negligence) which arise directly or indirectly from the use of the Software or any derivative of it or the sale of any products based on the Software. 
You undertake to make no liability claim against any employee, student, agent or appointee of Imperial College London, in connection with this Licence or the Software. (c) Nothing in this Agreement shall have the effect of excluding or limiting our statutory liability. (d) Some jurisdictions do not allow these limitations or exclusions either wholly or in part, and, to that extent, they may not apply to you. Nothing in this licence agreement will affect your statutory rights or other relevant statutory provisions which cannot be excluded, restricted or modified, and its terms and conditions must be read and construed subject to any such statutory rights and/or provisions. 6. Confidentiality. You agree not to disclose any confidential information provided to You by us pursuant to this Agreement to any third party without our prior written consent. The obligations in this Clause 6 shall survive the termination of this Agreement for any reason. 7. Termination. (a) We may terminate this licence agreement and your right to use the Software at any time with immediate effect upon written notice to You. (b) This licence agreement and your right to use the Software automatically terminate if You: (i) fail to comply with any provisions of this Agreement; or (ii) destroy the copies of the Software in your possession, or voluntarily return the Software to us. (c) Upon termination You will destroy all copies of the Software. (d) Otherwise, the restrictions on your rights to use the Software will expire 10 (ten) years after first use of the Software under this licence agreement. 8. Miscellaneous Provisions. (a) This Agreement will be governed by and construed in accordance with the substantive laws of England and Wales whose courts shall have exclusive jurisdiction over all disputes which may arise between us. (b) This is the entire agreement between us relating to the Software, and supersedes any prior purchase order, communications, advertising or representations concerning the Software. (c) No change or modification of this Agreement will be valid unless it is in writing, and is signed by us. (d) The unenforceability or invalidity of any part of this Agreement will not affect the enforceability or validity of the remaining parts. BSD Elements of the Software For BSD elements of the Software, the following terms shall apply: Copyright as indicated in the header of the individual element of the Software. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SCHEDULE 1 The Software vMAP is an object-level mapping system with each object represented by a small MLP, that can be efficiently vectorised trained. It is based on the techniques described in the following publication: • Xin Kong, Shikun Liu, Marwan Taher, Andrew J. Davison. vMAP: Vectorised Object Mapping for Neural Field SLAM. ArXiv Preprint, 2023 _________________________ Acknowledgments If you use the software, you should reference the following paper in any publication: • Xin Kong, Shikun Liu, Marwan Taher, Andrew J. Davison. vMAP: Vectorised Object Mapping for Neural Field SLAM. ArXiv Preprint, 2023 ================================================ FILE: README.md ================================================ [comment]: <> (# vMAP: Vectorised Object Mapping for Neural Field SLAM)

# vMAP: Vectorised Object Mapping for Neural Field SLAM

Xin Kong · Shikun Liu · Marwan Taher · Andrew J. Davison

[comment]: <> (PAPER)

Paper | Video | Project Page


vMAP builds an object-level map from a real-time RGB-D input stream. Each object is represented by a separate MLP neural field model, all optimised in parallel via vectorised training.
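To make the vectorised-training idea concrete, here is a minimal sketch of how the weights of many identical small object MLPs can be stacked and optimised with a single batched forward/backward pass using `torch.func` (the successor of functorch, which this work builds on). The class and variable names (`ObjectMLP`, `n_objects`, the random data) are illustrative only; the repository's own implementation differs in detail.

```python
# Minimal sketch of vectorised per-object MLP training with torch.func (PyTorch >= 2.0).
# ObjectMLP, n_objects and the random data are illustrative only, not the repo's classes.
import torch
from torch.func import stack_module_state, functional_call, vmap

class ObjectMLP(torch.nn.Module):
    def __init__(self, hidden=32):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(3, hidden), torch.nn.ReLU(),
            torch.nn.Linear(hidden, 4),        # e.g. colour + occupancy per 3D point
        )

    def forward(self, x):
        return self.net(x)

n_objects, n_pts = 8, 120                      # cf. "n_per_optim": 120 in the vMAP configs
models = [ObjectMLP() for _ in range(n_objects)]

# Stack the weights of all object models into single tensors with a leading object
# dimension, and make them optimizable leaf tensors.
params, buffers = stack_module_state(models)
params = {k: v.clone().detach().requires_grad_(True) for k, v in params.items()}

base = ObjectMLP().to("meta")                  # stateless "template" module

def fmodel(p, b, x):
    return functional_call(base, (p, b), (x,))

opt = torch.optim.Adam(params.values(), lr=1e-3)
pts = torch.rand(n_objects, n_pts, 3)          # per-object 3D sample points (dummy data)
target = torch.rand(n_objects, n_pts, 4)       # per-object supervision (dummy data)

for _ in range(10):
    out = vmap(fmodel)(params, buffers, pts)   # one batched pass over all object MLPs
    loss = torch.nn.functional.mse_loss(out, target)
    opt.zero_grad()
    loss.backward()
    opt.step()
```

Because every object shares the same MLP architecture, a single `vmap` call replaces a Python loop of per-object optimisation steps, which is what makes mapping many objects efficient.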


We provide the implementation of the following neural-field SLAM frameworks:

- **vMAP** [Official Implementation]
- **iMAP** [Simplified and Improved Re-Implementation, with depth-guided sampling]

## Install

First, create a virtual environment with the required dependencies.

```bash
conda env create -f environment.yml
```

## Dataset

Please download the following datasets to reproduce our results.

* [Replica Demo](https://huggingface.co/datasets/kxic/vMAP/resolve/main/demo_replica_room_0.zip) - Replica Room 0 only, for faster experimentation.
* [Replica](https://huggingface.co/datasets/kxic/vMAP/resolve/main/vmap.zip) - All pre-generated Replica sequences. For Replica data generation, please refer to the directory `data_generation`.
* [ScanNet](https://github.com/ScanNet/ScanNet) - Official ScanNet sequences.

Each dataset contains a sequence of RGB-D images together with their corresponding camera poses and object instance labels.

To extract data from ScanNet .sens files, run

```bash
conda activate py2
python2 reader.py --filename ~/data/ScanNet/scannet/scans/scene0024_00/scene0024_00.sens --output_path ~/data/ScanNet/objnerf/ --export_depth_images --export_color_images --export_poses --export_intrinsics
```

## Config

Then update the config files under `configs/` with your dataset paths, as well as other training hyper-parameters.

```json
"dataset": {
    "path": "path/to/ims/folder/",
}
```

## Running vMAP / iMAP

The following commands will run vMAP / iMAP in a single-thread setting.

#### vMAP

```bash
python ./train.py --config ./configs/Replica/config_replica_room0_vMAP.json --logdir ./logs/vMAP/room0 --save_ckpt True
```

#### iMAP

```bash
python ./train.py --config ./configs/Replica/config_replica_room0_iMAP.json --logdir ./logs/iMAP/room0 --save_ckpt True
```

[comment]: <> (#### Multi thread demo)
[comment]: <> (```bash)
[comment]: <> (./parallel_train.py --config "config_file.json" --logdir ./logs)
[comment]: <> (```)

## Evaluation

To evaluate the quality of the reconstructed scenes, we provide two different methods:

#### 3D Scene-level Evaluation

Following the original iMAP, we compare against the GT scene meshes using **Accuracy**, **Completion** and **Completion Ratio** (a rough sketch of these metrics is given below, after the Acknowledgement section).

```bash
python ./metric/eval_3D_scene.py
```

#### 3D Object-level Evaluation

We also provide object-level evaluation, which computes the same metrics but averages them across all objects in a scene.

```bash
python ./metric/eval_3D_obj.py
```

[comment]: <> (### Novel View Synthesis)
[comment]: <> (##### 2D Novel View Eval)
[comment]: <> (We rendered a new trajectory in each scene and randomly chose novel view poses from it, evaluating 2D rendering performance)
[comment]: <> (```bash)
[comment]: <> (./metric/eval_2D_view.py)
[comment]: <> (```)

## Results

We provide raw results of vMAP and iMAP*, including 3D meshes, 2D novel view renderings, and the evaluated metrics, for easier comparison.

* [Replica](https://huggingface.co/datasets/kxic/vMAP/resolve/main/vMAP_Replica_Results.zip)

## Acknowledgement

We would like to thank the following open-source repositories that we have built upon for the implementation of this work: [NICE-SLAM](https://github.com/cvg/nice-slam) and [functorch](https://github.com/pytorch/functorch).
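For reference, the scene-level **Accuracy**, **Completion** and **Completion Ratio** described above can be computed roughly as follows. This is a minimal sketch assuming points are sampled uniformly on both meshes and a 5 cm threshold; file paths, sample count and threshold are illustrative assumptions, and `metric/eval_3D_scene.py` remains the authoritative implementation.

```python
# Rough sketch of the Accuracy / Completion / Completion Ratio metrics between two meshes.
# File paths, the 200k sample count and the 5 cm threshold are illustrative assumptions;
# metric/eval_3D_scene.py is the repository's actual evaluation script.
import trimesh
from scipy.spatial import cKDTree

def eval_mesh(rec_mesh_file, gt_mesh_file, n_samples=200_000, threshold=0.05):
    rec_pts = trimesh.load(rec_mesh_file).sample(n_samples)  # points on reconstructed surface
    gt_pts = trimesh.load(gt_mesh_file).sample(n_samples)    # points on ground-truth surface

    # Accuracy: mean distance from reconstructed points to the closest GT point.
    dist_rec_to_gt, _ = cKDTree(gt_pts).query(rec_pts)
    accuracy = dist_rec_to_gt.mean()

    # Completion: mean distance from GT points to the closest reconstructed point.
    dist_gt_to_rec, _ = cKDTree(rec_pts).query(gt_pts)
    completion = dist_gt_to_rec.mean()

    # Completion ratio: fraction of GT points with a reconstructed point within the threshold.
    completion_ratio = (dist_gt_to_rec < threshold).mean()

    return accuracy, completion, completion_ratio
```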
## Citation If you found this code/work to be useful in your own research, please considering citing the following: ```bibtex @article{kong2023vmap, title={vMAP: Vectorised Object Mapping for Neural Field SLAM}, author={Kong, Xin and Liu, Shikun and Taher, Marwan and Davison, Andrew J}, journal={arXiv preprint arXiv:2302.01838}, year={2023} } ``` ```bibtex @inproceedings{sucar2021imap, title={iMAP: Implicit mapping and positioning in real-time}, author={Sucar, Edgar and Liu, Shikun and Ortiz, Joseph and Davison, Andrew J}, booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, pages={6229--6238}, year={2021} } ``` ================================================ FILE: cfg.py ================================================ import json import numpy as np import os import utils class Config: def __init__(self, config_file): # setting params with open(config_file) as json_file: config = json.load(json_file) # training strategy self.do_bg = bool(config["trainer"]["do_bg"]) self.training_device = config["trainer"]["train_device"] self.data_device = config["trainer"]["data_device"] self.max_n_models = config["trainer"]["n_models"] self.live_mode = bool(config["dataset"]["live"]) self.keep_live_time = config["dataset"]["keep_alive"] self.imap_mode = config["trainer"]["imap_mode"] self.training_strategy = config["trainer"]["training_strategy"] # "forloop" "vmap" self.obj_id = -1 # dataset setting self.dataset_format = config["dataset"]["format"] self.dataset_dir = config["dataset"]["path"] self.depth_scale = 1 / config["trainer"]["scale"] # camera setting self.max_depth = config["render"]["depth_range"][1] self.min_depth = config["render"]["depth_range"][0] self.mh = config["camera"]["mh"] self.mw = config["camera"]["mw"] self.height = config["camera"]["h"] self.width = config["camera"]["w"] self.H = self.height - 2 * self.mh self.W = self.width - 2 * self.mw if "fx" in config["camera"]: self.fx = config["camera"]["fx"] self.fy = config["camera"]["fy"] self.cx = config["camera"]["cx"] - self.mw self.cy = config["camera"]["cy"] - self.mh else: # for scannet intrinsic = utils.load_matrix_from_txt(os.path.join(self.dataset_dir, "intrinsic/intrinsic_depth.txt")) self.fx = intrinsic[0, 0] self.fy = intrinsic[1, 1] self.cx = intrinsic[0, 2] - self.mw self.cy = intrinsic[1, 2] - self.mh if "distortion" in config["camera"]: self.distortion_array = np.array(config["camera"]["distortion"]) elif "k1" in config["camera"]: k1 = config["camera"]["k1"] k2 = config["camera"]["k2"] k3 = config["camera"]["k3"] k4 = config["camera"]["k4"] k5 = config["camera"]["k5"] k6 = config["camera"]["k6"] p1 = config["camera"]["p1"] p2 = config["camera"]["p2"] self.distortion_array = np.array([k1, k2, p1, p2, k3, k4, k5, k6]) else: self.distortion_array = None # training setting self.win_size = config["model"]["window_size"] self.n_iter_per_frame = config["render"]["iters_per_frame"] self.n_per_optim = config["render"]["n_per_optim"] self.n_samples_per_frame = self.n_per_optim // self.win_size self.win_size_bg = config["model"]["window_size_bg"] self.n_per_optim_bg = config["render"]["n_per_optim_bg"] self.n_samples_per_frame_bg = self.n_per_optim_bg // self.win_size_bg self.keyframe_buffer_size = config["model"]["keyframe_buffer_size"] self.keyframe_step = config["model"]["keyframe_step"] self.keyframe_step_bg = config["model"]["keyframe_step_bg"] self.obj_scale = config["model"]["obj_scale"] self.bg_scale = config["model"]["bg_scale"] self.hidden_feature_size = 
config["model"]["hidden_feature_size"] self.hidden_feature_size_bg = config["model"]["hidden_feature_size_bg"] self.n_bins_cam2surface = config["render"]["n_bins_cam2surface"] self.n_bins_cam2surface_bg = config["render"]["n_bins_cam2surface_bg"] self.n_bins = config["render"]["n_bins"] self.n_unidir_funcs = config["model"]["n_unidir_funcs"] self.surface_eps = config["model"]["surface_eps"] self.stop_eps = config["model"]["other_eps"] # optimizer setting self.learning_rate = config["optimizer"]["args"]["lr"] self.weight_decay = config["optimizer"]["args"]["weight_decay"] # vis setting self.vis_device = config["vis"]["vis_device"] self.n_vis_iter = config["vis"]["n_vis_iter"] self.live_voxel_size = config["vis"]["live_voxel_size"] self.grid_dim = config["vis"]["grid_dim"] ================================================ FILE: configs/Replica/config_replica_room0_iMAP.json ================================================ { "dataset": { "live": 0, "path": "/home/xin/data/Replica/vmap/room_0/imap/00", "format": "Replica", "keep_alive": 20 }, "optimizer": { "args":{ "lr": 0.001, "weight_decay": 0.013, "pose_lr": 0.001 } }, "trainer": { "imap_mode": 1, "do_bg": 0, "n_models": 1, "train_device": "cuda:0", "data_device": "cuda:0", "training_strategy": "vmap", "epochs": 1000000, "scale": 1000.0 }, "render": { "depth_range": [0.0, 8.0], "n_bins": 9, "n_bins_cam2surface": 5, "n_bins_cam2surface_bg": 5, "iters_per_frame": 20, "n_per_optim": 4800, "n_per_optim_bg": 1200 }, "model": { "n_unidir_funcs": 5, "obj_scale": 5.0, "bg_scale": 5.0, "color_scaling": 5.0, "opacity_scaling": 10.0, "gt_scene": 1, "surface_eps": 0.1, "other_eps": 0.05, "keyframe_buffer_size": 20, "keyframe_step": 50, "keyframe_step_bg": 50, "window_size": 5, "window_size_bg": 10, "hidden_layers_block": 1, "hidden_feature_size": 256, "hidden_feature_size_bg": 128 }, "camera": { "w": 1200, "h": 680, "fx": 600.0, "fy": 600.0, "cx": 599.5, "cy": 339.5, "mw": 0, "mh": 0 }, "vis": { "vis_device": "cuda:0", "n_vis_iter": 500, "n_bins_fine_vis": 10, "im_vis_reduce": 10, "grid_dim": 256, "live_vis": 1, "live_voxel_size": 0.005 } } ================================================ FILE: configs/Replica/config_replica_room0_vMAP.json ================================================ { "dataset": { "live": 0, "path": "/home/xin/data/Replica/vmap/room_0/imap/00", "format": "Replica", "keep_alive": 20 }, "optimizer": { "args":{ "lr": 0.001, "weight_decay": 0.013, "pose_lr": 0.001 } }, "trainer": { "imap_mode": 0, "do_bg": 1, "n_models": 100, "train_device": "cuda:0", "data_device": "cuda:0", "training_strategy": "vmap", "epochs": 1000000, "scale": 1000.0 }, "render": { "depth_range": [0.0, 8.0], "n_bins": 9, "n_bins_cam2surface": 1, "n_bins_cam2surface_bg": 5, "iters_per_frame": 20, "n_per_optim": 120, "n_per_optim_bg": 1200 }, "model": { "n_unidir_funcs": 5, "obj_scale": 2.0, "bg_scale": 5.0, "color_scaling": 5.0, "opacity_scaling": 10.0, "gt_scene": 1, "surface_eps": 0.1, "other_eps": 0.05, "keyframe_buffer_size": 20, "keyframe_step": 25, "keyframe_step_bg": 50, "window_size": 5, "window_size_bg": 10, "hidden_layers_block": 1, "hidden_feature_size": 32, "hidden_feature_size_bg": 128 }, "camera": { "w": 1200, "h": 680, "fx": 600.0, "fy": 600.0, "cx": 599.5, "cy": 339.5, "mw": 0, "mh": 0 }, "vis": { "vis_device": "cuda:0", "n_vis_iter": 500, "n_bins_fine_vis": 10, "im_vis_reduce": 10, "grid_dim": 256, "live_vis": 1, "live_voxel_size": 0.005 } } ================================================ FILE: configs/ScanNet/config_scannet0000_iMAP.json 
================================================ { "dataset": { "live": 0, "path": "/home/xin/data/ScanNet/NICESLAM/scene0000_00", "format": "ScanNet", "keep_alive": 20 }, "optimizer": { "args":{ "lr": 0.001, "weight_decay": 0.013, "pose_lr": 0.001 } }, "trainer": { "imap_mode": 1, "do_bg": 0, "n_models": 1, "train_device": "cuda:0", "data_device": "cuda:0", "training_strategy": "vmap", "epochs": 1000000, "scale": 1000.0 }, "render": { "depth_range": [0.0, 6.0], "n_bins": 9, "n_bins_cam2surface": 5, "n_bins_cam2surface_bg": 5, "iters_per_frame": 20, "n_per_optim": 2400, "n_per_optim_bg": 1200 }, "model": { "n_unidir_funcs": 5, "obj_scale": 3.0, "bg_scale": 15.0, "color_scaling": 5.0, "opacity_scaling": 10.0, "gt_scene": 1, "surface_eps": 0.1, "other_eps": 0.05, "keyframe_buffer_size": 20, "keyframe_step": 50, "keyframe_step_bg": 50, "window_size": 5, "window_size_bg": 10, "hidden_layers_block": 1, "hidden_feature_size": 256, "hidden_feature_size_bg": 128 }, "camera": { "w": 640, "h": 480, "mw": 10, "mh": 10 }, "vis": { "vis_device": "cuda:0", "n_vis_iter": 500, "n_bins_fine_vis": 10, "im_vis_reduce": 10, "grid_dim": 256, "live_vis": 1, "live_voxel_size": 0.005 } } ================================================ FILE: configs/ScanNet/config_scannet0000_vMAP.json ================================================ { "dataset": { "live": 0, "path": "/home/xin/data/ScanNet/NICESLAM/scene0000_00", "format": "ScanNet", "keep_alive": 20 }, "optimizer": { "args":{ "lr": 0.001, "weight_decay": 0.013, "pose_lr": 0.001 } }, "trainer": { "imap_mode": 0, "do_bg": 1, "n_models": 100, "train_device": "cuda:0", "data_device": "cuda:0", "training_strategy": "vmap", "epochs": 1000000, "scale": 1000.0 }, "render": { "depth_range": [0.0, 6.0], "n_bins": 9, "n_bins_cam2surface": 1, "n_bins_cam2surface_bg": 5, "iters_per_frame": 20, "n_per_optim": 120, "n_per_optim_bg": 1200 }, "model": { "n_unidir_funcs": 5, "obj_scale": 3.0, "bg_scale": 10.0, "color_scaling": 5.0, "opacity_scaling": 10.0, "gt_scene": 1, "surface_eps": 0.1, "other_eps": 0.05, "keyframe_buffer_size": 20, "keyframe_step": 25, "keyframe_step_bg": 50, "window_size": 5, "window_size_bg": 10, "hidden_layers_block": 1, "hidden_feature_size": 32, "hidden_feature_size_bg": 128 }, "camera": { "w": 640, "h": 480, "mw": 10, "mh": 10 }, "vis": { "vis_device": "cuda:0", "n_vis_iter": 10000000, "n_bins_fine_vis": 10, "im_vis_reduce": 10, "grid_dim": 256, "live_vis": 1, "live_voxel_size": 0.005 } } ================================================ FILE: configs/ScanNet/config_scannet0024_iMAP.json ================================================ { "dataset": { "live": 0, "path": "/home/xin/data/ScanNet/obj-imap/scene0024_00", "format": "ScanNet", "keep_alive": 20 }, "optimizer": { "args":{ "lr": 0.001, "weight_decay": 0.013, "pose_lr": 0.001 } }, "trainer": { "imap_mode": 1, "do_bg": 0, "n_models": 1, "train_device": "cuda:0", "data_device": "cuda:0", "training_strategy": "vmap", "epochs": 1000000, "scale": 1000.0 }, "render": { "depth_range": [0.0, 6.0], "n_bins": 9, "n_bins_cam2surface": 5, "n_bins_cam2surface_bg": 5, "iters_per_frame": 20, "n_per_optim": 2400, "n_per_optim_bg": 1200 }, "model": { "n_unidir_funcs": 5, "obj_scale": 2.0, "bg_scale": 5.0, "color_scaling": 5.0, "opacity_scaling": 10.0, "gt_scene": 1, "surface_eps": 0.1, "other_eps": 0.05, "keyframe_buffer_size": 20, "keyframe_step": 50, "keyframe_step_bg": 50, "window_size": 5, "window_size_bg": 10, "hidden_layers_block": 1, "hidden_feature_size": 256, "hidden_feature_size_bg": 128 }, "camera": { 
"w": 640, "h": 480, "mw": 10, "mh": 10 }, "vis": { "vis_device": "cuda:0", "n_vis_iter": 500, "n_bins_fine_vis": 10, "im_vis_reduce": 10, "grid_dim": 256, "live_vis": 1, "live_voxel_size": 0.005 } } ================================================ FILE: configs/ScanNet/config_scannet0024_vMAP.json ================================================ { "dataset": { "live": 0, "path": "/home/xin/data/ScanNet/obj-imap/scene0024_00", "format": "ScanNet", "keep_alive": 20 }, "optimizer": { "args":{ "lr": 0.001, "weight_decay": 0.013, "pose_lr": 0.001 } }, "trainer": { "imap_mode": 0, "do_bg": 1, "n_models": 100, "train_device": "cuda:0", "data_device": "cuda:0", "training_strategy": "vmap", "epochs": 1000000, "scale": 1000.0 }, "render": { "depth_range": [0.0, 6.0], "n_bins": 9, "n_bins_cam2surface": 1, "n_bins_cam2surface_bg": 5, "iters_per_frame": 20, "n_per_optim": 120, "n_per_optim_bg": 1200 }, "model": { "n_unidir_funcs": 5, "obj_scale": 3.0, "bg_scale": 10.0, "color_scaling": 5.0, "opacity_scaling": 10.0, "gt_scene": 1, "surface_eps": 0.1, "other_eps": 0.05, "keyframe_buffer_size": 20, "keyframe_step": 25, "keyframe_step_bg": 50, "window_size": 5, "window_size_bg": 10, "hidden_layers_block": 1, "hidden_feature_size": 32, "hidden_feature_size_bg": 128 }, "camera": { "w": 640, "h": 480, "mw": 10, "mh": 10 }, "vis": { "vis_device": "cuda:0", "n_vis_iter": 10000000, "n_bins_fine_vis": 10, "im_vis_reduce": 10, "grid_dim": 256, "live_vis": 1, "live_voxel_size": 0.005 } } ================================================ FILE: data_generation/README.md ================================================ ## Replica Data Generation ### Download Replica Dataset Download 3D models and info files from [Replica](https://github.com/facebookresearch/Replica-Dataset) ### 3D Object Mesh Extraction Change the input path in `./data_generation/extract_inst_obj.py` and run ```angular2html python ./data_generation/extract_inst_obj.py ``` ### Camera Trajectory Generation Please refer to [Semantic-NeRF](https://github.com/Harry-Zhi/semantic_nerf/issues/25#issuecomment-1340595427) for more details. The random trajectory generation only works for single room scene. For multiple rooms scene, collision checking is needed. Welcome contributions. ### Rendering 2D Images Given camera trajectory t_wc (change pose_file in configs), we use [Habitat-Sim](https://github.com/facebookresearch/habitat-sim) to render RGB, Depth, Semantic and Instance images. #### Install Habitat-Sim 0.2.1 We recommend to use conda to install habitat-sim 0.2.1. ```angular2html conda create -n habitat python=3.8.12 cmake=3.14.0 conda activate habitat conda install habitat-sim=0.2.1 withbullet -c conda-forge -c aihabitat conda install numba=0.54.1 ``` #### Run rendering with configs ```angular2html python ./data_generation/habitat_renderer.py --config ./data_generation/replica_render_config_vMAP.yaml ``` Note that to get HDR img, use mesh.ply not semantic_mesh.ply. Change path in configs. And copy rgb folder to replace previous high exposure rgb. 
```angular2html python ./data_generation/habitat_renderer.py --config ./data_generation/replica_render_config_vMAP.yaml ``` ================================================ FILE: data_generation/extract_inst_obj.py ================================================ # reference https://github.com/facebookresearch/Replica-Dataset/issues/17#issuecomment-538757418 from plyfile import * import numpy as np import trimesh # path_in = 'path/to/mesh_semantic.ply' path_in = '/home/xin/data/vmap/room_0_debug/habitat/mesh_semantic.ply' print("Reading input...") mesh = trimesh.load(path_in) # mesh.show() file_in = PlyData.read(path_in) vertices_in = file_in.elements[0] faces_in = file_in.elements[1] print("Filtering data...") objects = {} sub_mesh_indices = {} for i, f in enumerate(faces_in): object_id = f[1] if not object_id in objects: objects[object_id] = [] sub_mesh_indices[object_id] = [] objects[object_id].append((f[0],)) sub_mesh_indices[object_id].append(i) sub_mesh_indices[object_id].append(i+faces_in.data.shape[0]) print("Writing data...") for object_id, faces in objects.items(): path_out = path_in + f"_{object_id}.ply" # print("sub_mesh_indices[object_id] ", sub_mesh_indices[object_id]) obj_mesh = mesh.submesh([sub_mesh_indices[object_id]], append=True) in_n = len(sub_mesh_indices[object_id]) out_n = obj_mesh.faces.shape[0] # print("obj id ", object_id) # print("in_n ", in_n) # print("out_n ", out_n) # print("faces ", len(faces)) # assert in_n == out_n obj_mesh.export(path_out) # faces_out = PlyElement.describe(np.array(faces, dtype=[('vertex_indices', 'O')]), 'face') # print("faces out ", len(PlyData([vertices_in, faces_out]).elements[1].data)) # PlyData([vertices_in, faces_out]).write(path_out+"_cmp.ply") ================================================ FILE: data_generation/habitat_renderer.py ================================================ #!/usr/bin/env python3 import os, sys, argparse os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = "0" import cv2 import logging import habitat_sim as hs import numpy as np import quaternion import yaml import json from typing import Any, Dict, List, Tuple, Union from imgviz import label_colormap from PIL import Image import matplotlib.pyplot as plt import transformation import imgviz from datetime import datetime import time from settings import make_cfg # Custom type definitions Config = Dict[str, Any] Observation = hs.sensor.Observation Sim = hs.Simulator def init_habitat(config) : """Initialize the Habitat simulator with sensors and scene file""" _cfg = make_cfg(config) sim = Sim(_cfg) sim_cfg = hs.SimulatorConfiguration() sim_cfg.gpu_device_id = 0 # Note: all sensors must have the same resolution camera_resolution = [config["height"], config["width"]] sensors = { "color_sensor": { "sensor_type": hs.SensorType.COLOR, "resolution": camera_resolution, "position": [0.0, config["sensor_height"], 0.0], }, "depth_sensor": { "sensor_type": hs.SensorType.DEPTH, "resolution": camera_resolution, "position": [0.0, config["sensor_height"], 0.0], }, "semantic_sensor": { "sensor_type": hs.SensorType.SEMANTIC, "resolution": camera_resolution, "position": [0.0, config["sensor_height"], 0.0], }, } sensor_specs = [] for sensor_uuid, sensor_params in sensors.items(): if config[sensor_uuid]: sensor_spec = hs.SensorSpec() sensor_spec.uuid = sensor_uuid sensor_spec.sensor_type = sensor_params["sensor_type"] sensor_spec.resolution = sensor_params["resolution"] sensor_spec.position = sensor_params["position"] 
sensor_specs.append(sensor_spec) # Here you can specify the amount of displacement in a forward action and the turn angle agent_cfg = hs.agent.AgentConfiguration() agent_cfg.sensor_specifications = sensor_specs agent_cfg.action_space = { "move_forward": hs.agent.ActionSpec( "move_forward", hs.agent.ActuationSpec(amount=0.25) ), "turn_left": hs.agent.ActionSpec( "turn_left", hs.agent.ActuationSpec(amount=30.0) ), "turn_right": hs.agent.ActionSpec( "turn_right", hs.agent.ActuationSpec(amount=30.0) ), } hs_cfg = hs.Configuration(sim_cfg, [agent_cfg]) # sim = Sim(hs_cfg) if config["enable_semantics"]: # extract instance to class mapping function assert os.path.exists(config["instance2class_mapping"]) with open(config["instance2class_mapping"], "r") as f: annotations = json.load(f) instance_id_to_semantic_label_id = np.array(annotations["id_to_label"]) num_classes = len(annotations["classes"]) label_colour_map = label_colormap() config["instance2semantic"] = instance_id_to_semantic_label_id config["classes"] = annotations["classes"] config["objects"] = annotations["objects"] config["num_classes"] = num_classes config["label_colour_map"] = label_colormap() config["instance_colour_map"] = label_colormap(500) # add camera intrinsic # hfov = float(agent_cfg.sensor_specifications[0].parameters['hfov']) * np.pi / 180. # https://aihabitat.org/docs/habitat-api/view-transform-warp.html # config['K'] = K # config['K'] = np.array([[fx, 0.0, 0.0], [0.0, fx, 0.0], [0.0, 0.0, 1.0]], # dtype=np.float64) # hfov = float(agent_cfg.sensor_specifications[0].parameters['hfov']) # fx = 1.0 / np.tan(hfov / 2.0) # config['K'] = np.array([[fx, 0.0, 0.0], [0.0, fx, 0.0], [0.0, 0.0, 1.0]], # dtype=np.float64) # Get the intrinsic camera parameters logging.info('Habitat simulator initialized') return sim, hs_cfg, config def save_renders(save_path, observation, enable_semantic, suffix=""): save_path_rgb = os.path.join(save_path, "rgb") save_path_depth = os.path.join(save_path, "depth") save_path_sem_class = os.path.join(save_path, "semantic_class") save_path_sem_instance = os.path.join(save_path, "semantic_instance") if not os.path.exists(save_path_rgb): os.makedirs(save_path_rgb) if not os.path.exists(save_path_depth): os.makedirs(save_path_depth) if not os.path.exists(save_path_sem_class): os.makedirs(save_path_sem_class) if not os.path.exists(save_path_sem_instance): os.makedirs(save_path_sem_instance) cv2.imwrite(os.path.join(save_path_rgb, "rgb{}.png".format(suffix)), observation["color_sensor"][:,:,::-1]) # change from RGB to BGR for opencv write cv2.imwrite(os.path.join(save_path_depth, "depth{}.png".format(suffix)), observation["depth_sensor_mm"]) if enable_semantic: cv2.imwrite(os.path.join(save_path_sem_class, "semantic_class{}.png".format(suffix)), observation["semantic_class"]) cv2.imwrite(os.path.join(save_path_sem_class, "vis_sem_class{}.png".format(suffix)), observation["vis_sem_class"][:,:,::-1]) cv2.imwrite(os.path.join(save_path_sem_instance, "semantic_instance{}.png".format(suffix)), observation["semantic_instance"]) cv2.imwrite(os.path.join(save_path_sem_instance, "vis_sem_instance{}.png".format(suffix)), observation["vis_sem_instance"][:,:,::-1]) def render(sim, config): """Return the sensor observations and ground truth pose""" observation = sim.get_sensor_observations() # process rgb imagem change from RGBA to RGB observation['color_sensor'] = observation['color_sensor'][..., 0:3] rgb_img = observation['color_sensor'] # process depth depth_mm = 
(observation['depth_sensor'].copy()*1000).astype(np.uint16) # change meters to mm observation['depth_sensor_mm'] = depth_mm # process semantics if config['enable_semantics']: # Assuming the scene has no more than 65534 objects observation['semantic_instance'] = np.clip(observation['semantic_sensor'].astype(np.uint16), 0, 65535) # observation['semantic_instance'][observation['semantic_instance']==12]=0 # mask out certain instance # Convert instance IDs to class IDs # observation['semantic_classes'] = np.zeros(observation['semantic'].shape, dtype=np.uint8) # TODO make this conversion more efficient semantic_class = config["instance2semantic"][observation['semantic_instance']] semantic_class[semantic_class < 0] = 0 vis_sem_class = config["label_colour_map"][semantic_class] vis_sem_instance = config["instance_colour_map"][observation['semantic_instance']] # may cause error when having more than 255 instances in the scene observation['semantic_class'] = semantic_class.astype(np.uint8) observation["vis_sem_class"] = vis_sem_class.astype(np.uint8) observation["vis_sem_instance"] = vis_sem_instance.astype(np.uint8) # del observation["semantic_sensor"] # Get the camera ground truth pose (T_HC) in the habitat frame from the # position and orientation t_HC = sim.get_agent(0).get_state().position q_HC = sim.get_agent(0).get_state().rotation T_HC = transformation.combine_pose(t_HC, q_HC) observation['T_HC'] = T_HC observation['T_WC'] = transformation.Thc_to_Twc(T_HC) return observation def set_agent_position(sim, pose): # Move the agent R = pose[:3, :3] orientation_quat = quaternion.from_rotation_matrix(R) t = pose[:3, 3] position = t orientation = [orientation_quat.x, orientation_quat.y, orientation_quat.z, orientation_quat.w] agent = sim.get_agent(0) agent_state = hs.agent.AgentState(position, orientation) # agent.set_state(agent_state, reset_sensors=False) agent.set_state(agent_state) def main(): parser = argparse.ArgumentParser(description='Render Colour, Depth, Semantic, Instance labeling from Habitat-Simultation.') parser.add_argument('--config_file', type=str, default="./data_generation/replica_render_config_vMAP.yaml", help='the path to custom config file.') args = parser.parse_args() """Initialize the config dict and Habitat simulator""" # Read YAML file with open(args.config_file, 'r') as f: config = yaml.safe_load(f) config["save_path"] = os.path.join(config["save_path"]) if not os.path.exists(config["save_path"]): os.makedirs(config["save_path"]) T_wc = np.loadtxt(config["pose_file"]).reshape(-1, 4, 4) Ts_cam2world = T_wc print("-----Initialise and Set Habitat-Sim-----") sim, hs_cfg, config = init_habitat(config) # Set agent state sim.initialize_agent(config["default_agent"]) """Set agent state""" print("-----Render Images from Habitat-Sim-----") with open(os.path.join(config["save_path"], 'render_config.yaml'), 'w') as outfile: yaml.dump(config, outfile, default_flow_style=False) start_time = time.time() total_render_num = Ts_cam2world.shape[0] for i in range(total_render_num): if i % 100 == 0 : print("Rendering Process: {}/{}".format(i, total_render_num)) set_agent_position(sim, transformation.Twc_to_Thc(Ts_cam2world[i])) # replica mode observation = render(sim, config) save_renders(config["save_path"], observation, config["enable_semantics"], suffix="_{}".format(i)) end_time = time.time() print("-----Finish Habitat Rendering, Showing Trajectories.-----") print("Average rendering time per image is {} seconds.".format((end_time-start_time)/Ts_cam2world.shape[0])) if __name__ == "__main__": 
main() ================================================ FILE: data_generation/replica_render_config_vMAP.yaml ================================================ # Agent settings default_agent: 0 gpu_id: 0 width: 1200 #1280 height: 680 #960 sensor_height: 0 color_sensor: true # RGB sensor semantic_sensor: true # Semantic sensor depth_sensor: true # Depth sensor enable_semantics: true # room_0 scene_file: "/home/xin/data/vmap/room_0/habitat/mesh_semantic.ply" instance2class_mapping: "/home/xin/data/vmap/room_0/habitat/info_semantic.json" save_path: "/home/xin/data/vmap/room_0/vmap/00/" pose_file: "/home/xin/data/vmap/room_0/vmap/00/traj_w_c.txt" ## HDR texture ## issue https://github.com/facebookresearch/Replica-Dataset/issues/41#issuecomment-566251467 #scene_file: "/home/xin/data/vmap/room_0/mesh.ply" #instance2class_mapping: "/home/xin/data/vmap/room_0/habitat/info_semantic.json" #save_path: "/home/xin/data/vmap/room_0/vmap/00/" #pose_file: "/home/xin/data/vmap/room_0/vmap/00/traj_w_c.txt" ================================================ FILE: data_generation/settings.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import habitat_sim import habitat_sim.agent default_sim_settings = { # settings shared by example.py and benchmark.py "max_frames": 1000, "width": 640, "height": 480, "default_agent": 0, "sensor_height": 1.5, "hfov": 90, "color_sensor": True, # RGB sensor (default: ON) "semantic_sensor": False, # semantic sensor (default: OFF) "depth_sensor": False, # depth sensor (default: OFF) "ortho_rgba_sensor": False, # Orthographic RGB sensor (default: OFF) "ortho_depth_sensor": False, # Orthographic depth sensor (default: OFF) "ortho_semantic_sensor": False, # Orthographic semantic sensor (default: OFF) "fisheye_rgba_sensor": False, "fisheye_depth_sensor": False, "fisheye_semantic_sensor": False, "equirect_rgba_sensor": False, "equirect_depth_sensor": False, "equirect_semantic_sensor": False, "seed": 1, "silent": False, # do not print log info (default: OFF) # settings exclusive to example.py "save_png": False, # save the pngs to disk (default: OFF) "print_semantic_scene": False, "print_semantic_mask_stats": False, "compute_shortest_path": False, "compute_action_shortest_path": False, "scene": "data/scene_datasets/habitat-test-scenes/skokloster-castle.glb", "test_scene_data_url": "http://dl.fbaipublicfiles.com/habitat/habitat-test-scenes.zip", "goal_position": [5.047, 0.199, 11.145], "enable_physics": False, "enable_gfx_replay_save": False, "physics_config_file": "./data/default.physics_config.json", "num_objects": 10, "test_object_index": 0, "frustum_culling": True, } # build SimulatorConfiguration def make_cfg(settings): sim_cfg = habitat_sim.SimulatorConfiguration() if "frustum_culling" in settings: sim_cfg.frustum_culling = settings["frustum_culling"] else: sim_cfg.frustum_culling = False if "enable_physics" in settings: sim_cfg.enable_physics = settings["enable_physics"] if "physics_config_file" in settings: sim_cfg.physics_config_file = settings["physics_config_file"] # if not settings["silent"]: # print("sim_cfg.physics_config_file = " + sim_cfg.physics_config_file) if "scene_light_setup" in settings: sim_cfg.scene_light_setup = settings["scene_light_setup"] sim_cfg.gpu_device_id = 0 if not hasattr(sim_cfg, "scene_id"): raise RuntimeError( "Error: Please upgrade habitat-sim. 
SimulatorConfig API version mismatch" ) sim_cfg.scene_id = settings["scene_file"] # define default sensor parameters (see src/esp/Sensor/Sensor.h) sensor_specs = [] def create_camera_spec(**kw_args): camera_sensor_spec = habitat_sim.CameraSensorSpec() camera_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR camera_sensor_spec.resolution = [settings["height"], settings["width"]] camera_sensor_spec.position = [0, settings["sensor_height"], 0] for k in kw_args: setattr(camera_sensor_spec, k, kw_args[k]) return camera_sensor_spec if settings["color_sensor"]: color_sensor_spec = create_camera_spec( uuid="color_sensor", # hfov=settings["hfov"], sensor_type=habitat_sim.SensorType.COLOR, sensor_subtype=habitat_sim.SensorSubType.PINHOLE, ) sensor_specs.append(color_sensor_spec) if settings["depth_sensor"]: depth_sensor_spec = create_camera_spec( uuid="depth_sensor", # hfov=settings["hfov"], sensor_type=habitat_sim.SensorType.DEPTH, channels=1, sensor_subtype=habitat_sim.SensorSubType.PINHOLE, ) sensor_specs.append(depth_sensor_spec) if settings["semantic_sensor"]: semantic_sensor_spec = create_camera_spec( uuid="semantic_sensor", # hfov=settings["hfov"], sensor_type=habitat_sim.SensorType.SEMANTIC, channels=1, sensor_subtype=habitat_sim.SensorSubType.PINHOLE, ) sensor_specs.append(semantic_sensor_spec) # if settings["ortho_rgba_sensor"]: # ortho_rgba_sensor_spec = create_camera_spec( # uuid="ortho_rgba_sensor", # sensor_type=habitat_sim.SensorType.COLOR, # sensor_subtype=habitat_sim.SensorSubType.ORTHOGRAPHIC, # ) # sensor_specs.append(ortho_rgba_sensor_spec) # # if settings["ortho_depth_sensor"]: # ortho_depth_sensor_spec = create_camera_spec( # uuid="ortho_depth_sensor", # sensor_type=habitat_sim.SensorType.DEPTH, # channels=1, # sensor_subtype=habitat_sim.SensorSubType.ORTHOGRAPHIC, # ) # sensor_specs.append(ortho_depth_sensor_spec) # # if settings["ortho_semantic_sensor"]: # ortho_semantic_sensor_spec = create_camera_spec( # uuid="ortho_semantic_sensor", # sensor_type=habitat_sim.SensorType.SEMANTIC, # channels=1, # sensor_subtype=habitat_sim.SensorSubType.ORTHOGRAPHIC, # ) # sensor_specs.append(ortho_semantic_sensor_spec) # TODO Figure out how to implement copying of specs def create_fisheye_spec(**kw_args): fisheye_sensor_spec = habitat_sim.FisheyeSensorDoubleSphereSpec() fisheye_sensor_spec.uuid = "fisheye_sensor" fisheye_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR fisheye_sensor_spec.sensor_model_type = ( habitat_sim.FisheyeSensorModelType.DOUBLE_SPHERE ) # The default value (alpha, xi) is set to match the lens "GoPro" found in Table 3 of this paper: # Vladyslav Usenko, Nikolaus Demmel and Daniel Cremers: The Double Sphere # Camera Model, The International Conference on 3D Vision (3DV), 2018 # You can find the intrinsic parameters for the other lenses in the same table as well. 
fisheye_sensor_spec.xi = -0.27 fisheye_sensor_spec.alpha = 0.57 fisheye_sensor_spec.focal_length = [364.84, 364.86] fisheye_sensor_spec.resolution = [settings["height"], settings["width"]] # The default principal_point_offset is the middle of the image fisheye_sensor_spec.principal_point_offset = None # default: fisheye_sensor_spec.principal_point_offset = [i/2 for i in fisheye_sensor_spec.resolution] fisheye_sensor_spec.position = [0, settings["sensor_height"], 0] for k in kw_args: setattr(fisheye_sensor_spec, k, kw_args[k]) return fisheye_sensor_spec # if settings["fisheye_rgba_sensor"]: # fisheye_rgba_sensor_spec = create_fisheye_spec(uuid="fisheye_rgba_sensor") # sensor_specs.append(fisheye_rgba_sensor_spec) # if settings["fisheye_depth_sensor"]: # fisheye_depth_sensor_spec = create_fisheye_spec( # uuid="fisheye_depth_sensor", # sensor_type=habitat_sim.SensorType.DEPTH, # channels=1, # ) # sensor_specs.append(fisheye_depth_sensor_spec) # if settings["fisheye_semantic_sensor"]: # fisheye_semantic_sensor_spec = create_fisheye_spec( # uuid="fisheye_semantic_sensor", # sensor_type=habitat_sim.SensorType.SEMANTIC, # channels=1, # ) # sensor_specs.append(fisheye_semantic_sensor_spec) def create_equirect_spec(**kw_args): equirect_sensor_spec = habitat_sim.EquirectangularSensorSpec() equirect_sensor_spec.uuid = "equirect_rgba_sensor" equirect_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR equirect_sensor_spec.resolution = [settings["height"], settings["width"]] equirect_sensor_spec.position = [0, settings["sensor_height"], 0] for k in kw_args: setattr(equirect_sensor_spec, k, kw_args[k]) return equirect_sensor_spec # if settings["equirect_rgba_sensor"]: # equirect_rgba_sensor_spec = create_equirect_spec(uuid="equirect_rgba_sensor") # sensor_specs.append(equirect_rgba_sensor_spec) # # if settings["equirect_depth_sensor"]: # equirect_depth_sensor_spec = create_equirect_spec( # uuid="equirect_depth_sensor", # sensor_type=habitat_sim.SensorType.DEPTH, # channels=1, # ) # sensor_specs.append(equirect_depth_sensor_spec) # # if settings["equirect_semantic_sensor"]: # equirect_semantic_sensor_spec = create_equirect_spec( # uuid="equirect_semantic_sensor", # sensor_type=habitat_sim.SensorType.SEMANTIC, # channels=1, # ) # sensor_specs.append(equirect_semantic_sensor_spec) # create agent specifications agent_cfg = habitat_sim.agent.AgentConfiguration() agent_cfg.sensor_specifications = sensor_specs agent_cfg.action_space = { "move_forward": habitat_sim.agent.ActionSpec( "move_forward", habitat_sim.agent.ActuationSpec(amount=0.25) ), "turn_left": habitat_sim.agent.ActionSpec( "turn_left", habitat_sim.agent.ActuationSpec(amount=10.0) ), "turn_right": habitat_sim.agent.ActionSpec( "turn_right", habitat_sim.agent.ActuationSpec(amount=10.0) ), } # override action space to no-op to test physics if sim_cfg.enable_physics: agent_cfg.action_space = { "move_forward": habitat_sim.agent.ActionSpec( "move_forward", habitat_sim.agent.ActuationSpec(amount=0.0) ) } return habitat_sim.Configuration(sim_cfg, [agent_cfg]) ================================================ FILE: data_generation/transformation.py ================================================ import numpy as np import quaternion import trimesh def habitat_world_transformations(): import habitat_sim # Transforms between the habitat frame H (y-up) and the world frame W (z-up). 
T_wh = np.identity(4) # https://stackoverflow.com/questions/1171849/finding-quaternion-representing-the-rotation-from-one-vector-to-another T_wh[0:3, 0:3] = quaternion.as_rotation_matrix(habitat_sim.utils.common.quat_from_two_vectors( habitat_sim.geo.GRAVITY, np.array([0.0, 0.0, -1.0]))) T_hw = np.linalg.inv(T_wh) return T_wh, T_hw def opencv_to_opengl_camera(transform=None): if transform is None: transform = np.eye(4) return transform @ trimesh.transformations.rotation_matrix( np.deg2rad(180), [1, 0, 0] ) def opengl_to_opencv_camera(transform=None): if transform is None: transform = np.eye(4) return transform @ trimesh.transformations.rotation_matrix( np.deg2rad(-180), [1, 0, 0] ) def Twc_to_Thc(T_wc): # opencv-camera to world transformation ---> habitat-caemra to habitat world transformation T_wh, T_hw = habitat_world_transformations() T_hc = T_hw @ T_wc @ opengl_to_opencv_camera() return T_hc def Thc_to_Twc(T_hc): # habitat-caemra to habitat world transformation ---> opencv-camera to world transformation T_wh, T_hw = habitat_world_transformations() T_wc = T_wh @ T_hc @ opencv_to_opengl_camera() return T_wc def combine_pose(t: np.array, q: quaternion.quaternion) -> np.array: T = np.identity(4) T[0:3, 3] = t T[0:3, 0:3] = quaternion.as_rotation_matrix(q) return T ================================================ FILE: dataset.py ================================================ import imgviz from torch.utils.data import Dataset, DataLoader import torch import numpy as np import cv2 import os from utils import enlarge_bbox, get_bbox2d, get_bbox2d_batch, box_filter import glob from torchvision import transforms import image_transforms import open3d import time def next_live_data(track_to_map_IDT, inited): while True: if track_to_map_IDT.empty(): if inited: return None # no new frame, use kf buffer else: # blocking until get the first frame continue else: Buffer_data = track_to_map_IDT.get(block=False) break if Buffer_data is not None: image, depth, T, obj, bbox_dict, kf_id = Buffer_data del Buffer_data T_obj = torch.eye(4) sample = {"image": image, "depth": depth, "T": T, "T_obj": T_obj, "obj": obj, "bbox_dict": bbox_dict, "frame_id": kf_id} return sample else: print("getting nothing?") exit(-1) # return None def init_loader(cfg, multi_worker=True): if cfg.dataset_format == "Replica": dataset = Replica(cfg) elif cfg.dataset_format == "ScanNet": dataset = ScanNet(cfg) else: print("Dataset format {} not found".format(cfg.dataset_format)) exit(-1) # init dataloader if multi_worker: # multi worker loader dataloader = DataLoader(dataset, batch_size=None, shuffle=False, sampler=None, batch_sampler=None, num_workers=4, collate_fn=None, pin_memory=True, drop_last=False, timeout=0, worker_init_fn=None, generator=None, prefetch_factor=2, persistent_workers=True) else: # single worker loader dataloader = DataLoader(dataset, batch_size=None, shuffle=False, sampler=None, batch_sampler=None, num_workers=0) return dataloader class Replica(Dataset): def __init__(self, cfg): self.imap_mode = cfg.imap_mode self.root_dir = cfg.dataset_dir traj_file = os.path.join(self.root_dir, "traj_w_c.txt") self.Twc = np.loadtxt(traj_file, delimiter=" ").reshape([-1, 4, 4]) self.depth_transform = transforms.Compose( [image_transforms.DepthScale(cfg.depth_scale), image_transforms.DepthFilter(cfg.max_depth)]) # background semantic classes: undefined--1, undefined-0 beam-5 blinds-12 curtain-30 ceiling-31 floor-40 pillar-60 vent-92 wall-93 wall-plug-95 window-97 rug-98 self.background_cls_list = 
[5,12,30,31,40,60,92,93,95,97,98,79] # Not sure: door-37 handrail-43 lamp-47 pipe-62 rack-66 shower-stall-73 stair-77 switch-79 wall-cabinet-94 picture-59 self.bbox_scale = 0.2 # 1 #1.5 0.9== s=1/9, s=0.2 def __len__(self): return len(os.listdir(os.path.join(self.root_dir, "depth"))) def __getitem__(self, idx): bbox_dict = {} rgb_file = os.path.join(self.root_dir, "rgb", "rgb_" + str(idx) + ".png") depth_file = os.path.join(self.root_dir, "depth", "depth_" + str(idx) + ".png") inst_file = os.path.join(self.root_dir, "semantic_instance", "semantic_instance_" + str(idx) + ".png") obj_file = os.path.join(self.root_dir, "semantic_class", "semantic_class_" + str(idx) + ".png") depth = cv2.imread(depth_file, -1).astype(np.float32).transpose(1,0) image = cv2.imread(rgb_file).astype(np.uint8) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).transpose(1,0,2) obj = cv2.imread(obj_file, cv2.IMREAD_UNCHANGED).astype(np.int32).transpose(1,0) # uint16 -> int32 inst = cv2.imread(inst_file, cv2.IMREAD_UNCHANGED).astype(np.int32).transpose(1,0) # uint16 -> int32 bbox_scale = self.bbox_scale if self.imap_mode: obj = np.zeros_like(obj) else: obj_ = np.zeros_like(obj) inst_list = [] batch_masks = [] for inst_id in np.unique(inst): inst_mask = inst == inst_id # if np.sum(inst_mask) <= 2000: # too small 20 400 # continue sem_cls = np.unique(obj[inst_mask]) # sem label, only interested obj assert sem_cls.shape[0] != 0 if sem_cls in self.background_cls_list: continue obj_mask = inst == inst_id batch_masks.append(obj_mask) inst_list.append(inst_id) if len(batch_masks) > 0: batch_masks = torch.from_numpy(np.stack(batch_masks)) cmins, cmaxs, rmins, rmaxs = get_bbox2d_batch(batch_masks) for i in range(batch_masks.shape[0]): w = rmaxs[i] - rmins[i] h = cmaxs[i] - cmins[i] if w <= 10 or h <= 10: # too small todo continue bbox_enlarged = enlarge_bbox([rmins[i], cmins[i], rmaxs[i], cmaxs[i]], scale=bbox_scale, w=obj.shape[1], h=obj.shape[0]) # inst_list.append(inst_id) inst_id = inst_list[i] obj_[batch_masks[i]] = 1 # bbox_dict.update({int(inst_id): torch.from_numpy(np.array(bbox_enlarged).reshape(-1))}) # batch format bbox_dict.update({inst_id: torch.from_numpy(np.array( [bbox_enlarged[1], bbox_enlarged[3], bbox_enlarged[0], bbox_enlarged[2]]))}) # bbox order inst[obj_ == 0] = 0 # for background obj = inst bbox_dict.update({0: torch.from_numpy(np.array([int(0), int(obj.shape[0]), 0, int(obj.shape[1])]))}) # bbox order T = self.Twc[idx] # could change to ORB-SLAM pose or else T_obj = np.eye(4) # obj pose, if dynamic sample = {"image": image, "depth": depth, "T": T, "T_obj": T_obj, "obj": obj, "bbox_dict": bbox_dict, "frame_id": idx} if image is None or depth is None: print(rgb_file) print(depth_file) raise ValueError if self.depth_transform: sample["depth"] = self.depth_transform(sample["depth"]) return sample class ScanNet(Dataset): def __init__(self, cfg): self.imap_mode = cfg.imap_mode self.root_dir = cfg.dataset_dir self.color_paths = sorted(glob.glob(os.path.join( self.root_dir, 'color', '*.jpg')), key=lambda x: int(os.path.basename(x)[:-4])) self.depth_paths = sorted(glob.glob(os.path.join( self.root_dir, 'depth', '*.png')), key=lambda x: int(os.path.basename(x)[:-4])) self.inst_paths = sorted(glob.glob(os.path.join( self.root_dir, 'instance-filt', '*.png')), key=lambda x: int(os.path.basename(x)[:-4])) # instance-filt self.sem_paths = sorted(glob.glob(os.path.join( self.root_dir, 'label-filt', '*.png')), key=lambda x: int(os.path.basename(x)[:-4])) # label-filt self.load_poses(os.path.join(self.root_dir, 'pose')) 
self.n_img = len(self.color_paths) self.depth_transform = transforms.Compose( [image_transforms.DepthScale(cfg.depth_scale), image_transforms.DepthFilter(cfg.max_depth)]) # self.rgb_transform = rgb_transform self.W = cfg.W self.H = cfg.H self.fx = cfg.fx self.fy = cfg.fy self.cx = cfg.cx self.cy = cfg.cy self.edge = cfg.mw self.intrinsic_open3d = open3d.camera.PinholeCameraIntrinsic( width=self.W, height=self.H, fx=self.fx, fy=self.fy, cx=self.cx, cy=self.cy, ) self.min_pixels = 1500 # from scannetv2-labels.combined.tsv #1-wall, 3-floor, 16-window, 41-ceiling, 232-light switch 0-unknown? 21-pillar 161-doorframe, shower walls-128, curtain-21, windowsill-141 self.background_cls_list = [-1, 0, 1, 3, 16, 41, 232, 21, 161, 128, 21] self.bbox_scale = 0.2 self.inst_dict = {} def load_poses(self, path): self.poses = [] pose_paths = sorted(glob.glob(os.path.join(path, '*.txt')), key=lambda x: int(os.path.basename(x)[:-4])) for pose_path in pose_paths: with open(pose_path, "r") as f: lines = f.readlines() ls = [] for line in lines: l = list(map(float, line.split(' '))) ls.append(l) c2w = np.array(ls).reshape(4, 4) self.poses.append(c2w) def __len__(self): return self.n_img def __getitem__(self, index): bbox_scale = self.bbox_scale color_path = self.color_paths[index] depth_path = self.depth_paths[index] inst_path = self.inst_paths[index] sem_path = self.sem_paths[index] color_data = cv2.imread(color_path).astype(np.uint8) color_data = cv2.cvtColor(color_data, cv2.COLOR_BGR2RGB) depth_data = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED).astype(np.float32) depth_data = np.nan_to_num(depth_data, nan=0.) T = None if self.poses is not None: T = self.poses[index] if np.any(np.isinf(T)): if index + 1 == self.__len__(): print("pose inf!") return None return self.__getitem__(index + 1) H, W = depth_data.shape color_data = cv2.resize(color_data, (W, H), interpolation=cv2.INTER_LINEAR) if self.edge: edge = self.edge # crop image edge, there are invalid value on the edge of the color image color_data = color_data[edge:-edge, edge:-edge] depth_data = depth_data[edge:-edge, edge:-edge] if self.depth_transform: depth_data = self.depth_transform(depth_data) bbox_dict = {} if self.imap_mode: inst_data = np.zeros_like(depth_data).astype(np.int32) else: inst_data = cv2.imread(inst_path, cv2.IMREAD_UNCHANGED) inst_data = cv2.resize(inst_data, (W, H), interpolation=cv2.INTER_NEAREST).astype(np.int32) sem_data = cv2.imread(sem_path, cv2.IMREAD_UNCHANGED)#.astype(np.int32) sem_data = cv2.resize(sem_data, (W, H), interpolation=cv2.INTER_NEAREST) if self.edge: edge = self.edge inst_data = inst_data[edge:-edge, edge:-edge] sem_data = sem_data[edge:-edge, edge:-edge] inst_data += 1 # shift from 0->1 , 0 is for background # box filter track_start = time.time() masks = [] classes = [] # convert to list of arrays obj_ids = np.unique(inst_data) for obj_id in obj_ids: mask = inst_data == obj_id sem_cls = np.unique(sem_data[mask]) if sem_cls in self.background_cls_list: inst_data[mask] = 0 # set to background continue masks.append(mask) classes.append(obj_id) T_CW = np.linalg.inv(T) inst_data = box_filter(masks, classes, depth_data, self.inst_dict, self.intrinsic_open3d, T_CW, min_pixels=self.min_pixels) merged_obj_ids = np.unique(inst_data) for obj_id in merged_obj_ids: mask = inst_data == obj_id bbox2d = get_bbox2d(mask, bbox_scale=bbox_scale) if bbox2d is None: inst_data[mask] = 0 # set to bg else: min_x, min_y, max_x, max_y = bbox2d bbox_dict.update({int(obj_id): torch.from_numpy(np.array([min_x, max_x, min_y, 
max_y]).reshape(-1))}) # batch format bbox_time = time.time() # print("bbox time ", bbox_time - filter_time) cv2.imshow("inst", imgviz.label2rgb(inst_data)) cv2.waitKey(1) print("frame {} track time {}".format(index, bbox_time-track_start)) bbox_dict.update({0: torch.from_numpy(np.array([int(0), int(inst_data.shape[1]), 0, int(inst_data.shape[0])]))}) # bbox order # wrap data to frame dict T_obj = np.identity(4) sample = {"image": color_data.transpose(1,0,2), "depth": depth_data.transpose(1,0), "T": T, "T_obj": T_obj} if color_data is None or depth_data is None: print(color_path) print(depth_path) raise ValueError sample.update({"obj": inst_data.transpose(1,0)}) sample.update({"bbox_dict": bbox_dict}) return sample ================================================ FILE: embedding.py ================================================ import torch import numpy as np def positional_encoding( tensor, B_layer=None, num_encoding_functions=6, scale=10. ): if B_layer is not None: embedding_gauss = B_layer(tensor / scale) embedding_gauss = torch.sin(embedding_gauss) embedding = embedding_gauss else: frequency_bands = 2.0 ** torch.linspace( 0.0, num_encoding_functions - 1, num_encoding_functions, dtype=tensor.dtype, device=tensor.device, ) n_repeat = num_encoding_functions * 2 + 1 embedding = tensor[..., None, :].repeat(1, 1, n_repeat, 1) / scale even_idx = np.arange(1, num_encoding_functions + 1) * 2 odd_idx = even_idx - 1 frequency_bands = frequency_bands[None, None, :, None] embedding[:, :, even_idx, :] = torch.cos( frequency_bands * embedding[:, :, even_idx, :]) embedding[:, :, odd_idx, :] = torch.sin( frequency_bands * embedding[:, :, odd_idx, :]) n_dim = tensor.shape[-1] embedding = embedding.view( embedding.shape[0], embedding.shape[1], n_repeat * n_dim) # print("embedding ", embedding.shape) embedding = embedding.squeeze(0) return embedding class UniDirsEmbed(torch.nn.Module): def __init__(self, min_deg=0, max_deg=2, scale=2.): super(UniDirsEmbed, self).__init__() self.min_deg = min_deg self.max_deg = max_deg self.n_freqs = max_deg - min_deg + 1 self.tensor_scale = torch.tensor(scale, requires_grad=False) dirs = torch.tensor([ 0.8506508, 0, 0.5257311, 0.809017, 0.5, 0.309017, 0.5257311, 0.8506508, 0, 1, 0, 0, 0.809017, 0.5, -0.309017, 0.8506508, 0, -0.5257311, 0.309017, 0.809017, -0.5, 0, 0.5257311, -0.8506508, 0.5, 0.309017, -0.809017, 0, 1, 0, -0.5257311, 0.8506508, 0, -0.309017, 0.809017, -0.5, 0, 0.5257311, 0.8506508, -0.309017, 0.809017, 0.5, 0.309017, 0.809017, 0.5, 0.5, 0.309017, 0.809017, 0.5, -0.309017, 0.809017, 0, 0, 1, -0.5, 0.309017, 0.809017, -0.809017, 0.5, 0.309017, -0.809017, 0.5, -0.309017 ]).reshape(-1, 3) self.B_layer = torch.nn.Linear(3, 21, bias=False) self.B_layer.weight.data = dirs frequency_bands = 2.0 ** torch.linspace(self.min_deg, self.max_deg, self.n_freqs) self.register_buffer("frequency_bands", frequency_bands, persistent=False) self.register_buffer("scale", self.tensor_scale, persistent=True) def forward(self, x): tensor = x / self.scale # functorch needs buffer, otherwise changed proj = self.B_layer(tensor) proj_bands = proj[..., None, :] * self.frequency_bands[None, None, :, None] xb = proj_bands.view(list(proj.shape[:-1]) + [-1]) # embedding = torch.sin(torch.cat([xb, xb + 0.5 * np.pi], dim=-1)) embedding = torch.sin(xb * np.pi) embedding = torch.cat([tensor] + [embedding], dim=-1) # print("emb size ", embedding.shape) return embedding ================================================ FILE: environment.yml ================================================ name: 
vmap channels: - pytorch - defaults dependencies: - _libgcc_mutex=0.1=main - _openmp_mutex=5.1=1_gnu - blas=1.0=mkl - brotlipy=0.7.0=py38h27cfd23_1003 - bzip2=1.0.8=h7b6447c_0 - ca-certificates=2022.10.11=h06a4308_0 - certifi=2022.9.24=py38h06a4308_0 - cffi=1.15.1=py38h5eee18b_2 - charset-normalizer=2.0.4=pyhd3eb1b0_0 - cryptography=38.0.1=py38h9ce1e76_0 - cudatoolkit=11.3.1=h2bc3f7f_2 - ffmpeg=4.3=hf484d3e_0 - freetype=2.12.1=h4a9f257_0 - giflib=5.2.1=h7b6447c_0 - gmp=6.2.1=h295c915_3 - gnutls=3.6.15=he1e5248_0 - idna=3.4=py38h06a4308_0 - intel-openmp=2021.4.0=h06a4308_3561 - jpeg=9e=h7f8727e_0 - lame=3.100=h7b6447c_0 - lcms2=2.12=h3be6417_0 - ld_impl_linux-64=2.38=h1181459_1 - lerc=3.0=h295c915_0 - libdeflate=1.8=h7f8727e_5 - libffi=3.4.2=h6a678d5_6 - libgcc-ng=11.2.0=h1234567_1 - libgomp=11.2.0=h1234567_1 - libiconv=1.16=h7f8727e_2 - libidn2=2.3.2=h7f8727e_0 - libpng=1.6.37=hbc83047_0 - libstdcxx-ng=11.2.0=h1234567_1 - libtasn1=4.16.0=h27cfd23_0 - libtiff=4.4.0=hecacb30_2 - libunistring=0.9.10=h27cfd23_0 - libwebp=1.2.4=h11a3e52_0 - libwebp-base=1.2.4=h5eee18b_0 - lz4-c=1.9.3=h295c915_1 - mkl=2021.4.0=h06a4308_640 - mkl-service=2.4.0=py38h7f8727e_0 - mkl_fft=1.3.1=py38hd3c417c_0 - mkl_random=1.2.2=py38h51133e4_0 - ncurses=6.3=h5eee18b_3 - nettle=3.7.3=hbbd107a_1 - numpy=1.23.4=py38h14f4228_0 - numpy-base=1.23.4=py38h31eccc5_0 - openh264=2.1.1=h4ff587b_0 - openssl=1.1.1s=h7f8727e_0 - pillow=9.2.0=py38hace64e9_1 - pip=22.2.2=py38h06a4308_0 - pycparser=2.21=pyhd3eb1b0_0 - pyopenssl=22.0.0=pyhd3eb1b0_0 - pysocks=1.7.1=py38h06a4308_0 - python=3.8.15=h7a1cb2a_2 - pytorch=1.12.1=py3.8_cuda11.3_cudnn8.3.2_0 - pytorch-mutex=1.0=cuda - readline=8.2=h5eee18b_0 - requests=2.28.1=py38h06a4308_0 - setuptools=65.5.0=py38h06a4308_0 - six=1.16.0=pyhd3eb1b0_1 - sqlite=3.40.0=h5082296_0 - tk=8.6.12=h1ccaba5_0 - torchaudio=0.12.1=py38_cu113 - torchvision=0.13.1=py38_cu113 - typing_extensions=4.3.0=py38h06a4308_0 - urllib3=1.26.12=py38h06a4308_0 - wheel=0.37.1=pyhd3eb1b0_0 - xz=5.2.6=h5eee18b_0 - zlib=1.2.13=h5eee18b_0 - zstd=1.5.2=ha4553b6_0 - pip: - antlr4-python3-runtime==4.9.3 - bidict==0.22.0 - click==8.1.3 - dash==2.7.0 - dash-core-components==2.0.0 - dash-html-components==2.0.0 - dash-table==5.0.0 - entrypoints==0.4 - flask==2.2.2 - functorch==0.2.0 - fvcore==0.1.5.post20221122 - h5py==3.7.0 - imageio==2.22.4 - importlib-metadata==5.1.0 - iopath==0.1.10 - itsdangerous==2.1.2 - lpips==0.1.4 - nbformat==5.5.0 - omegaconf==2.2.3 - open3d==0.16.0 - pexpect==4.8.0 - plotly==5.11.0 - pycocotools==2.0.6 - pyquaternion==0.9.9 - scikit-image==0.19.3 - tenacity==8.1.0 - termcolor==2.1.1 - timm==0.6.12 - werkzeug==2.2.2 - yacs==0.1.8 - zipp==3.11.0 ================================================ FILE: image_transforms.py ================================================ import cv2 import numpy as np class BGRtoRGB(object): """bgr format to rgb""" def __call__(self, image): image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) return image class DepthScale(object): """scale depth to meters""" def __init__(self, scale): self.scale = scale def __call__(self, depth): depth = depth.astype(np.float32) return depth * self.scale class DepthFilter(object): """scale depth to meters""" def __init__(self, max_depth): self.max_depth = max_depth def __call__(self, depth): far_mask = depth > self.max_depth depth[far_mask] = 0. 
return depth class Undistort(object): """scale depth to meters""" def __init__(self, w, h, fx, fy, cx, cy, k1, k2, k3, k4, k5, k6, p1, p2, interpolation): self.interpolation = interpolation K = np.array([[fx, 0., cx], [0., fy, cy], [0., 0., 1.]]) self.map1x, self.map1y = cv2.initUndistortRectifyMap( K, np.array([k1, k2, p1, p2, k3, k4, k5, k6]), np.eye(3), K, (w, h), cv2.CV_32FC1) def __call__(self, im): im = cv2.remap(im, self.map1x, self.map1y, self.interpolation) return im ================================================ FILE: loss.py ================================================ import torch import render_rays import torch.nn.functional as F def step_batch_loss(alpha, color, gt_depth, gt_color, sem_labels, mask_depth, z_vals, color_scaling=5.0, opacity_scaling=10.0): """ apply depth where depth are valid -> mask_depth apply depth, color loss on this_obj & unkown_obj == (~other_obj) -> mask_obj apply occupancy/opacity loss on this_obj & other_obj == (~unknown_obj) -> mask_sem output: loss for training loss_all for per sample, could be used for active sampling, replay buffer """ mask_obj = sem_labels != 0 mask_obj = mask_obj.detach() mask_sem = sem_labels != 2 mask_sem = mask_sem.detach() alpha = alpha.squeeze(dim=-1) color = color.squeeze(dim=-1) occupancy = render_rays.occupancy_activation(alpha) termination = render_rays.occupancy_to_termination(occupancy, is_batch=True) # shape [num_batch, num_ray, points_per_ray] render_depth = render_rays.render(termination, z_vals) diff_sq = (z_vals - render_depth[..., None]) ** 2 var = render_rays.render(termination, diff_sq).detach() # must detach here! render_color = render_rays.render(termination[..., None], color, dim=-2) render_opacity = torch.sum(termination, dim=-1) # similar to obj-nerf opacity loss # 2D depth loss: only on valid depth & mask # [mask_depth & mask_obj] # loss_all = torch.zeros_like(render_depth) loss_depth_raw = render_rays.render_loss(render_depth, gt_depth, loss="L1", normalise=False) loss_depth = torch.mul(loss_depth_raw, mask_depth & mask_obj) # keep dim but set invalid element be zero # loss_all += loss_depth loss_depth = render_rays.reduce_batch_loss(loss_depth, var=var, avg=True, mask=mask_depth & mask_obj) # apply var as imap # 2D color loss: only on obj mask # [mask_obj] loss_col_raw = render_rays.render_loss(render_color, gt_color, loss="L1", normalise=False) loss_col = torch.mul(loss_col_raw.sum(-1), mask_obj) # loss_all += loss_col / 3. 
* color_scaling loss_col = render_rays.reduce_batch_loss(loss_col, var=None, avg=True, mask=mask_obj) # 2D occupancy/opacity loss: apply except unknown area # [mask_sem] # loss_opacity_raw = F.mse_loss(torch.clamp(render_opacity, 0, 1), mask_obj.float().detach()) # encourage other_obj to be empty, while this_obj to be solid # print("opacity max ", torch.max(render_opacity.max())) # print("opacity min ", torch.max(render_opacity.min())) loss_opacity_raw = render_rays.render_loss(render_opacity, mask_obj.float(), loss="L1", normalise=False) loss_opacity = torch.mul(loss_opacity_raw, mask_sem) # but ignore -1 unkown area e.g., mask edges # loss_all += loss_opacity * opacity_scaling loss_opacity = render_rays.reduce_batch_loss(loss_opacity, var=None, avg=True, mask=mask_sem) # todo var # loss for bp l_batch = loss_depth + loss_col * color_scaling + loss_opacity * opacity_scaling loss = l_batch.sum() return loss, None # return loss, loss_all.detach() ================================================ FILE: metric/eval_3D_obj.py ================================================ import numpy as np from tqdm import tqdm import trimesh from metrics import accuracy, completion, completion_ratio import os import json def calc_3d_metric(mesh_rec, mesh_gt, N=200000): """ 3D reconstruction metric. """ metrics = [[] for _ in range(4)] transform, extents = trimesh.bounds.oriented_bounds(mesh_gt) extents = extents / 0.9 # enlarge 0.9 box = trimesh.creation.box(extents=extents, transform=np.linalg.inv(transform)) mesh_rec = mesh_rec.slice_plane(box.facets_origin, -box.facets_normal) if mesh_rec.vertices.shape[0] == 0: print("no mesh found") return rec_pc = trimesh.sample.sample_surface(mesh_rec, N) rec_pc_tri = trimesh.PointCloud(vertices=rec_pc[0]) gt_pc = trimesh.sample.sample_surface(mesh_gt, N) gt_pc_tri = trimesh.PointCloud(vertices=gt_pc[0]) accuracy_rec = accuracy(gt_pc_tri.vertices, rec_pc_tri.vertices) completion_rec = completion(gt_pc_tri.vertices, rec_pc_tri.vertices) completion_ratio_rec = completion_ratio(gt_pc_tri.vertices, rec_pc_tri.vertices, 0.05) completion_ratio_rec_1 = completion_ratio(gt_pc_tri.vertices, rec_pc_tri.vertices, 0.01) # accuracy_rec *= 100 # convert to cm # completion_rec *= 100 # convert to cm # completion_ratio_rec *= 100 # convert to % # print('accuracy: ', accuracy_rec) # print('completion: ', completion_rec) # print('completion ratio: ', completion_ratio_rec) # print("completion_ratio_rec_1cm ", completion_ratio_rec_1) metrics[0].append(accuracy_rec) metrics[1].append(completion_rec) metrics[2].append(completion_ratio_rec_1) metrics[3].append(completion_ratio_rec) return metrics def get_gt_bg_mesh(gt_dir, background_cls_list): with open(os.path.join(gt_dir, "info_semantic.json")) as f: label_obj_list = json.load(f)["objects"] bg_meshes = [] for obj in label_obj_list: if int(obj["class_id"]) in background_cls_list: obj_file = os.path.join(gt_dir, "mesh_semantic.ply_" + str(int(obj["id"])) + ".ply") obj_mesh = trimesh.load(obj_file) bg_meshes.append(obj_mesh) bg_mesh = trimesh.util.concatenate(bg_meshes) return bg_mesh def get_obj_ids(obj_dir): files = os.listdir(obj_dir) obj_ids = [] for f in files: obj_id = f.split("obj")[1][:-1] if obj_id == '': continue obj_ids.append(int(obj_id)) return obj_ids if __name__ == "__main__": background_cls_list = [5, 12, 30, 31, 40, 60, 92, 93, 95, 97, 98, 79] exp_name = ["room0", "room1", "room2", "office0", "office1", "office2", "office3", "office4"] data_dir = "/home/xin/data/vmap/" log_dir = "../logs/iMAP/" # log_dir = "../logs/vMAP/" 
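# Per-scene evaluation: for every object id, load the reconstructed mesh and the
# ground-truth mesh (per-object mesh_semantic.ply_<id>.ply, or the concatenated
# background-class meshes for id 0), sample N surface points from each (10k per
# object, 200k for the background), and compute accuracy, completion and completion
# ratios at 1cm / 5cm with the KD-tree metrics imported from metrics.py (units: metres).
# Illustrative (hypothetical) inspection of a saved result file:
#   m = np.load("metrics_3D_obj.npy")                       # shape (4, n_objects, 1)
#   acc, comp, ratio_1cm, ratio_5cm = m.mean(axis=1).squeeze()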
for exp in tqdm(exp_name): gt_dir = os.path.join(data_dir, exp[:-1]+"_"+exp[-1]+"/habitat") exp_dir = os.path.join(log_dir, exp) mesh_dir = os.path.join(exp_dir, "scene_mesh") output_path = os.path.join(exp_dir, "eval_mesh") os.makedirs(output_path, exist_ok=True) metrics_3D = [[] for _ in range(4)] # get obj ids # obj_ids = np.loadtxt() # todo use a pre-defined obj list or use vMAP results obj_ids = get_obj_ids(mesh_dir.replace("iMAP", "vMAP")) for obj_id in tqdm(obj_ids): if obj_id == 0: # for bg N = 200000 mesh_gt = get_gt_bg_mesh(gt_dir, background_cls_list) else: # for obj N = 10000 obj_file = os.path.join(gt_dir, "mesh_semantic.ply_" + str(obj_id) + ".ply") mesh_gt = trimesh.load(obj_file) if "vMAP" in exp_dir: rec_meshfile = os.path.join(mesh_dir, "frame_1999_obj"+str(obj_id)+".obj") elif "iMAP" in exp_dir: rec_meshfile = os.path.join(mesh_dir, "frame_1999_obj0.obj") else: print("Not Implement") exit(-1) mesh_rec = trimesh.load(rec_meshfile) # mesh_rec.invert() # niceslam mesh face needs invert metrics = calc_3d_metric(mesh_rec, mesh_gt, N=N) # for objs use 10k, for scene use 200k points if metrics is None: continue np.save(output_path + '/metric_obj{}.npy'.format(obj_id), np.array(metrics)) metrics_3D[0].append(metrics[0]) # acc metrics_3D[1].append(metrics[1]) # comp metrics_3D[2].append(metrics[2]) # comp ratio 1cm metrics_3D[3].append(metrics[3]) # comp ratio 5cm metrics_3D = np.array(metrics_3D) np.save(output_path + '/metrics_3D_obj.npy', metrics_3D) print("metrics 3D obj \n Acc | Comp | Comp Ratio 1cm | Comp Ratio 5cm \n", metrics_3D.mean(axis=1)) print("-----------------------------------------") print("finish exp ", exp) ================================================ FILE: metric/eval_3D_scene.py ================================================ import numpy as np from tqdm import tqdm import trimesh from metrics import accuracy, completion, completion_ratio import os def calc_3d_metric(mesh_rec, mesh_gt, N=200000): """ 3D reconstruction metric. 
""" metrics = [[] for _ in range(4)] rec_pc = trimesh.sample.sample_surface(mesh_rec, N) rec_pc_tri = trimesh.PointCloud(vertices=rec_pc[0]) gt_pc = trimesh.sample.sample_surface(mesh_gt, N) gt_pc_tri = trimesh.PointCloud(vertices=gt_pc[0]) accuracy_rec = accuracy(gt_pc_tri.vertices, rec_pc_tri.vertices) completion_rec = completion(gt_pc_tri.vertices, rec_pc_tri.vertices) completion_ratio_rec = completion_ratio(gt_pc_tri.vertices, rec_pc_tri.vertices, 0.05) completion_ratio_rec_1 = completion_ratio(gt_pc_tri.vertices, rec_pc_tri.vertices, 0.01) # accuracy_rec *= 100 # convert to cm # completion_rec *= 100 # convert to cm # completion_ratio_rec *= 100 # convert to % # print('accuracy: ', accuracy_rec) # print('completion: ', completion_rec) # print('completion ratio: ', completion_ratio_rec) # print("completion_ratio_rec_1cm ", completion_ratio_rec_1) metrics[0].append(accuracy_rec) metrics[1].append(completion_rec) metrics[2].append(completion_ratio_rec_1) metrics[3].append(completion_ratio_rec) return metrics if __name__ == "__main__": exp_name = ["room0", "room1", "room2", "office0", "office1", "office2", "office3", "office4"] data_dir = "/home/xin/data/vmap/" # log_dir = "../logs/iMAP/" log_dir = "../logs/vMAP/" for exp in tqdm(exp_name): gt_dir = os.path.join(data_dir, exp[:-1]+"_"+exp[-1]+"/habitat") exp_dir = os.path.join(log_dir, exp) mesh_dir = os.path.join(exp_dir, "scene_mesh") output_path = os.path.join(exp_dir, "eval_mesh") os.makedirs(output_path, exist_ok=True) if "vMAP" in exp_dir: mesh_list = os.listdir(mesh_dir) if "frame_1999_scene.obj" in mesh_list: rec_meshfile = os.path.join(mesh_dir, "frame_1999_scene.obj") else: # compose obj into scene mesh scene_meshes = [] for f in mesh_list: _, f_type = os.path.splitext(f) if f_type == ".obj" or f_type == ".ply": obj_mesh = trimesh.load(os.path.join(mesh_dir, f)) scene_meshes.append(obj_mesh) scene_mesh = trimesh.util.concatenate(scene_meshes) scene_mesh.export(os.path.join(mesh_dir, "frame_1999_scene.obj")) rec_meshfile = os.path.join(mesh_dir, "frame_1999_scene.obj") elif "iMAP" in exp_dir: # obj0 is the scene mesh rec_meshfile = os.path.join(mesh_dir, "frame_1999_obj0.obj") else: print("Not Implement") exit(-1) gt_mesh_files = os.listdir(gt_dir) gt_mesh_file = os.path.join(gt_dir, "../mesh.ply") mesh_rec = trimesh.load(rec_meshfile) # mesh_rec.invert() # niceslam mesh face needs invert metrics_3D = [[] for _ in range(4)] mesh_gt = trimesh.load(gt_mesh_file) metrics = calc_3d_metric(mesh_rec, mesh_gt, N=200000) # for objs use 10k, for scene use 200k points metrics_3D[0].append(metrics[0]) # acc metrics_3D[1].append(metrics[1]) # comp metrics_3D[2].append(metrics[2]) # comp ratio 1cm metrics_3D[3].append(metrics[3]) # comp ratio 5cm metrics_3D = np.array(metrics_3D) np.save(output_path + '/metrics_3D_scene.npy', metrics_3D) print("metrics 3D scene \n Acc | Comp | Comp Ratio 1cm | Comp Ratio 5cm \n ", metrics_3D.mean(axis=1)) print("-----------------------------------------") print("finish exp ", exp) ================================================ FILE: metric/metrics.py ================================================ import numpy as np from scipy.spatial import cKDTree as KDTree def completion_ratio(gt_points, rec_points, dist_th=0.01): gen_points_kd_tree = KDTree(rec_points) one_distances, one_vertex_ids = gen_points_kd_tree.query(gt_points) completion = np.mean((one_distances < dist_th).astype(np.float)) return completion def accuracy(gt_points, rec_points): gt_points_kd_tree = KDTree(gt_points) two_distances, 
two_vertex_ids = gt_points_kd_tree.query(rec_points) gen_to_gt_chamfer = np.mean(two_distances) return gen_to_gt_chamfer def completion(gt_points, rec_points): gt_points_kd_tree = KDTree(rec_points) one_distances, two_vertex_ids = gt_points_kd_tree.query(gt_points) gt_to_gen_chamfer = np.mean(one_distances) return gt_to_gen_chamfer def chamfer(gt_points, rec_points): # one direction gen_points_kd_tree = KDTree(rec_points) one_distances, one_vertex_ids = gen_points_kd_tree.query(gt_points) gt_to_gen_chamfer = np.mean(one_distances) # other direction gt_points_kd_tree = KDTree(gt_points) two_distances, two_vertex_ids = gt_points_kd_tree.query(rec_points) gen_to_gt_chamfer = np.mean(two_distances) return (gt_to_gen_chamfer + gen_to_gt_chamfer) / 2. ================================================ FILE: model.py ================================================ import torch import torch.nn as nn def init_weights(m, init_fn=torch.nn.init.xavier_normal_): if type(m) == torch.nn.Linear: init_fn(m.weight) def fc_block(in_f, out_f): return torch.nn.Sequential( torch.nn.Linear(in_f, out_f), torch.nn.ReLU(out_f) ) class OccupancyMap(torch.nn.Module): def __init__( self, emb_size1, emb_size2, hidden_size=256, do_color=True, hidden_layers_block=1 ): super(OccupancyMap, self).__init__() self.do_color = do_color self.embedding_size1 = emb_size1 self.in_layer = fc_block(self.embedding_size1, hidden_size) hidden1 = [fc_block(hidden_size, hidden_size) for _ in range(hidden_layers_block)] self.mid1 = torch.nn.Sequential(*hidden1) # self.embedding_size2 = 21*(5+1)+3 - self.embedding_size # 129-66=63 32 self.embedding_size2 = emb_size2 self.cat_layer = fc_block( hidden_size + self.embedding_size1, hidden_size) # self.cat_layer = fc_block( # hidden_size , hidden_size) hidden2 = [fc_block(hidden_size, hidden_size) for _ in range(hidden_layers_block)] self.mid2 = torch.nn.Sequential(*hidden2) self.out_alpha = torch.nn.Linear(hidden_size, 1) if self.do_color: self.color_linear = fc_block(self.embedding_size2 + hidden_size, hidden_size) self.out_color = torch.nn.Linear(hidden_size, 3) # self.relu = torch.nn.functional.relu self.sigmoid = torch.sigmoid def forward(self, x, noise_std=None, do_alpha=True, do_color=True, do_cat=True): fc1 = self.in_layer(x[...,:self.embedding_size1]) fc2 = self.mid1(fc1) # fc3 = self.cat_layer(fc2) if do_cat: fc2_x = torch.cat((fc2, x[...,:self.embedding_size1]), dim=-1) fc3 = self.cat_layer(fc2_x) else: fc3 = fc2 fc4 = self.mid2(fc3) alpha = None if do_alpha: raw = self.out_alpha(fc4) # todo ignore noise if noise_std is not None: noise = torch.randn(raw.shape, device=x.device) * noise_std raw = raw + noise # alpha = self.relu(raw) * scale # nerf alpha = raw * 10. 
#self.scale # unisurf color = None if self.do_color and do_color: fc4_cat = self.color_linear(torch.cat((fc4, x[..., self.embedding_size1:]), dim=-1)) raw_color = self.out_color(fc4_cat) color = self.sigmoid(raw_color) return alpha, color ================================================ FILE: render_rays.py ================================================ import torch import numpy as np def occupancy_activation(alpha, distances=None): # occ = 1.0 - torch.exp(-alpha * distances) occ = torch.sigmoid(alpha) # unisurf return occ def alpha_to_occupancy(depths, dirs, alpha, add_last=False): interval_distances = depths[..., 1:] - depths[..., :-1] if add_last: last_distance = torch.empty( (depths.shape[0], 1), device=depths.device, dtype=depths.dtype).fill_(0.1) interval_distances = torch.cat( [interval_distances, last_distance], dim=-1) dirs_norm = torch.norm(dirs, dim=-1) interval_distances = interval_distances * dirs_norm[:, None] occ = occupancy_activation(alpha, interval_distances) return occ def occupancy_to_termination(occupancy, is_batch=False): if is_batch: first = torch.ones(list(occupancy.shape[:2]) + [1], device=occupancy.device) free_probs = (1. - occupancy + 1e-10)[:, :, :-1] else: first = torch.ones([occupancy.shape[0], 1], device=occupancy.device) free_probs = (1. - occupancy + 1e-10)[:, :-1] free_probs = torch.cat([first, free_probs], dim=-1) term_probs = occupancy * torch.cumprod(free_probs, dim=-1) # using escape probability # occupancy = occupancy[:, :-1] # first = torch.ones([occupancy.shape[0], 1], device=occupancy.device) # free_probs = (1. - occupancy + 1e-10) # free_probs = torch.cat([first, free_probs], dim=-1) # last = torch.ones([occupancy.shape[0], 1], device=occupancy.device) # occupancy = torch.cat([occupancy, last], dim=-1) # term_probs = occupancy * torch.cumprod(free_probs, dim=-1) return term_probs def render(termination, vals, dim=-1): weighted_vals = termination * vals render = weighted_vals.sum(dim=dim) return render def render_loss(render, gt, loss="L1", normalise=False): residual = render - gt if loss == "L2": loss_mat = residual ** 2 elif loss == "L1": loss_mat = torch.abs(residual) else: print("loss type {} not implemented!".format(loss)) if normalise: loss_mat = loss_mat / gt return loss_mat def reduce_batch_loss(loss_mat, var=None, avg=True, mask=None, loss_type="L1"): mask_num = torch.sum(mask, dim=-1) if (mask_num == 0).any(): # no valid sample, return 0 loss loss = torch.zeros_like(loss_mat) if avg: loss = torch.mean(loss, dim=-1) return loss if var is not None: eps = 1e-4 if loss_type == "L2": information = 1.0 / (var + eps) elif loss_type == "L1": information = 1.0 / (torch.sqrt(var) + eps) loss_weighted = loss_mat * information else: loss_weighted = loss_mat if avg: if mask is not None: loss = (torch.sum(loss_weighted, dim=-1)/(torch.sum(mask, dim=-1)+1e-10)) if (loss > 100000).any(): print("loss explode") exit(-1) else: loss = torch.mean(loss_weighted, dim=-1).sum() else: loss = loss_weighted return loss def make_3D_grid(occ_range=[-1., 1.], dim=256, device="cuda:0", transform=None, scale=None): t = torch.linspace(occ_range[0], occ_range[1], steps=dim, device=device) grid = torch.meshgrid(t, t, t) grid_3d = torch.cat( (grid[0][..., None], grid[1][..., None], grid[2][..., None]), dim=3 ) if scale is not None: grid_3d = grid_3d * scale if transform is not None: R1 = transform[None, None, None, 0, :3] R2 = transform[None, None, None, 1, :3] R3 = transform[None, None, None, 2, :3] grid1 = (R1 * grid_3d).sum(-1, keepdim=True) grid2 = (R2 * 
grid_3d).sum(-1, keepdim=True) grid3 = (R3 * grid_3d).sum(-1, keepdim=True) grid_3d = torch.cat([grid1, grid2, grid3], dim=-1) trans = transform[None, None, None, :3, 3] grid_3d = grid_3d + trans return grid_3d ================================================ FILE: train.py ================================================ import time import loss from vmap import * import utils import open3d import dataset import vis from functorch import vmap import argparse from cfg import Config import shutil if __name__ == "__main__": ############################################# # init config torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True # setting params parser = argparse.ArgumentParser(description='Model training for single GPU') parser.add_argument('--logdir', default='./logs/debug', type=str) parser.add_argument('--config', default='./configs/Replica/config_replica_room0_vMAP.json', type=str) parser.add_argument('--save_ckpt', default=False, type=bool) args = parser.parse_args() log_dir = args.logdir config_file = args.config save_ckpt = args.save_ckpt os.makedirs(log_dir, exist_ok=True) # saving logs shutil.copy(config_file, log_dir) cfg = Config(config_file) # config params n_sample_per_step = cfg.n_per_optim n_sample_per_step_bg = cfg.n_per_optim_bg # param for vis vis3d = open3d.visualization.Visualizer() vis3d.create_window(window_name="3D mesh vis", width=cfg.W, height=cfg.H, left=600, top=50) view_ctl = vis3d.get_view_control() view_ctl.set_constant_z_far(10.) # set camera cam_info = cameraInfo(cfg) intrinsic_open3d = open3d.camera.PinholeCameraIntrinsic( width=cfg.W, height=cfg.H, fx=cfg.fx, fy=cfg.fy, cx=cfg.cx, cy=cfg.cy) # init obj_dict obj_dict = {} # only objs vis_dict = {} # including bg # init for training AMP = False if AMP: scaler = torch.cuda.amp.GradScaler() # amp https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/ optimiser = torch.optim.AdamW([torch.autograd.Variable(torch.tensor(0))], lr=cfg.learning_rate, weight_decay=cfg.weight_decay) ############################################# # init data stream if not cfg.live_mode: # load dataset dataloader = dataset.init_loader(cfg) dataloader_iterator = iter(dataloader) dataset_len = len(dataloader) else: dataset_len = 1000000 # # init ros node # torch.multiprocessing.set_start_method('spawn') # spawn # import ros_nodes # track_to_map_Buffer = torch.multiprocessing.Queue(maxsize=5) # # track_to_vis_T_WC = torch.multiprocessing.Queue(maxsize=1) # kfs_que = torch.multiprocessing.Queue(maxsize=5) # to store one more buffer # track_p = torch.multiprocessing.Process(target=ros_nodes.Tracking, # args=( # (cfg), (track_to_map_Buffer), (None), # (kfs_que), (True),)) # track_p.start() # init vmap fc_models, pe_models = [], [] scene_bg = None for frame_id in tqdm(range(dataset_len)): print("*********************************************") # get new frame data with performance_measure(f"getting next data"): if not cfg.live_mode: # get data from dataloader sample = next(dataloader_iterator) else: pass if sample is not None: # new frame last_frame_time = time.time() with performance_measure(f"Appending data"): rgb = sample["image"].to(cfg.data_device) depth = sample["depth"].to(cfg.data_device) twc = sample["T"].to(cfg.data_device) bbox_dict = sample["bbox_dict"] if "frame_id" in sample.keys(): live_frame_id = sample["frame_id"] else: live_frame_id = frame_id if not cfg.live_mode: inst = sample["obj"].to(cfg.data_device) obj_ids = torch.unique(inst) else: inst_data_dict = 
sample["obj"] obj_ids = inst_data_dict.keys() # append new frame info to objs in current view for obj_id in obj_ids: if obj_id == -1: # unsured area continue obj_id = int(obj_id) # convert inst mask to state if not cfg.live_mode: state = torch.zeros_like(inst, dtype=torch.uint8, device=cfg.data_device) state[inst == obj_id] = 1 state[inst == -1] = 2 else: inst_mask = inst_data_dict[obj_id].permute(1,0) label_list = torch.unique(inst_mask).tolist() state = torch.zeros_like(inst_mask, dtype=torch.uint8, device=cfg.data_device) state[inst_mask == obj_id] = 1 state[inst_mask == -1] = 2 bbox = bbox_dict[obj_id] if obj_id in vis_dict.keys(): scene_obj = vis_dict[obj_id] scene_obj.append_keyframe(rgb, depth, state, bbox, twc, live_frame_id) else: # init scene_obj if len(obj_dict.keys()) >= cfg.max_n_models: print("models full!!!! current num ", len(obj_dict.keys())) continue print("init new obj ", obj_id) if cfg.do_bg and obj_id == 0: # todo param scene_bg = sceneObject(cfg, obj_id, rgb, depth, state, bbox, twc, live_frame_id) # scene_bg.init_obj_center(intrinsic_open3d, depth, state, twc) optimiser.add_param_group({"params": scene_bg.trainer.fc_occ_map.parameters(), "lr": cfg.learning_rate, "weight_decay": cfg.weight_decay}) optimiser.add_param_group({"params": scene_bg.trainer.pe.parameters(), "lr": cfg.learning_rate, "weight_decay": cfg.weight_decay}) vis_dict.update({obj_id: scene_bg}) else: scene_obj = sceneObject(cfg, obj_id, rgb, depth, state, bbox, twc, live_frame_id) # scene_obj.init_obj_center(intrinsic_open3d, depth, state, twc) obj_dict.update({obj_id: scene_obj}) vis_dict.update({obj_id: scene_obj}) # params = [scene_obj.trainer.fc_occ_map.parameters(), scene_obj.trainer.pe.parameters()] optimiser.add_param_group({"params": scene_obj.trainer.fc_occ_map.parameters(), "lr": cfg.learning_rate, "weight_decay": cfg.weight_decay}) optimiser.add_param_group({"params": scene_obj.trainer.pe.parameters(), "lr": cfg.learning_rate, "weight_decay": cfg.weight_decay}) if cfg.training_strategy == "vmap": update_vmap_model = True fc_models.append(obj_dict[obj_id].trainer.fc_occ_map) pe_models.append(obj_dict[obj_id].trainer.pe) # ################################### # # measure trainable params in total # total_params = 0 # obj_k = obj_dict[obj_id] # for p in obj_k.trainer.fc_occ_map.parameters(): # if p.requires_grad: # total_params += p.numel() # for p in obj_k.trainer.pe.parameters(): # if p.requires_grad: # total_params += p.numel() # print("total param ", total_params) # dynamically add vmap with performance_measure(f"add vmap"): if cfg.training_strategy == "vmap" and update_vmap_model == True: fc_model, fc_param, fc_buffer = utils.update_vmap(fc_models, optimiser) pe_model, pe_param, pe_buffer = utils.update_vmap(pe_models, optimiser) update_vmap_model = False ################################################################## # training data preperation, get training data for all objs Batch_N_gt_depth = [] Batch_N_gt_rgb = [] Batch_N_depth_mask = [] Batch_N_obj_mask = [] Batch_N_input_pcs = [] Batch_N_sampled_z = [] with performance_measure(f"Sampling over {len(obj_dict.keys())} objects,"): if cfg.do_bg and scene_bg is not None: gt_rgb, gt_depth, valid_depth_mask, obj_mask, input_pcs, sampled_z \ = scene_bg.get_training_samples(cfg.n_iter_per_frame * cfg.win_size_bg, cfg.n_samples_per_frame_bg, cam_info.rays_dir_cache) bg_gt_depth = gt_depth.reshape([gt_depth.shape[0] * gt_depth.shape[1]]) bg_gt_rgb = gt_rgb.reshape([gt_rgb.shape[0] * gt_rgb.shape[1], gt_rgb.shape[2]]) bg_valid_depth_mask = 
valid_depth_mask bg_obj_mask = obj_mask bg_input_pcs = input_pcs.reshape( [input_pcs.shape[0] * input_pcs.shape[1], input_pcs.shape[2], input_pcs.shape[3]]) bg_sampled_z = sampled_z.reshape([sampled_z.shape[0] * sampled_z.shape[1], sampled_z.shape[2]]) for obj_id, obj_k in obj_dict.items(): gt_rgb, gt_depth, valid_depth_mask, obj_mask, input_pcs, sampled_z \ = obj_k.get_training_samples(cfg.n_iter_per_frame * cfg.win_size, cfg.n_samples_per_frame, cam_info.rays_dir_cache) # merge first two dims, sample_per_frame*num_per_frame Batch_N_gt_depth.append(gt_depth.reshape([gt_depth.shape[0] * gt_depth.shape[1]])) Batch_N_gt_rgb.append(gt_rgb.reshape([gt_rgb.shape[0] * gt_rgb.shape[1], gt_rgb.shape[2]])) Batch_N_depth_mask.append(valid_depth_mask) Batch_N_obj_mask.append(obj_mask) Batch_N_input_pcs.append(input_pcs.reshape([input_pcs.shape[0] * input_pcs.shape[1], input_pcs.shape[2], input_pcs.shape[3]])) Batch_N_sampled_z.append(sampled_z.reshape([sampled_z.shape[0] * sampled_z.shape[1], sampled_z.shape[2]])) # # vis sampled points in open3D # # sampled pcs # pc = open3d.geometry.PointCloud() # pc.points = open3d.utility.Vector3dVector(input_pcs.cpu().numpy().reshape(-1,3)) # open3d.visualization.draw_geometries([pc]) # rgb_np = rgb.cpu().numpy().astype(np.uint8).transpose(1,0,2) # # print("rgb ", rgb_np.shape) # # print(rgb_np) # # cv2.imshow("rgb", rgb_np) # # cv2.waitKey(1) # depth_np = depth.cpu().numpy().astype(np.float32).transpose(1,0) # twc_np = twc.cpu().numpy() # rgbd = open3d.geometry.RGBDImage.create_from_color_and_depth( # open3d.geometry.Image(rgb_np), # open3d.geometry.Image(depth_np), # depth_trunc=max_depth, # depth_scale=1, # convert_rgb_to_intensity=False, # ) # T_CW = np.linalg.inv(twc_np) # # input image pc # input_pc = open3d.geometry.PointCloud.create_from_rgbd_image( # image=rgbd, # intrinsic=intrinsic_open3d, # extrinsic=T_CW) # input_pc.points = open3d.utility.Vector3dVector(np.array(input_pc.points) - obj_k.obj_center.cpu().numpy()) # open3d.visualization.draw_geometries([pc, input_pc]) #################################################### # training assert len(Batch_N_input_pcs) > 0 # move data to GPU (n_obj, n_iter_per_frame, win_size*num_per_frame, 3) with performance_measure(f"stacking and moving to gpu: "): Batch_N_input_pcs = torch.stack(Batch_N_input_pcs).to(cfg.training_device) Batch_N_gt_depth = torch.stack(Batch_N_gt_depth).to(cfg.training_device) Batch_N_gt_rgb = torch.stack(Batch_N_gt_rgb).to(cfg.training_device) / 255. # todo Batch_N_depth_mask = torch.stack(Batch_N_depth_mask).to(cfg.training_device) Batch_N_obj_mask = torch.stack(Batch_N_obj_mask).to(cfg.training_device) Batch_N_sampled_z = torch.stack(Batch_N_sampled_z).to(cfg.training_device) if cfg.do_bg: bg_input_pcs = bg_input_pcs.to(cfg.training_device) bg_gt_depth = bg_gt_depth.to(cfg.training_device) bg_gt_rgb = bg_gt_rgb.to(cfg.training_device) / 255. bg_valid_depth_mask = bg_valid_depth_mask.to(cfg.training_device) bg_obj_mask = bg_obj_mask.to(cfg.training_device) bg_sampled_z = bg_sampled_z.to(cfg.training_device) with performance_measure(f"Training over {len(obj_dict.keys())} objects,"): for iter_step in range(cfg.n_iter_per_frame): data_idx = slice(iter_step*n_sample_per_step, (iter_step+1)*n_sample_per_step) batch_input_pcs = Batch_N_input_pcs[:, data_idx, ...] batch_gt_depth = Batch_N_gt_depth[:, data_idx, ...] batch_gt_rgb = Batch_N_gt_rgb[:, data_idx, ...] batch_depth_mask = Batch_N_depth_mask[:, data_idx, ...] batch_obj_mask = Batch_N_obj_mask[:, data_idx, ...] 
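# below: each optimisation step takes the next slice of n_sample_per_step rays per object
# from the pre-sampled batch, then runs the forward pass either as a plain per-object
# loop ("forloop") or as a single functorch-vmapped call over all object MLPs ("vmap")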
batch_sampled_z = Batch_N_sampled_z[:, data_idx, ...] if cfg.training_strategy == "forloop": # for loop training batch_alpha = [] batch_color = [] for k, obj_id in enumerate(obj_dict.keys()): obj_k = obj_dict[obj_id] embedding_k = obj_k.trainer.pe(batch_input_pcs[k]) alpha_k, color_k = obj_k.trainer.fc_occ_map(embedding_k) batch_alpha.append(alpha_k) batch_color.append(color_k) batch_alpha = torch.stack(batch_alpha) batch_color = torch.stack(batch_color) elif cfg.training_strategy == "vmap": # batched training batch_embedding = vmap(pe_model)(pe_param, pe_buffer, batch_input_pcs) batch_alpha, batch_color = vmap(fc_model)(fc_param, fc_buffer, batch_embedding) # print("batch alpha ", batch_alpha.shape) else: print("training strategy {} is not implemented ".format(cfg.training_strategy)) exit(-1) # step loss # with performance_measure(f"Batch LOSS"): batch_loss, _ = loss.step_batch_loss(batch_alpha, batch_color, batch_gt_depth.detach(), batch_gt_rgb.detach(), batch_obj_mask.detach(), batch_depth_mask.detach(), batch_sampled_z.detach()) if cfg.do_bg: bg_data_idx = slice(iter_step * n_sample_per_step_bg, (iter_step + 1) * n_sample_per_step_bg) bg_embedding = scene_bg.trainer.pe(bg_input_pcs[bg_data_idx, ...]) bg_alpha, bg_color = scene_bg.trainer.fc_occ_map(bg_embedding) bg_loss, _ = loss.step_batch_loss(bg_alpha[None, ...], bg_color[None, ...], bg_gt_depth[None, bg_data_idx, ...].detach(), bg_gt_rgb[None, bg_data_idx].detach(), bg_obj_mask[None, bg_data_idx, ...].detach(), bg_valid_depth_mask[None, bg_data_idx, ...].detach(), bg_sampled_z[None, bg_data_idx, ...].detach()) batch_loss += bg_loss # with performance_measure(f"Backward"): if AMP: scaler.scale(batch_loss).backward() scaler.step(optimiser) scaler.update() else: batch_loss.backward() optimiser.step() optimiser.zero_grad(set_to_none=True) # print("loss ", batch_loss.item()) # update each origin model params # todo find a better way # https://github.com/pytorch/functorch/issues/280 with performance_measure(f"updating vmap param"): if cfg.training_strategy == "vmap": with torch.no_grad(): for model_id, (obj_id, obj_k) in enumerate(obj_dict.items()): for i, param in enumerate(obj_k.trainer.fc_occ_map.parameters()): param.copy_(fc_param[i][model_id]) for i, param in enumerate(obj_k.trainer.pe.parameters()): param.copy_(pe_param[i][model_id]) #################################################################### # live vis mesh if (((frame_id % cfg.n_vis_iter) == 0 or frame_id == dataset_len-1) or (cfg.live_mode and time.time()-last_frame_time>cfg.keep_live_time)) and frame_id >= 10: vis3d.clear_geometries() for obj_id, obj_k in vis_dict.items(): bound = obj_k.get_bound(intrinsic_open3d) if bound is None: print("get bound failed obj ", obj_id) continue adaptive_grid_dim = int(np.minimum(np.max(bound.extent)//cfg.live_voxel_size+1, cfg.grid_dim)) mesh = obj_k.trainer.meshing(bound, obj_k.obj_center, grid_dim=adaptive_grid_dim) if mesh is None: print("meshing failed obj ", obj_id) continue # save to dir obj_mesh_output = os.path.join(log_dir, "scene_mesh") os.makedirs(obj_mesh_output, exist_ok=True) mesh.export(os.path.join(obj_mesh_output, "frame_{}_obj{}.obj".format(frame_id, str(obj_id)))) # live vis open3d_mesh = vis.trimesh_to_open3d(mesh) vis3d.add_geometry(open3d_mesh) vis3d.add_geometry(bound) # update vis3d vis3d.poll_events() vis3d.update_renderer() if False: # follow cam cam = view_ctl.convert_to_pinhole_camera_parameters() T_CW_np = np.linalg.inv(twc.cpu().numpy()) cam.extrinsic = T_CW_np 
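# apply the updated extrinsic so the Open3D viewer follows the current camera pose
# (this follow-cam block is gated behind `if False` and therefore disabled by default)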
view_ctl.convert_from_pinhole_camera_parameters(cam) vis3d.poll_events() vis3d.update_renderer() with performance_measure("saving ckpt"): if save_ckpt and ((((frame_id % cfg.n_vis_iter) == 0 or frame_id == dataset_len - 1) or (cfg.live_mode and time.time() - last_frame_time > cfg.keep_live_time)) and frame_id >= 10): for obj_id, obj_k in vis_dict.items(): ckpt_dir = os.path.join(log_dir, "ckpt", str(obj_id)) os.makedirs(ckpt_dir, exist_ok=True) bound = obj_k.get_bound(intrinsic_open3d) # update bound obj_k.save_checkpoints(ckpt_dir, frame_id) # save current cam pose cam_dir = os.path.join(log_dir, "cam_pose") os.makedirs(cam_dir, exist_ok=True) torch.save({"twc": twc,}, os.path.join(cam_dir, "twc_frame_{}".format(frame_id) + ".pth")) ================================================ FILE: trainer.py ================================================ import torch import model import embedding import render_rays import numpy as np import vis from tqdm import tqdm class Trainer: def __init__(self, cfg): self.obj_id = cfg.obj_id self.device = cfg.training_device self.hidden_feature_size = cfg.hidden_feature_size #32 for obj # 256 for iMAP, 128 for seperate bg self.obj_scale = cfg.obj_scale # 10 for bg and iMAP self.n_unidir_funcs = cfg.n_unidir_funcs self.emb_size1 = 21*(3+1)+3 self.emb_size2 = 21*(5+1)+3 - self.emb_size1 self.load_network() if self.obj_id == 0: self.bound_extent = 0.995 else: self.bound_extent = 0.9 def load_network(self): self.fc_occ_map = model.OccupancyMap( self.emb_size1, self.emb_size2, hidden_size=self.hidden_feature_size ) self.fc_occ_map.apply(model.init_weights).to(self.device) self.pe = embedding.UniDirsEmbed(max_deg=self.n_unidir_funcs, scale=self.obj_scale).to(self.device) def meshing(self, bound, obj_center, grid_dim=256): occ_range = [-1., 1.] 
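# meshing pipeline: build a regular 3D grid inside the object's oriented bound
# (normalised to roughly [-1, 1] via bound_extent), query the occupancy MLP over the
# grid, run marching cubes on the resulting occupancy volume, then re-query the
# network at the mesh vertices to obtain per-vertex colours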
range_dist = occ_range[1] - occ_range[0] scene_scale_np = bound.extent / (range_dist * self.bound_extent) scene_scale = torch.from_numpy(scene_scale_np).float().to(self.device) transform_np = np.eye(4, dtype=np.float32) transform_np[:3, 3] = bound.center transform_np[:3, :3] = bound.R # transform_np = np.linalg.inv(transform_np) # transform = torch.from_numpy(transform_np).to(self.device) grid_pc = render_rays.make_3D_grid(occ_range=occ_range, dim=grid_dim, device=self.device, scale=scene_scale, transform=transform).view(-1, 3) grid_pc -= obj_center.to(grid_pc.device) ret = self.eval_points(grid_pc) if ret is None: return None occ, _ = ret mesh = vis.marching_cubes(occ.view(grid_dim, grid_dim, grid_dim).cpu().numpy()) if mesh is None: print("marching cube failed") return None # Transform to [-1, 1] range mesh.apply_translation([-0.5, -0.5, -0.5]) mesh.apply_scale(2) # Transform to scene coordinates mesh.apply_scale(scene_scale_np) mesh.apply_transform(transform_np) vertices_pts = torch.from_numpy(np.array(mesh.vertices)).float().to(self.device) ret = self.eval_points(vertices_pts) if ret is None: return None _, color = ret mesh_color = color * 255 vertex_colors = mesh_color.detach().squeeze(0).cpu().numpy().astype(np.uint8) mesh.visual.vertex_colors = vertex_colors return mesh def eval_points(self, points, chunk_size=100000): # 256^3 = 16777216 alpha, color = [], [] n_chunks = int(np.ceil(points.shape[0] / chunk_size)) with torch.no_grad(): for k in tqdm(range(n_chunks)): # 2s/it 1000000 pts chunk_idx = slice(k * chunk_size, (k + 1) * chunk_size) embedding_k = self.pe(points[chunk_idx, ...]) alpha_k, color_k = self.fc_occ_map(embedding_k) alpha.extend(alpha_k.detach().squeeze()) color.extend(color_k.detach().squeeze()) alpha = torch.stack(alpha) color = torch.stack(color) occ = render_rays.occupancy_activation(alpha).detach() if occ.max() == 0: print("no occ") return None return (occ, color) ================================================ FILE: utils.py ================================================ import cv2 import imgviz import numpy as np import torch from functorch import combine_state_for_ensemble import open3d import queue import copy import torch.utils.dlpack class BoundingBox(): def __init__(self): super(BoundingBox, self).__init__() self.extent = None self.R = None self.center = None self.points3d = None # (8,3) def bbox_open3d2bbox(bbox_o3d): bbox = BoundingBox() bbox.extent = bbox_o3d.extent bbox.R = bbox_o3d.R bbox.center = bbox_o3d.center return bbox def bbox_bbox2open3d(bbox): bbox_o3d = open3d.geometry.OrientedBoundingBox(bbox.center, bbox.R, bbox.extent) return bbox_o3d def update_vmap(models, optimiser): fmodel, params, buffers = combine_state_for_ensemble(models) [p.requires_grad_() for p in params] optimiser.add_param_group({"params": params}) # imap b l return (fmodel, params, buffers) def enlarge_bbox(bbox, scale, w, h): assert scale >= 0 # print(bbox) min_x, min_y, max_x, max_y = bbox margin_x = int(0.5 * scale * (max_x - min_x)) margin_y = int(0.5 * scale * (max_y - min_y)) if margin_y == 0 or margin_x == 0: return None # assert margin_x != 0 # assert margin_y != 0 min_x -= margin_x max_x += margin_x min_y -= margin_y max_y += margin_y min_x = np.clip(min_x, 0, w-1) min_y = np.clip(min_y, 0, h-1) max_x = np.clip(max_x, 0, w-1) max_y = np.clip(max_y, 0, h-1) bbox_enlarged = [int(min_x), int(min_y), int(max_x), int(max_y)] return bbox_enlarged def get_bbox2d(obj_mask, bbox_scale=1.0): contours, hierarchy = cv2.findContours(obj_mask.astype(np.uint8), 
cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[ -2:] # # Find the index of the largest contour # areas = [cv2.contourArea(c) for c in contours] # max_index = np.argmax(areas) # cnt = contours[max_index] # Concatenate all contours if len(contours) == 0: return None cnt = np.concatenate(contours) x, y, w, h = cv2.boundingRect(cnt) # todo if multiple contours, choose the outmost one? # x, y, w, h = cv2.boundingRect(contours) bbox_enlarged = enlarge_bbox([x, y, x + w, y + h], scale=bbox_scale, w=obj_mask.shape[1], h=obj_mask.shape[0]) return bbox_enlarged def get_bbox2d_batch(img): b,h,w = img.shape[:3] rows = torch.any(img, axis=2) cols = torch.any(img, axis=1) rmins = torch.argmax(rows.float(), dim=1) rmaxs = h - torch.argmax(rows.float().flip(dims=[1]), dim=1) cmins = torch.argmax(cols.float(), dim=1) cmaxs = w - torch.argmax(cols.float().flip(dims=[1]), dim=1) return rmins, rmaxs, cmins, cmaxs def get_latest_queue(q): message = None while(True): try: message_latest = q.get(block=False) if message is not None: del message message = message_latest except queue.Empty: break return message # for association/tracking class InstData: def __init__(self): super(InstData, self).__init__() self.bbox3D = None self.inst_id = None # instance self.class_id = None # semantic self.pc_sample = None self.merge_cnt = 0 # merge times counting self.cmp_cnt = 0 def box_filter(masks, classes, depth, inst_dict, intrinsic_open3d, T_CW, min_pixels=500, voxel_size=0.01): bbox3d_scale = 1.0 # 1.05 inst_data = np.zeros_like(depth, dtype=np.int) for i in range(len(masks)): diff_mask = None inst_mask = masks[i] inst_id = classes[i] if inst_id == 0: continue inst_depth = np.copy(depth) inst_depth[~inst_mask] = 0. # inst_mask # proj_time = time.time() inst_pc = unproject_pointcloud(inst_depth, intrinsic_open3d, T_CW) # print("proj time ", time.time()-proj_time) if len(inst_pc.points) <= 10: # too small inst_data[inst_mask] = 0 # set to background continue if inst_id in inst_dict.keys(): candidate_inst = inst_dict[inst_id] # iou_time = time.time() IoU, indices = check_inside_ratio(inst_pc, candidate_inst.bbox3D) # print("iou time ", time.time()-iou_time) # if indices empty candidate_inst.cmp_cnt += 1 if len(indices) >= 1: candidate_inst.pc += inst_pc.select_by_index(indices) # only merge pcs inside scale*bbox # todo check indices follow valid depth valid_depth_mask = np.zeros_like(inst_depth, dtype=np.bool) valid_pc_mask = valid_depth_mask[inst_depth!=0] valid_pc_mask[indices] = True valid_depth_mask[inst_depth != 0] = valid_pc_mask valid_mask = valid_depth_mask diff_mask = np.zeros_like(inst_mask) # uv_opencv, _ = cv2.projectPoints(np.array(inst_pc.select_by_index(indices).points), T_CW[:3, :3], # T_CW[:3, 3], intrinsic_open3d.intrinsic_matrix[:3, :3], None) # uv = np.round(uv_opencv).squeeze().astype(int) # u = uv[:, 0].reshape(-1, 1) # v = uv[:, 1].reshape(-1, 1) # vu = np.concatenate([v, u], axis=-1) # valid_mask = np.zeros_like(inst_mask) # valid_mask[tuple(vu.T)] = True # # cv2.imshow("valid", (inst_depth!=0).astype(np.uint8)*255) # # cv2.waitKey(1) diff_mask[(inst_depth != 0) & (~valid_mask)] = True # cv2.imshow("diff_mask", diff_mask.astype(np.uint8) * 255) # cv2.waitKey(1) else: # merge all for scannet # print("too few pcs obj ", inst_id) inst_data[inst_mask] = -1 continue # downsample_time = time.time() # adapt_voxel_size = np.maximum(np.max(candidate_inst.bbox3D.extent)/100, 0.1) candidate_inst.pc = candidate_inst.pc.voxel_down_sample(voxel_size) # adapt_voxel_size # candidate_inst.pc = 
candidate_inst.pc.farthest_point_down_sample(500) # candidate_inst.pc = candidate_inst.pc.random_down_sample(np.minimum(len(candidate_inst.pc.points)/500.,1)) # print("downsample time ", time.time() - downsample_time) # 0.03s even # bbox_time = time.time() try: candidate_inst.bbox3D = open3d.geometry.OrientedBoundingBox.create_from_points(candidate_inst.pc.points) except RuntimeError: # print("too few pcs obj ", inst_id) inst_data[inst_mask] = -1 continue # enlarge candidate_inst.bbox3D.scale(bbox3d_scale, candidate_inst.bbox3D.get_center()) else: # new inst # init new inst and new sem new_inst = InstData() new_inst.inst_id = inst_id smaller_mask = cv2.erode(inst_mask.astype(np.uint8), np.ones((5, 5)), iterations=3).astype(bool) if np.sum(smaller_mask) < min_pixels: # print("too few pcs obj ", inst_id) inst_data[inst_mask] = 0 continue inst_depth_small = depth.copy() inst_depth_small[~smaller_mask] = 0 inst_pc_small = unproject_pointcloud(inst_depth_small, intrinsic_open3d, T_CW) new_inst.pc = inst_pc_small new_inst.pc = new_inst.pc.voxel_down_sample(voxel_size) try: inst_bbox3D = open3d.geometry.OrientedBoundingBox.create_from_points(new_inst.pc.points) except RuntimeError: # print("too few pcs obj ", inst_id) inst_data[inst_mask] = 0 continue # scale up inst_bbox3D.scale(bbox3d_scale, inst_bbox3D.get_center()) new_inst.bbox3D = inst_bbox3D # update inst_dict inst_dict.update({inst_id: new_inst}) # init new sem # update inst_data inst_data[inst_mask] = inst_id if diff_mask is not None: inst_data[diff_mask] = -1 # unsure area return inst_data def load_matrix_from_txt(path, shape=(4, 4)): with open(path) as f: txt = f.readlines() txt = ''.join(txt).replace('\n', ' ') matrix = [float(v) for v in txt.split()] return np.array(matrix).reshape(shape) def check_mask_order(obj_masks, depth_np, obj_ids): print(len(obj_masks)) print(len(obj_ids)) assert len(obj_masks) == len(obj_ids) depth = torch.from_numpy(depth_np) obj_masked_modified = copy.deepcopy(obj_masks[:]) for i in range(len(obj_masks) - 1): mask1 = obj_masks[i].float() mask1_ = obj_masked_modified[i].float() for j in range(i + 1, len(obj_masks)): mask2 = obj_masks[j].float() mask2_ = obj_masked_modified[j].float() # case 1: if they don't intersect we don't touch them if ((mask1 + mask2) == 2).sum() == 0: continue # case 2: the entire object 1 is inside of object 2, we say object 1 is in front of object 2: elif (((mask1 + mask2) == 2).float() - mask1).sum() == 0: mask2_ -= mask1_ # case 3: the entire object 2 is inside of object 1, we say object 2 is in front of object 1: elif (((mask1 + mask2) == 2).float() - mask2).sum() == 0: mask1_ -= mask2_ # case 4: use depth to check object order: else: # object 1 is closer if (depth * mask1).sum() / mask1.sum() > (depth * mask2).sum() / mask2.sum(): mask2_ -= ((mask1 + mask2) == 2).float() # object 2 is closer if (depth * mask1).sum() / mask1.sum() < (depth * mask2).sum() / mask2.sum(): mask1_ -= ((mask1 + mask2) == 2).float() final_mask = torch.zeros_like(depth, dtype=torch.int) # instance_labels = {} for i in range(len(obj_masked_modified)): final_mask = final_mask.masked_fill(obj_masked_modified[i] > 0, obj_ids[i]) # instance_labels[i] = obj_ids[i].item() return final_mask.cpu().numpy() def unproject_pointcloud(depth, intrinsic_open3d, T_CW): # depth, mask, intrinsic, extrinsic -> point clouds pc_sample = open3d.geometry.PointCloud.create_from_depth_image(depth=open3d.geometry.Image(depth), intrinsic=intrinsic_open3d, extrinsic=T_CW, depth_scale=1.0, project_valid_depth_only=True) return 
pc_sample def check_inside_ratio(pc, bbox3D): # pc, bbox3d -> inside ratio indices = bbox3D.get_point_indices_within_bounding_box(pc.points) assert len(pc.points) > 0 ratio = len(indices) / len(pc.points) # print("ratio ", ratio) return ratio, indices def track_instance(masks, classes, depth, inst_list, sem_dict, intrinsic_open3d, T_CW, IoU_thresh=0.5, voxel_size=0.1, min_pixels=2000, erode=True, clip_features=None, class_names=None): device = masks.device inst_data_dict = {} inst_data_dict.update({0: torch.zeros(depth.shape, dtype=torch.int, device=device)}) inst_ids = [] bbox3d_scale = 1.0 # todo 1.0 min_extent = 0.05 depth = torch.from_numpy(depth).to(device) for i in range(len(masks)): inst_data = torch.zeros(depth.shape, dtype=torch.int, device=device) smaller_mask = cv2.erode(masks[i].detach().cpu().numpy().astype(np.uint8), np.ones((5, 5)), iterations=3).astype(bool) inst_depth_small = depth.detach().cpu().numpy() inst_depth_small[~smaller_mask] = 0 inst_pc_small = unproject_pointcloud(inst_depth_small, intrinsic_open3d, T_CW) diff_mask = None if np.sum(smaller_mask) <= min_pixels: # too small 20 400 # todo use sem to set background inst_data[masks[i]] = 0 # set to background continue inst_pc_voxel = inst_pc_small.voxel_down_sample(voxel_size) if len(inst_pc_voxel.points) <= 10: # too small 20 400 # todo use sem to set background inst_data[masks[i]] = 0 # set to background continue is_merged = False inst_id = None inst_mask = masks[i] #smaller_mask #masks[i] # todo only inst_class = classes[i] inst_depth = depth.detach().cpu().numpy() inst_depth[~masks[i].detach().cpu().numpy()] = 0. # inst_mask inst_pc = unproject_pointcloud(inst_depth, intrinsic_open3d, T_CW) sem_inst_list = [] if clip_features is not None: # check similar sems based on clip feature distance sem_thr = 200 #300. for table #320. # 260. for sem_exist in sem_dict.keys(): if torch.abs(clip_features[class_names[inst_class]] - clip_features[class_names[sem_exist]]).sum() < sem_thr: sem_inst_list.extend(sem_dict[sem_exist]) else: # no clip features, only do strictly sem check if inst_class in sem_dict.keys(): sem_inst_list.extend(sem_dict[inst_class]) for candidate_inst in sem_inst_list: # if True: # only consider 3D bbox, merge them if they are spatial together IoU, indices = check_inside_ratio(inst_pc, candidate_inst.bbox3D) candidate_inst.cmp_cnt += 1 if IoU > IoU_thresh: # merge inst to candidate is_merged = True candidate_inst.merge_cnt += 1 candidate_inst.pc += inst_pc.select_by_index(indices) # inst_uv = inst_pc.select_by_index(indices).project_to_depth_image(masks[i].shape[1], masks[i].shape[0], intrinsic_open3d, T_CW, depth_scale=1.0, depth_max=10.0) # # inst_uv = torch.utils.dlpack.from_dlpack(uv_opencv.as_tensor().to_dlpack()) # valid_mask = inst_uv.squeeze() > 0. # shape --> H, W # diff_mask = (inst_depth > 0.) 
                diff_mask = torch.zeros_like(inst_mask)
                uv_opencv, _ = cv2.projectPoints(np.array(inst_pc.select_by_index(indices).points),
                                                 T_CW[:3, :3], T_CW[:3, 3],
                                                 intrinsic_open3d.intrinsic_matrix[:3, :3], None)
                uv = np.round(uv_opencv).squeeze().astype(int)
                u = uv[:, 0].reshape(-1, 1)
                v = uv[:, 1].reshape(-1, 1)
                vu = np.concatenate([v, u], axis=-1)
                valid_mask = np.zeros(inst_mask.shape, dtype=bool)  # np.bool is removed in recent NumPy; use builtin bool
                valid_mask[tuple(vu.T)] = True
                diff_mask[(inst_depth != 0) & (~valid_mask)] = True
                # downsample pcs
                candidate_inst.pc = candidate_inst.pc.voxel_down_sample(voxel_size)
                # candidate_inst.pc.random_down_sample(np.minimum(500//len(candidate_inst.pc.points),1))
                candidate_inst.bbox3D = open3d.geometry.OrientedBoundingBox.create_from_points(candidate_inst.pc.points)
                # enlarge
                candidate_inst.bbox3D.scale(bbox3d_scale, candidate_inst.bbox3D.get_center())
                candidate_inst.bbox3D.extent = np.maximum(candidate_inst.bbox3D.extent, min_extent)  # at least bigger than min_extent
                inst_id = candidate_inst.inst_id
                break
            # if candidate_inst.cmp_cnt >= 20 and candidate_inst.merge_cnt <= 5:
            #     sem_inst_list.remove(candidate_inst)
        if not is_merged:
            # init new inst and new sem
            new_inst = InstData()
            new_inst.inst_id = len(inst_list) + 1
            new_inst.class_id = inst_class
            new_inst.pc = inst_pc_small
            new_inst.pc = new_inst.pc.voxel_down_sample(voxel_size)
            inst_bbox3D = open3d.geometry.OrientedBoundingBox.create_from_points(new_inst.pc.points)
            # scale up
            inst_bbox3D.scale(bbox3d_scale, inst_bbox3D.get_center())
            inst_bbox3D.extent = np.maximum(inst_bbox3D.extent, min_extent)
            new_inst.bbox3D = inst_bbox3D
            inst_list.append(new_inst)
            inst_id = new_inst.inst_id
            # update sem_dict
            if inst_class in sem_dict.keys():
                sem_dict[inst_class].append(new_inst)  # append new inst to exist sem
            else:
                sem_dict.update({inst_class: [new_inst]})  # init new sem
        # update inst_data
        inst_data[inst_mask] = inst_id
        if diff_mask is not None:
            inst_data[diff_mask] = -1  # unsure area
        if inst_id not in inst_ids:
            inst_data_dict.update({inst_id: inst_data})
        else:
            continue
            # idx = inst_ids.index(inst_id)
            # inst_data_list[idx] = inst_data_list[idx] & torch.from_numpy(inst_data)  # merge them? todo

    # return inst_data
    mask_bg = torch.stack(list(inst_data_dict.values())).sum(0) != 0
    inst_data_dict.update({0: mask_bg.int()})
    return inst_data_dict
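A minimal usage sketch (not part of the repository) for the check_mask_order helper defined above, assuming the function is in scope; the masks, depth values and object ids below are made up purely for illustration.

import numpy as np
import torch

# two overlapping instance masks; the second object lies fully inside the first
# and is closer to the camera in the depth map
depth_np = np.full((8, 8), 2.0, dtype=np.float32)
depth_np[2:6, 2:6] = 1.0                  # the closer object's surface
mask1 = torch.zeros(8, 8)
mask1[1:7, 1:7] = 1                       # far object
mask2 = torch.zeros(8, 8)
mask2[2:6, 2:6] = 1                       # near object, fully contained in mask1

inst_map = check_mask_order([mask1, mask2], depth_np, obj_ids=[1, 2])
# mask2 is fully contained in mask1, so object 2 is treated as being in front:
# the overlap is labelled 2 (inst_map[3, 3] == 2) and the rim keeps label 1 (inst_map[1, 1] == 1)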

================================================
FILE: vis.py
================================================
import skimage.measure
import trimesh
import open3d as o3d
import numpy as np


def marching_cubes(occupancy, level=0.5):
    try:
        vertices, faces, vertex_normals, _ = skimage.measure.marching_cubes(  # marching_cubes_lewiner(  # marching_cubes(
            occupancy,
            level=level,
            gradient_direction='ascent')
    except (RuntimeError, ValueError):
        return None

    dim = occupancy.shape[0]
    vertices = vertices / (dim - 1)
    mesh = trimesh.Trimesh(vertices=vertices,
                           vertex_normals=vertex_normals,
                           faces=faces)
    return mesh


def trimesh_to_open3d(src):
    dst = o3d.geometry.TriangleMesh()
    dst.vertices = o3d.utility.Vector3dVector(src.vertices)
    dst.triangles = o3d.utility.Vector3iVector(src.faces)
    vertex_colors = src.visual.vertex_colors[:, :3].astype(float) / 255.0  # np.float was removed in NumPy >= 1.24
    dst.vertex_colors = o3d.utility.Vector3dVector(vertex_colors)
    dst.compute_vertex_normals()
    return dst
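A minimal sketch (not from the repository) of how the two helpers above might be chained to visualise an occupancy grid; the spherical 64^3 test volume is an assumption made purely for illustration.

import numpy as np
import open3d as o3d
from vis import marching_cubes, trimesh_to_open3d

# build a synthetic occupancy grid containing a sphere of radius 0.5
xx, yy, zz = np.meshgrid(*[np.linspace(-1, 1, 64)] * 3, indexing="ij")
occupancy = (np.sqrt(xx ** 2 + yy ** 2 + zz ** 2) < 0.5).astype(np.float32)

mesh_tri = marching_cubes(occupancy, level=0.5)   # trimesh.Trimesh, or None on failure
if mesh_tri is not None:
    mesh_o3d = trimesh_to_open3d(mesh_tri)        # o3d.geometry.TriangleMesh with vertex colours
    o3d.visualization.draw_geometries([mesh_o3d])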

================================================
FILE: vmap.py
================================================
import random
import numpy as np
import torch
from time import perf_counter_ns
from tqdm import tqdm
import trainer
import open3d
import trimesh
import scipy
from bidict import bidict
import copy
import os
import utils


class performance_measure:
    def __init__(self, name) -> None:
        self.name = name

    def __enter__(self):
        self.start_time = perf_counter_ns()

    def __exit__(self, type, value, tb):
        self.end_time = perf_counter_ns()
        self.exec_time = self.end_time - self.start_time
        print(f"{self.name} execution time: {(self.exec_time)/1000000:.2f} ms")


def origin_dirs_W(T_WC, dirs_C):
    assert T_WC.shape[0] == dirs_C.shape[0]
    assert T_WC.shape[1:] == (4, 4)
    assert dirs_C.shape[2] == 3

    dirs_W = (T_WC[:, None, :3, :3] @ dirs_C[..., None]).squeeze()
    origins = T_WC[:, :3, -1]

    return origins, dirs_W


# @torch.jit.script
def stratified_bins(min_depth, max_depth, n_bins, n_rays, type=torch.float32, device="cuda:0"):
    # type: (Tensor, Tensor, int, int) -> Tensor

    bin_limits_scale = torch.linspace(0, 1, n_bins + 1, dtype=type, device=device)

    if not torch.is_tensor(min_depth):
        min_depth = torch.ones(n_rays, dtype=type, device=device) * min_depth
    if not torch.is_tensor(max_depth):
        max_depth = torch.ones(n_rays, dtype=type, device=device) * max_depth

    depth_range = max_depth - min_depth
    lower_limits_scale = depth_range[..., None] * bin_limits_scale + min_depth[..., None]
    lower_limits_scale = lower_limits_scale[:, :-1]
    assert lower_limits_scale.shape == (n_rays, n_bins)

    bin_length_scale = depth_range / n_bins
    increments_scale = torch.rand(
        n_rays, n_bins, device=device, dtype=torch.float32) * bin_length_scale[..., None]

    z_vals_scale = lower_limits_scale + increments_scale
    assert z_vals_scale.shape == (n_rays, n_bins)

    return z_vals_scale


# @torch.jit.script
def normal_bins_sampling(depth, n_bins, n_rays, delta, device="cuda:0"):
    # type: (Tensor, int, int, float) -> Tensor

    # device = "cpu"
    # bins = torch.normal(0.0, delta / 3., size=[n_rays, n_bins], device=device).sort().values
    bins = torch.empty(n_rays, n_bins, dtype=torch.float32, device=device).normal_(mean=0., std=delta / 3.).sort().values
    bins = torch.clip(bins, -delta, delta)
    z_vals = depth[:, None] + bins
    assert z_vals.shape == (n_rays, n_bins)

    return z_vals


class sceneObject:
    """
    Object instance mapping: update keyframes, get training samples, optimize the MLP map.
    """

    def __init__(self, cfg, obj_id, rgb: torch.tensor, depth: torch.tensor, mask: torch.tensor,
                 bbox_2d: torch.tensor, t_wc: torch.tensor, live_frame_id) -> None:
        self.do_bg = cfg.do_bg
        self.obj_id = obj_id
        self.data_device = cfg.data_device
        self.training_device = cfg.training_device

        assert rgb.shape[:2] == depth.shape
        assert rgb.shape[:2] == mask.shape
        assert bbox_2d.shape == (4,)
        assert t_wc.shape == (4, 4,)

        if self.do_bg and self.obj_id == 0:  # do separate bg
            self.obj_scale = cfg.bg_scale
            self.hidden_feature_size = cfg.hidden_feature_size_bg
            self.n_bins_cam2surface = cfg.n_bins_cam2surface_bg
            self.keyframe_step = cfg.keyframe_step_bg
        else:
            self.obj_scale = cfg.obj_scale
            self.hidden_feature_size = cfg.hidden_feature_size
            self.n_bins_cam2surface = cfg.n_bins_cam2surface
            self.keyframe_step = cfg.keyframe_step

        self.frames_width = rgb.shape[0]
        self.frames_height = rgb.shape[1]

        self.min_bound = cfg.min_depth
        self.max_bound = cfg.max_depth
        self.n_bins = cfg.n_bins
        self.n_unidir_funcs = cfg.n_unidir_funcs

        self.surface_eps = cfg.surface_eps
        self.stop_eps = cfg.stop_eps

        self.n_keyframes = 1  # Number of keyframes
        self.kf_pointer = None
        self.keyframe_buffer_size = cfg.keyframe_buffer_size
        self.kf_id_dict = bidict({live_frame_id: 0})
        self.kf_buffer_full = False
        self.frame_cnt = 0  # number of frames taken in
        self.lastest_kf_queue = []

        self.bbox = torch.empty(  # obj bounding box in the frame
            self.keyframe_buffer_size, 4,
            device=self.data_device)  # [u low, u high, v low, v high]
        self.bbox[0] = bbox_2d

        # RGB + pixel state batch
        self.rgb_idx = slice(0, 3)
        self.state_idx = slice(3, 4)
        self.rgbs_batch = torch.empty(self.keyframe_buffer_size,
                                      self.frames_width,
                                      self.frames_height,
                                      4,
                                      dtype=torch.uint8,
                                      device=self.data_device)

        # Pixel states:
        self.other_obj = 0    # pixel doesn't belong to obj
        self.this_obj = 1     # pixel belongs to obj
        self.unknown_obj = 2  # pixel state is unknown

        # Initialize first frame rgb and pixel state
        self.rgbs_batch[0, :, :, self.rgb_idx] = rgb
        self.rgbs_batch[0, :, :, self.state_idx] = mask[..., None]

        self.depth_batch = torch.empty(self.keyframe_buffer_size,
                                       self.frames_width,
                                       self.frames_height,
                                       dtype=torch.float32,
                                       device=self.data_device)

        # Initialize first frame's depth
        self.depth_batch[0] = depth

        self.t_wc_batch = torch.empty(
            self.keyframe_buffer_size, 4, 4,
            dtype=torch.float32,
            device=self.data_device)  # world to camera transform

        # Initialize first frame's world2cam transform
        self.t_wc_batch[0] = t_wc

        # neural field map
        trainer_cfg = copy.deepcopy(cfg)
        trainer_cfg.obj_id = self.obj_id
        trainer_cfg.hidden_feature_size = self.hidden_feature_size
        trainer_cfg.obj_scale = self.obj_scale
        self.trainer = trainer.Trainer(trainer_cfg)

        # 3D boundary
        self.bbox3d = None
        self.pc = []

        # init obj local frame
        # self.obj_center = self.init_obj_center(intrinsic, depth, mask, t_wc)
        self.obj_center = torch.tensor(0.0)  # shouldn't make any difference because of frequency embedding

    def init_obj_center(self, intrinsic_open3d, depth, mask, t_wc):
        obj_depth = depth.cpu().clone()
        obj_depth[mask != self.this_obj] = 0
        T_CW = np.linalg.inv(t_wc.cpu().numpy())
        pc_obj_init = open3d.geometry.PointCloud.create_from_depth_image(
            depth=open3d.geometry.Image(np.asarray(obj_depth.permute(1, 0).numpy(), order="C")),
            intrinsic=intrinsic_open3d,
            extrinsic=T_CW,
            depth_trunc=self.max_bound,
            depth_scale=1.0)
        obj_center = torch.from_numpy(np.mean(pc_obj_init.points, axis=0)).float()
        return obj_center

    # @profile
    def append_keyframe(self, rgb: torch.tensor, depth: torch.tensor, mask: torch.tensor,
                        bbox_2d: torch.tensor, t_wc: torch.tensor, frame_id: np.uint8 = 1):
        assert rgb.shape[:2] == depth.shape
        assert rgb.shape[:2] == mask.shape
        assert bbox_2d.shape == (4,)
        assert t_wc.shape == (4, 4,)
        assert self.n_keyframes <= self.keyframe_buffer_size - 1
        assert rgb.dtype == torch.uint8
        assert mask.dtype == torch.uint8
        assert depth.dtype == torch.float32

        # every kf_step choose one kf
        is_kf = (self.frame_cnt % self.keyframe_step == 0) or self.n_keyframes == 1
        # print("---------------------")
        # print("self.kf_id_dict ", self.kf_id_dict)
        # print("live frame id ", frame_id)
        # print("n_frames ", self.n_keyframes)
        if self.n_keyframes == self.keyframe_buffer_size - 1:
            # kf buffer full, need to prune
            self.kf_buffer_full = True
            if self.kf_pointer is None:
                self.kf_pointer = self.n_keyframes

            self.rgbs_batch[self.kf_pointer, :, :, self.rgb_idx] = rgb
            self.rgbs_batch[self.kf_pointer, :, :, self.state_idx] = mask[..., None]
            self.depth_batch[self.kf_pointer, ...] = depth
            self.t_wc_batch[self.kf_pointer, ...] = t_wc
            self.bbox[self.kf_pointer, ...] = bbox_2d
            self.kf_id_dict.inv[self.kf_pointer] = frame_id

            if is_kf:
                self.lastest_kf_queue.append(self.kf_pointer)
                pruned_frame_id, pruned_kf_id = self.prune_keyframe()
                self.kf_pointer = pruned_kf_id
                print("pruned kf id ", self.kf_pointer)
        else:
            if not is_kf:  # not kf, replace
                self.rgbs_batch[self.n_keyframes - 1, :, :, self.rgb_idx] = rgb
                self.rgbs_batch[self.n_keyframes - 1, :, :, self.state_idx] = mask[..., None]
                self.depth_batch[self.n_keyframes - 1, ...] = depth
                self.t_wc_batch[self.n_keyframes - 1, ...] = t_wc
                self.bbox[self.n_keyframes - 1, ...] = bbox_2d
                self.kf_id_dict.inv[self.n_keyframes - 1] = frame_id
            else:  # is kf, add new kf
                self.kf_id_dict[frame_id] = self.n_keyframes
                self.rgbs_batch[self.n_keyframes, :, :, self.rgb_idx] = rgb
                self.rgbs_batch[self.n_keyframes, :, :, self.state_idx] = mask[..., None]
                self.depth_batch[self.n_keyframes, ...] = depth
                self.t_wc_batch[self.n_keyframes, ...] = t_wc
                self.bbox[self.n_keyframes, ...] = bbox_2d
                self.lastest_kf_queue.append(self.n_keyframes)
                self.n_keyframes += 1

        # print("self.kf_id_dic ", self.kf_id_dict)
        self.frame_cnt += 1
        if len(self.lastest_kf_queue) > 2:  # keep latest two frames
            self.lastest_kf_queue = self.lastest_kf_queue[-2:]

    def prune_keyframe(self):
        # simple strategy to prune, randomly choose
        key, value = random.choice(list(self.kf_id_dict.items())[:-2])  # do not prune latest two frames
        return key, value

    def get_bound(self, intrinsic_open3d):
        # get 3D boundary from posed depth img   todo update sparse pc when append frame
        pcs = open3d.geometry.PointCloud()
        for kf_id in range(self.n_keyframes):
            mask = self.rgbs_batch[kf_id, :, :, self.state_idx].squeeze() == self.this_obj
            depth = self.depth_batch[kf_id].cpu().clone()
            twc = self.t_wc_batch[kf_id].cpu().numpy()
            depth[~mask] = 0
            depth = depth.permute(1, 0).numpy().astype(np.float32)
            T_CW = np.linalg.inv(twc)
            pc = open3d.geometry.PointCloud.create_from_depth_image(depth=open3d.geometry.Image(np.asarray(depth, order="C")),
                                                                    intrinsic=intrinsic_open3d,
                                                                    extrinsic=T_CW)
            # self.pc += pc
            pcs += pc

        # # get minimal oriented 3d bbox
        # try:
        #     bbox3d = open3d.geometry.OrientedBoundingBox.create_from_points(pcs.points)
        # except RuntimeError:
        #     print("too few pcs obj ")
        #     return None

        # trimesh has a better minimal bbox implementation than open3d
        try:
            transform, extents = trimesh.bounds.oriented_bounds(np.array(pcs.points))  # pc
            transform = np.linalg.inv(transform)
        except scipy.spatial._qhull.QhullError:
            print("too few pcs obj ")
            return None

        for i in range(extents.shape[0]):
            extents[i] = np.maximum(extents[i], 0.10)  # at least rendering 10cm
        bbox = utils.BoundingBox()
        bbox.center = transform[:3, 3]
        bbox.R = transform[:3, :3]
        bbox.extent = extents
        bbox3d = open3d.geometry.OrientedBoundingBox(bbox.center, bbox.R, bbox.extent)
        min_extent = 0.05
        bbox3d.extent = np.maximum(min_extent, bbox3d.extent)
        bbox3d.color = (255, 0, 0)
        self.bbox3d = utils.bbox_open3d2bbox(bbox_o3d=bbox3d)
        # self.pc = []
        print("obj ", self.obj_id)
        print("bound ", bbox3d)
        print("kf id dict ", self.kf_id_dict)
        # open3d.visualization.draw_geometries([bbox3d, pcs])
        return bbox3d

    def get_training_samples(self, n_frames, n_samples, cached_rays_dir):
        # Sample pixels
        if self.n_keyframes > 2:  # make sure latest 2 frames are sampled   todo if kf pruned, this is not the latest frame
            keyframe_ids = torch.randint(low=0,
                                         high=self.n_keyframes,
                                         size=(n_frames - 2,),
                                         dtype=torch.long,
                                         device=self.data_device)
            # if self.kf_buffer_full:
            # latest_frame_ids = list(self.kf_id_dict.values())[-2:]
            latest_frame_ids = self.lastest_kf_queue[-2:]
            keyframe_ids = torch.cat([keyframe_ids,
                                      torch.tensor(latest_frame_ids, device=keyframe_ids.device)])
            # print("latest_frame_ids", latest_frame_ids)
            # else:   # sample last 2 frames
            #     keyframe_ids = torch.cat([keyframe_ids,
            #                               torch.tensor([self.n_keyframes-2, self.n_keyframes-1], device=keyframe_ids.device)])
        else:
            keyframe_ids = torch.randint(low=0,
                                         high=self.n_keyframes,
                                         size=(n_frames,),
                                         dtype=torch.long,
                                         device=self.data_device)
        keyframe_ids = torch.unsqueeze(keyframe_ids, dim=-1)
        idx_w = torch.rand(n_frames, n_samples, device=self.data_device)
        idx_h = torch.rand(n_frames, n_samples, device=self.data_device)

        # resizing idx_w and idx_h to be in the bbox range
        idx_w = idx_w * (self.bbox[keyframe_ids, 1] - self.bbox[keyframe_ids, 0]) + self.bbox[keyframe_ids, 0]
        idx_h = idx_h * (self.bbox[keyframe_ids, 3] - self.bbox[keyframe_ids, 2]) + self.bbox[keyframe_ids, 2]

        idx_w = idx_w.long()
        idx_h = idx_h.long()
        sampled_rgbs = self.rgbs_batch[keyframe_ids, idx_w, idx_h]
        sampled_depth = self.depth_batch[keyframe_ids, idx_w, idx_h]

        # Get ray directions for sampled pixels
        sampled_ray_dirs = cached_rays_dir[idx_w, idx_h]

        # Get sampled keyframe poses
        sampled_twc = self.t_wc_batch[keyframe_ids[:, 0], :, :]

        origins, dirs_w = origin_dirs_W(sampled_twc, sampled_ray_dirs)

        return self.sample_3d_points(sampled_rgbs, sampled_depth, origins, dirs_w)

    def sample_3d_points(self, sampled_rgbs, sampled_depth, origins, dirs_w):
        """
        3D sampling strategy

        * For pixels with invalid depth:
            - N+M from minimum bound to max (stratified)

        * For pixels with valid depth:
            # Pixels that belong to this object
                - N from cam to surface (stratified)
                - M around surface (stratified/normal)
            # Pixels that don't belong to this object
                - N from cam to surface (stratified)
                - M around surface (stratified)
            # Pixels with unknown state
                - Do nothing!
        """
        n_bins_cam2surface = self.n_bins_cam2surface
        n_bins = self.n_bins
        eps = self.surface_eps
        other_objs_max_eps = self.stop_eps  # 0.05  # todo 0.02
        # print("max depth ", torch.max(sampled_depth))
        sampled_z = torch.zeros(
            sampled_rgbs.shape[0] * sampled_rgbs.shape[1],
            n_bins_cam2surface + n_bins,
            dtype=self.depth_batch.dtype,
            device=self.data_device)  # shape (N*n_rays, n_bins_cam2surface + n_bins)
        invalid_depth_mask = (sampled_depth <= self.min_bound).view(-1)
        # max_bound = self.max_bound
        max_bound = torch.max(sampled_depth)
        # sampling for points with invalid depth
        invalid_depth_count = invalid_depth_mask.count_nonzero()
        if invalid_depth_count:
            sampled_z[invalid_depth_mask, :] = stratified_bins(
                self.min_bound, max_bound,
                n_bins_cam2surface + n_bins, invalid_depth_count,
                device=self.data_device)

        # sampling for valid depth rays
        valid_depth_mask = ~invalid_depth_mask
        valid_depth_count = valid_depth_mask.count_nonzero()

        if valid_depth_count:
            # Sample between min bound and depth for all pixels with valid depth
            sampled_z[valid_depth_mask, :n_bins_cam2surface] = stratified_bins(
                self.min_bound,
                sampled_depth.view(-1)[valid_depth_mask] - eps,
                n_bins_cam2surface, valid_depth_count,
                device=self.data_device)

            # sampling around depth for this object
            obj_mask = (sampled_rgbs[..., -1] == self.this_obj).view(-1) & valid_depth_mask  # todo obj_mask
            assert sampled_z.shape[0] == obj_mask.shape[0]
            obj_count = obj_mask.count_nonzero()

            if obj_count:
                sampling_method = "normal"  # stratified or normal
                if sampling_method == "stratified":
                    sampled_z[obj_mask, n_bins_cam2surface:] = stratified_bins(
                        sampled_depth.view(-1)[obj_mask] - eps,
                        sampled_depth.view(-1)[obj_mask] + eps,
                        n_bins, obj_count,
                        device=self.data_device)
                elif sampling_method == "normal":
                    sampled_z[obj_mask, n_bins_cam2surface:] = normal_bins_sampling(
                        sampled_depth.view(-1)[obj_mask],
                        n_bins, obj_count,
                        delta=eps,
                        device=self.data_device)
                else:
                    raise ValueError(
                        f"sampling method not implemented {sampling_method}, "
                        f"stratified and normal sampling only currently implemented.")
            # sampling around depth of other objects
            other_obj_mask = (sampled_rgbs[..., -1] != self.this_obj).view(-1) & valid_depth_mask
            other_objs_count = other_obj_mask.count_nonzero()
            if other_objs_count:
                sampled_z[other_obj_mask, n_bins_cam2surface:] = stratified_bins(
                    sampled_depth.view(-1)[other_obj_mask] - eps,
                    sampled_depth.view(-1)[other_obj_mask] + other_objs_max_eps,
                    n_bins, other_objs_count,
                    device=self.data_device)

        sampled_z = sampled_z.view(sampled_rgbs.shape[0],
                                   sampled_rgbs.shape[1],
                                   -1)  # view as (n_rays, n_samples, 10)
        input_pcs = origins[..., None, None, :] + (dirs_w[:, :, None, :] * sampled_z[..., None])
        input_pcs -= self.obj_center
        obj_labels = sampled_rgbs[..., -1].view(-1)
        return sampled_rgbs[..., :3], sampled_depth, valid_depth_mask, obj_labels, input_pcs, sampled_z

    def save_checkpoints(self, path, epoch):
        obj_id = self.obj_id
        checkpoint_load_file = (path + "/obj_" + str(obj_id) + "_frame_" + str(epoch) + ".pth")

        torch.save(
            {
                "epoch": epoch,
                "FC_state_dict": self.trainer.fc_occ_map.state_dict(),
                "PE_state_dict": self.trainer.pe.state_dict(),
                "obj_id": self.obj_id,
                "bbox": self.bbox3d,
                "obj_scale": self.trainer.obj_scale
            },
            checkpoint_load_file,
        )
        # optimiser?

    def load_checkpoints(self, ckpt_file):
        checkpoint_load_file = (ckpt_file)
        if not os.path.exists(checkpoint_load_file):
            print("ckpt not exist ", checkpoint_load_file)
            return
        checkpoint = torch.load(checkpoint_load_file)
        self.trainer.fc_occ_map.load_state_dict(checkpoint["FC_state_dict"])
        self.trainer.pe.load_state_dict(checkpoint["PE_state_dict"])
        self.obj_id = checkpoint["obj_id"]
        self.bbox3d = checkpoint["bbox"]
        self.trainer.obj_scale = checkpoint["obj_scale"]

        self.trainer.fc_occ_map.to(self.training_device)
        self.trainer.pe.to(self.training_device)


class cameraInfo:
    def __init__(self, cfg) -> None:
        self.device = cfg.data_device
        self.width = cfg.W    # Frame width
        self.height = cfg.H   # Frame height

        self.fx = cfg.fx
        self.fy = cfg.fy
        self.cx = cfg.cx
        self.cy = cfg.cy

        self.rays_dir_cache = self.get_rays_dirs()

    def get_rays_dirs(self, depth_type="z"):
        idx_w = torch.arange(end=self.width, device=self.device)
        idx_h = torch.arange(end=self.height, device=self.device)

        dirs = torch.ones((self.width, self.height, 3), device=self.device)

        dirs[:, :, 0] = ((idx_w - self.cx) / self.fx)[:, None]
        dirs[:, :, 1] = ((idx_h - self.cy) / self.fy)

        if depth_type == "euclidean":
            raise Exception(
                "Get camera rays directions with euclidean depth not yet implemented")
            # unreachable: leftover normalisation sketch for euclidean depth
            norm = torch.norm(dirs, dim=-1)
            dirs = dirs * (1. / norm)[:, :, :, None]

        return dirs
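A minimal sketch (not from the repository) of how the ray helpers in vmap.py fit together: cameraInfo caches per-pixel ray directions, origin_dirs_W rotates a batch of them into the world frame, and stratified_bins draws depth samples along each ray. The SimpleNamespace config and all numbers below are hypothetical stand-ins for the real cfg object.

from types import SimpleNamespace
import torch
from vmap import cameraInfo, origin_dirs_W, stratified_bins

cfg = SimpleNamespace(W=320, H=240, fx=300.0, fy=300.0, cx=160.0, cy=120.0,
                      data_device="cpu")                # assumed fields only

cam = cameraInfo(cfg)
rays = cam.rays_dir_cache                               # (W, H, 3), z-depth convention

n_rays = 16
dirs_C = rays[:n_rays, 0, :].unsqueeze(0)               # (1, n_rays, 3) rays from one frame
T_WC = torch.eye(4).unsqueeze(0)                        # (1, 4, 4) identity camera pose
origins, dirs_W = origin_dirs_W(T_WC, dirs_C)           # origins (1, 3); dirs_W squeezed to (n_rays, 3)

z_vals = stratified_bins(0.1, 5.0, n_bins=10, n_rays=n_rays, device="cpu")  # (n_rays, 10)
points_w = origins.view(1, 1, 3) + dirs_W[:, None, :] * z_vals[..., None]   # (n_rays, 10, 3) world-space samples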