[
  {
    "path": ".gitignore",
    "content": "/.anaconda3/\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n.python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\nHierarchicalPriors.sublime-project\nHierarchicalPriors.sublime-workspace\nscripts/res1/\nscripts/results/\nscripts/traj.txt\nvids_ims\nres_datasets/\nresults/\nScenePriors/train/examples/experiments/\n\n*.pkl\n*.idea\n"
  },
  {
    "path": "LICENSE",
    "content": "vMAP SOFTWARE\n\nLICENCE AGREEMENT\n\nWE (Imperial College of Science, Technology and Medicine, (“Imperial College\nLondon”)) ARE WILLING TO LICENSE THIS SOFTWARE TO YOU (a licensee “You”) ONLY\nON THE CONDITION THAT YOU ACCEPT ALL OF THE TERMS CONTAINED IN THE FOLLOWING\nAGREEMENT. PLEASE READ THE AGREEMENT CAREFULLY BEFORE DOWNLOADING THE SOFTWARE.\nBY EXERCISING THE OPTION TO DOWNLOAD THE SOFTWARE YOU AGREE TO BE BOUND BY THE\nTERMS OF THE AGREEMENT.\n\nSOFTWARE LICENCE AGREEMENT (EXCLUDING BSD COMPONENTS)\n\n1.This Agreement pertains to a worldwide, non-exclusive, temporary, fully\npaid-up, royalty free, non-transferable, non-sub- licensable licence (the\n“Licence”) to use the elastic fusion source code, including any modification,\npart or derivative (the “Software”).\n\nOwnership and Licence. Your rights to use and download the Software onto your\ncomputer, and all other copies that You are authorised to make, are specified\nin this Agreement. However, we (or our licensors) retain all rights, including\nbut not limited to all copyright and other intellectual property rights\nanywhere in the world, in the Software not expressly granted to You in this\nAgreement.\n\n2. Permitted use of the Licence:\n\n(a) You may download and install the Software onto one computer or server for\nuse in accordance with Clause 2(b) of this Agreement provided that You ensure\nthat the Software is not accessible by other users unless they have themselves\naccepted the terms of this licence agreement.\n\n(b) You may use the Software solely for non-commercial, internal  or academic\nresearch purposes and only in accordance with the terms of this Agreement. You\nmay not use the Software for commercial purposes, including but not limited to\n(1) integration of all or part of the source code or the Software into a\nproduct for sale or licence by or on behalf of You to third parties or (2) use\nof the Software or any derivative of it for research to develop software\nproducts for sale or licence to a third party or (3) use of the Software or any\nderivative of it for research to develop non-software products for sale or\nlicence to a third party, or (4) use of the Software to provide any service to\nan external organisation for which payment is received.\n\nShould You wish to use the Software for commercial purposes, You shall\nemail researchcontracts.engineering@imperial.ac.uk .\n\n(c) Right to Copy. You may copy the Software for back-up and archival purposes,\nprovided that each copy is kept in your possession and provided You reproduce\nour copyright notice (set out in Schedule 1) on each copy.\n\n(d) Transfer and sub-licensing. You may not rent, lend, or lease the Software\nand You may not transmit, transfer or sub-license this licence to use the\nSoftware or any of your rights or obligations under this Agreement to another\nparty.\n\n(e) Identity of Licensee. The licence granted herein is personal to You. You\nshall not permit any third party to access, modify or otherwise use the\nSoftware nor shall You access modify or otherwise use the Software on behalf of\nany third party. If You wish to obtain a licence for mutiple users or a site\nlicence for the Software please contact us\nat researchcontracts.engineering@imperial.ac.uk .\n\n(f) Publications and presentations. 
You may make public, results or data\nobtained from, dependent on or arising from research carried out using the\nSoftware, provided that any such presentation or publication identifies the\nSoftware as the source of the results or the data, including the Copyright\nNotice given in each element of the Software, and stating that the Software has\nbeen made available for use by You under licence from Imperial College London\nand You provide a copy of any such publication to Imperial College London.\n\n3. Prohibited Uses. You may not, without written permission from us\nat researchcontracts.engineering@imperial.ac.uk :\n\n(a) Use, copy, modify, merge, or transfer copies of the Software or any\ndocumentation provided by us which relates to the Software except as provided\nin this Agreement;\n\n(b) Use any back-up or archival copies of the Software (or allow anyone else to\nuse such copies) for any purpose other than to replace the original copy in the\nevent it is destroyed or becomes defective; or\n\n(c) Disassemble, decompile or \"unlock\", reverse translate, or in any manner\ndecode the Software for any reason.\n\n4. Warranty Disclaimer\n\n(a) Disclaimer. The Software has been developed for research purposes only. You\nacknowledge that we are providing the Software to You under this licence\nagreement free of charge and on condition that the disclaimer set out below\nshall apply. We do not represent or warrant that the Software as to: (i) the\nquality, accuracy or reliability of the Software; (ii) the suitability of the\nSoftware for any particular use or for use under any specific conditions; and\n(iii) whether use of the Software will infringe third-party rights.\n\nYou acknowledge that You have reviewed and evaluated the Software to determine\nthat it meets your needs and that You assume all responsibility and liability\nfor determining the suitability of the Software as fit for your particular\npurposes and requirements. Subject to Clause 4(b), we exclude and expressly\ndisclaim all express and implied representations, warranties, conditions and\nterms not stated herein (including the implied conditions or warranties of\nsatisfactory quality, merchantable quality, merchantability and fitness for\npurpose).\n\n(b) Savings. Some jurisdictions may imply warranties, conditions or terms or\nimpose obligations upon us which cannot, in whole or in part, be excluded,\nrestricted or modified or otherwise do not allow the exclusion of implied\nwarranties, conditions or terms, in which case the above warranty disclaimer\nand exclusion will only apply to You to the extent permitted in the relevant\njurisdiction and does not in any event exclude any implied warranties,\nconditions or terms which may not under applicable law be excluded.\n\n(c) Imperial College London disclaims all responsibility for the use which is\nmade of the Software and any liability for the outcomes arising from using the\nSoftware.\n\n5. Limitation of Liability\n\n(a) You acknowledge that we are providing the Software to You under this\nlicence agreement free of charge and on condition that the limitation of\nliability set out below shall apply. 
Accordingly, subject to Clause 5(b), we\nexclude all liability whether in contract, tort, negligence or otherwise, in\nrespect of the Software and/or any related documentation provided to You by us\nincluding, but not limited to, liability for loss or corruption of data, loss\nof contracts, loss of income, loss of profits, loss of cover and any\nconsequential or indirect loss or damage of any kind arising out of or in\nconnection with this licence agreement, however caused. This exclusion shall\napply even if we have been advised of the possibility of such loss or damage.\n\n(b) You agree to indemnify Imperial College London and hold it harmless from\nand against any and all claims, damages and liabilities asserted by third\nparties (including claims for negligence) which arise directly or indirectly\nfrom the use of the Software or any derivative of it or the sale of any\nproducts based on the Software. You undertake to make no liability claim\nagainst any employee, student, agent or appointee of Imperial College London,\nin connection with this Licence or the Software.\n\n(c) Nothing in this Agreement shall have the effect of excluding or limiting\nour statutory liability.\n\n(d) Some jurisdictions do not allow these limitations or exclusions either\nwholly or in part, and, to that extent, they may not apply to you. Nothing in\nthis licence agreement will affect your statutory rights or other relevant\nstatutory provisions which cannot be excluded, restricted or modified, and its\nterms and conditions must be read and construed subject to any such statutory\nrights and/or provisions.\n\n6. Confidentiality. You agree not to disclose any confidential information\nprovided to You by us pursuant to this Agreement to any third party without our\nprior written consent. The obligations in this Clause 6 shall survive the\ntermination of this Agreement for any reason.\n\n7. Termination.\n\n(a) We may terminate this licence agreement and your right to use the Software\nat any time with immediate effect upon written notice to You.\n\n(b) This licence agreement and your right to use the Software automatically\nterminate if You:\n\n  (i) fail to comply with any provisions of this Agreement; or\n\n  (ii) destroy the copies of the Software in your possession, or voluntarily\n  return the Software to us.\n\n(c) Upon termination You will destroy all copies of the Software.\n\n(d) Otherwise, the restrictions on your rights to use the Software will expire\n10 (ten) years after first use of the Software under this licence agreement.\n\n8. 
Miscellaneous Provisions.\n\n(a) This Agreement will be governed by and construed in accordance with the\nsubstantive laws of England and Wales whose courts shall have exclusive\njurisdiction over all disputes which may arise between us.\n\n(b) This is the entire agreement between us relating to the Software, and\nsupersedes any prior purchase order, communications, advertising or\nrepresentations concerning the Software.\n\n(c) No change or modification of this Agreement will be valid unless it is in\nwriting, and is signed by us.\n\n(d) The unenforceability or invalidity of any part of this Agreement will not\naffect the enforceability or validity of the remaining parts.\n\nBSD Elements of the Software\n\nFor BSD elements of the Software, the following terms shall apply:\nCopyright as indicated in the header of the individual element of the Software.\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n\n1. Redistributions of source code must retain the above copyright notice, this\nlist of conditions and the following disclaimer.\n\n2. Redistributions in binary form must reproduce the above copyright notice,\nthis list of conditions and the following disclaimer in the documentation\nand/or other materials provided with the distribution.\n\n3. Neither the name of the copyright holder nor the names of its contributors\nmay be used to endorse or promote products derived from this software without\nspecific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\nANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\nWARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\nFOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\nSERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\nOR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n \nSCHEDULE 1\n\nThe Software\n\nvMAP is an object-level mapping system with each object represented by a small MLP, that can be efficiently vectorised trained.\nIt is based on the techniques described in the following publication:\n\n    • Xin Kong, Shikun Liu, Marwan Taher, Andrew J. Davison. vMAP: Vectorised Object Mapping for Neural Field SLAM. ArXiv Preprint, 2023\n_________________________\n\nAcknowledgments\n\nIf you use the software, you should reference the following paper in any\npublication:\n \n    • Xin Kong, Shikun Liu, Marwan Taher, Andrew J. Davison. vMAP: Vectorised Object Mapping for Neural Field SLAM. ArXiv Preprint, 2023"
  },
  {
    "path": "README.md",
    "content": "[comment]: <> (# vMAP: Vectorised Object Mapping for Neural Field SLAM)\n\n<!-- PROJECT LOGO -->\n\n<p align=\"center\">\n\n  <h1 align=\"center\">vMAP: Vectorised Object Mapping for Neural Field SLAM</h1>\n  <p align=\"center\">\n    <a href=\"https://kxhit.github.io\"><strong>Xin Kong</strong></a>\n    ·\n    <a href=\"https://shikun.io\"><strong>Shikun Liu</strong></a>\n    ·\n    <a href=\"https://marwan99.github.io/\"><strong>Marwan Taher</strong></a>\n    ·\n    <a href=\"https://www.doc.ic.ac.uk/~ajd/\"><strong>Andrew Davison</strong></a>\n  </p>\n\n[comment]: <> (  <h2 align=\"center\">PAPER</h2>)\n  <h3 align=\"center\"><a href=\"https://arxiv.org/abs/2302.01838\">Paper</a> | <a href=\"https://youtu.be/_H_jNnhUAsE\">Video</a> | <a href=\"https://kxhit.github.io/vMAP\">Project Page</a></h3>\n  <div align=\"center\"></div>\n\n<p align=\"center\">\n  <a href=\"\">\n    <img src=\"./media/teaser.png\" alt=\"Logo\" width=\"80%\">\n  </a>\n</p>\n<p align=\"center\">\nvMAP builds an object-level map from a real-time RGB-D input stream. Each object is represented by a separate MLP neural field model, all optimised in parallel via vectorised training. \n</p>\n<br>\n\nWe provide the implementation of the following neural-field SLAM frameworks:\n- **vMAP** [Official Implementation] \n- **iMAP** [Simplified and Improved Re-Implementation, with depth guided sampling]\n\n\n\n## Install\nFirst, let's start with a virtual environment with the required dependencies.\n```bash\nconda env create -f environment.yml\n```\n\n## Dataset\nPlease download the following datasets to reproduce our results.\n\n* [Replica Demo](https://huggingface.co/datasets/kxic/vMAP/resolve/main/demo_replica_room_0.zip) - Replica Room 0 only for faster experimentation.\n* [Replica](https://huggingface.co/datasets/kxic/vMAP/resolve/main/vmap.zip) - All Pre-generated Replica sequences. 
For Replica data generation, please refer to directory `data_generation`.\n* [ScanNet](https://github.com/ScanNet/ScanNet) - Official ScanNet sequences.\n\nEach dataset contains a sequence of RGB-D images, along with the corresponding camera poses and object instance labels.\nTo extract data from ScanNet `.sens` files, run the `reader.py` script from the ScanNet toolkit in a Python 2 environment:\n    ```bash\n    conda activate py2\n    python2 reader.py --filename ~/data/ScanNet/scannet/scans/scene0024_00/scene0024_00.sens --output_path ~/data/ScanNet/objnerf/ --export_depth_images --export_color_images --export_poses --export_intrinsics\n    ```\n\n## Config\n\nThen update the config files in `configs/` with your dataset paths, as well as other training hyper-parameters.\n```json\n\"dataset\": {\n    \"path\": \"path/to/ims/folder/\"\n}\n```\n\n## Running vMAP / iMAP\nThe following commands will run vMAP / iMAP in a single-thread setting.\n\n#### vMAP\n```bash\npython ./train.py --config ./configs/Replica/config_replica_room0_vMAP.json --logdir ./logs/vMAP/room0 --save_ckpt True\n```\n#### iMAP\n```bash\npython ./train.py --config ./configs/Replica/config_replica_room0_iMAP.json --logdir ./logs/iMAP/room0 --save_ckpt True\n```\n\n[comment]: <> (#### Multi thread demo)\n\n[comment]: <> (```bash)\n\n[comment]: <> (./parallel_train.py --config \"config_file.json\" --logdir ./logs)\n\n[comment]: <> (```)\n\n## Evaluation\nTo evaluate the quality of the reconstructed scenes, we provide two different methods:\n#### 3D Scene-level Evaluation\nFollowing the original iMAP protocol, we compare the reconstructed meshes against the GT scene meshes using **Accuracy**, **Completion** and **Completion Ratio**.\n```bash\npython ./metric/eval_3D_scene.py\n```\n#### 3D Object-level Evaluation\nWe also provide object-level metrics, computed in the same way but averaged across all objects in a scene.\n```bash\npython ./metric/eval_3D_obj.py\n```\n\n[comment]: <> (### Novel View Synthesis)\n\n[comment]: <> (##### 2D Novel View Eval)\n\n[comment]: <> (We rendered a new trajectory in each scene and randomly choose novel view pose from it, evaluating 2D rendering performance)\n\n[comment]: <> (```bash)\n\n[comment]: <> (./metric/eval_2D_view.py)\n\n[comment]: <> (```)\n\n## Results\nWe provide the raw results of vMAP and iMAP*, including 3D meshes, 2D novel-view renderings and evaluated metrics, for easier comparison.\n\n* [Replica](https://huggingface.co/datasets/kxic/vMAP/resolve/main/vMAP_Replica_Results.zip)\n\n## Acknowledgement\nWe would like to thank the authors of the following open-source repositories, which we have built upon for the implementation of this work: [NICE-SLAM](https://github.com/cvg/nice-slam) and [functorch](https://github.com/pytorch/functorch).\n\n## Citation\nIf you find this code/work useful in your own research, please consider citing the following:\n```bibtex\n@article{kong2023vmap,\n  title={vMAP: Vectorised Object Mapping for Neural Field SLAM},\n  author={Kong, Xin and Liu, Shikun and Taher, Marwan and Davison, Andrew J},\n  journal={arXiv preprint arXiv:2302.01838},\n  year={2023}\n}\n```\n\n```bibtex\n@inproceedings{sucar2021imap,\n  title={iMAP: Implicit mapping and positioning in real-time},\n  author={Sucar, Edgar and Liu, Shikun and Ortiz, Joseph and Davison, Andrew J},\n  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},\n  pages={6229--6238},\n  year={2021}\n}\n```\n\n"
  },
  {
    "path": "cfg.py",
    "content": "import json\nimport numpy as np\nimport os\nimport utils\n\nclass Config:\n    def __init__(self, config_file):\n        # setting params\n        with open(config_file) as json_file:\n            config = json.load(json_file)\n\n        # training strategy\n        self.do_bg = bool(config[\"trainer\"][\"do_bg\"])\n        self.training_device = config[\"trainer\"][\"train_device\"]\n        self.data_device = config[\"trainer\"][\"data_device\"]\n        self.max_n_models = config[\"trainer\"][\"n_models\"]\n        self.live_mode = bool(config[\"dataset\"][\"live\"])\n        self.keep_live_time = config[\"dataset\"][\"keep_alive\"]\n        self.imap_mode = config[\"trainer\"][\"imap_mode\"]\n        self.training_strategy = config[\"trainer\"][\"training_strategy\"]  # \"forloop\" \"vmap\"\n        self.obj_id = -1\n\n        # dataset setting\n        self.dataset_format = config[\"dataset\"][\"format\"]\n        self.dataset_dir = config[\"dataset\"][\"path\"]\n        self.depth_scale = 1 / config[\"trainer\"][\"scale\"]\n        # camera setting\n        self.max_depth = config[\"render\"][\"depth_range\"][1]\n        self.min_depth = config[\"render\"][\"depth_range\"][0]\n        self.mh = config[\"camera\"][\"mh\"]\n        self.mw = config[\"camera\"][\"mw\"]\n        self.height = config[\"camera\"][\"h\"]\n        self.width = config[\"camera\"][\"w\"]\n        self.H = self.height - 2 * self.mh\n        self.W = self.width - 2 * self.mw\n        if \"fx\" in config[\"camera\"]:\n            self.fx = config[\"camera\"][\"fx\"]\n            self.fy = config[\"camera\"][\"fy\"]\n            self.cx = config[\"camera\"][\"cx\"] - self.mw\n            self.cy = config[\"camera\"][\"cy\"] - self.mh\n        else:   # for scannet\n            intrinsic = utils.load_matrix_from_txt(os.path.join(self.dataset_dir, \"intrinsic/intrinsic_depth.txt\"))\n            self.fx = intrinsic[0, 0]\n            self.fy = intrinsic[1, 1]\n            self.cx = intrinsic[0, 2] - self.mw\n            self.cy = intrinsic[1, 2] - self.mh\n        if \"distortion\" in config[\"camera\"]:\n            self.distortion_array = np.array(config[\"camera\"][\"distortion\"])\n        elif \"k1\" in config[\"camera\"]:\n            k1 = config[\"camera\"][\"k1\"]\n            k2 = config[\"camera\"][\"k2\"]\n            k3 = config[\"camera\"][\"k3\"]\n            k4 = config[\"camera\"][\"k4\"]\n            k5 = config[\"camera\"][\"k5\"]\n            k6 = config[\"camera\"][\"k6\"]\n            p1 = config[\"camera\"][\"p1\"]\n            p2 = config[\"camera\"][\"p2\"]\n            self.distortion_array = np.array([k1, k2, p1, p2, k3, k4, k5, k6])\n        else:\n            self.distortion_array = None\n\n        # training setting\n        self.win_size = config[\"model\"][\"window_size\"]\n        self.n_iter_per_frame = config[\"render\"][\"iters_per_frame\"]\n        self.n_per_optim = config[\"render\"][\"n_per_optim\"]\n        self.n_samples_per_frame = self.n_per_optim // self.win_size\n        self.win_size_bg = config[\"model\"][\"window_size_bg\"]\n        self.n_per_optim_bg = config[\"render\"][\"n_per_optim_bg\"]\n        self.n_samples_per_frame_bg = self.n_per_optim_bg // self.win_size_bg\n        self.keyframe_buffer_size = config[\"model\"][\"keyframe_buffer_size\"]\n        self.keyframe_step = config[\"model\"][\"keyframe_step\"]\n        self.keyframe_step_bg = config[\"model\"][\"keyframe_step_bg\"]\n        self.obj_scale = config[\"model\"][\"obj_scale\"]\n        
self.bg_scale = config[\"model\"][\"bg_scale\"]\n        self.hidden_feature_size = config[\"model\"][\"hidden_feature_size\"]\n        self.hidden_feature_size_bg = config[\"model\"][\"hidden_feature_size_bg\"]\n        self.n_bins_cam2surface = config[\"render\"][\"n_bins_cam2surface\"]\n        self.n_bins_cam2surface_bg = config[\"render\"][\"n_bins_cam2surface_bg\"]\n        self.n_bins = config[\"render\"][\"n_bins\"]\n        self.n_unidir_funcs = config[\"model\"][\"n_unidir_funcs\"]\n        self.surface_eps = config[\"model\"][\"surface_eps\"]\n        self.stop_eps = config[\"model\"][\"other_eps\"]\n\n        # optimizer setting\n        self.learning_rate = config[\"optimizer\"][\"args\"][\"lr\"]\n        self.weight_decay = config[\"optimizer\"][\"args\"][\"weight_decay\"]\n\n        # vis setting\n        self.vis_device = config[\"vis\"][\"vis_device\"]\n        self.n_vis_iter = config[\"vis\"][\"n_vis_iter\"]\n        self.live_voxel_size = config[\"vis\"][\"live_voxel_size\"]\n        self.grid_dim = config[\"vis\"][\"grid_dim\"]"
  },
  {
    "path": "configs/Replica/config_replica_room0_iMAP.json",
    "content": "{\n    \"dataset\": {\n        \"live\": 0,\n        \"path\": \"/home/xin/data/Replica/vmap/room_0/imap/00\",\n        \"format\": \"Replica\",\n        \"keep_alive\": 20\n    },\n    \"optimizer\": {\n        \"args\":{\n            \"lr\": 0.001,\n            \"weight_decay\": 0.013,\n            \"pose_lr\": 0.001\n        }\n    },\n    \"trainer\": {\n        \"imap_mode\": 1,\n        \"do_bg\": 0,\n        \"n_models\": 1,\n        \"train_device\": \"cuda:0\",\n        \"data_device\": \"cuda:0\",\n        \"training_strategy\": \"vmap\",\n        \"epochs\": 1000000,\n        \"scale\": 1000.0\n    },\n    \"render\": {\n        \"depth_range\": [0.0, 8.0],\n        \"n_bins\": 9,\n        \"n_bins_cam2surface\": 5,\n        \"n_bins_cam2surface_bg\": 5,\n        \"iters_per_frame\": 20,\n        \"n_per_optim\": 4800,\n        \"n_per_optim_bg\": 1200\n    },\n    \"model\": {\n        \"n_unidir_funcs\": 5,\n        \"obj_scale\": 5.0,\n        \"bg_scale\": 5.0,\n        \"color_scaling\": 5.0,\n        \"opacity_scaling\": 10.0,\n        \"gt_scene\": 1,\n        \"surface_eps\": 0.1,\n        \"other_eps\": 0.05,\n        \"keyframe_buffer_size\": 20,\n        \"keyframe_step\": 50,\n        \"keyframe_step_bg\": 50,\n        \"window_size\": 5,\n        \"window_size_bg\": 10,\n        \"hidden_layers_block\": 1,\n        \"hidden_feature_size\": 256,\n        \"hidden_feature_size_bg\": 128\n    },\n    \"camera\": {\n        \"w\": 1200,\n        \"h\": 680,\n        \"fx\": 600.0,\n        \"fy\": 600.0,\n        \"cx\": 599.5,\n        \"cy\": 339.5,\n        \"mw\": 0,\n        \"mh\": 0\n    },\n    \"vis\": {\n        \"vis_device\": \"cuda:0\",\n        \"n_vis_iter\": 500,\n        \"n_bins_fine_vis\": 10,\n        \"im_vis_reduce\": 10,\n        \"grid_dim\": 256,\n        \"live_vis\": 1,\n        \"live_voxel_size\": 0.005\n    }\n}\n"
  },
  {
    "path": "configs/Replica/config_replica_room0_vMAP.json",
    "content": "{\n    \"dataset\": {\n        \"live\": 0,\n        \"path\": \"/home/xin/data/Replica/vmap/room_0/imap/00\",\n        \"format\": \"Replica\",\n        \"keep_alive\": 20\n    },\n    \"optimizer\": {\n        \"args\":{\n            \"lr\": 0.001,\n            \"weight_decay\": 0.013,\n            \"pose_lr\": 0.001\n        }\n    },\n    \"trainer\": {\n        \"imap_mode\": 0,\n        \"do_bg\": 1,\n        \"n_models\": 100,\n        \"train_device\": \"cuda:0\",\n        \"data_device\": \"cuda:0\",\n        \"training_strategy\": \"vmap\",\n        \"epochs\": 1000000,\n        \"scale\": 1000.0\n    },\n    \"render\": {\n        \"depth_range\": [0.0, 8.0],\n        \"n_bins\": 9,\n        \"n_bins_cam2surface\": 1,\n        \"n_bins_cam2surface_bg\": 5,\n        \"iters_per_frame\": 20,\n        \"n_per_optim\": 120,\n        \"n_per_optim_bg\": 1200\n    },\n    \"model\": {\n        \"n_unidir_funcs\": 5,\n        \"obj_scale\": 2.0,\n        \"bg_scale\": 5.0,\n        \"color_scaling\": 5.0,\n        \"opacity_scaling\": 10.0,\n        \"gt_scene\": 1,\n        \"surface_eps\": 0.1,\n        \"other_eps\": 0.05,\n        \"keyframe_buffer_size\": 20,\n        \"keyframe_step\": 25,\n        \"keyframe_step_bg\": 50,\n        \"window_size\": 5,\n        \"window_size_bg\": 10,\n        \"hidden_layers_block\": 1,\n        \"hidden_feature_size\": 32,\n        \"hidden_feature_size_bg\": 128\n    },\n    \"camera\": {\n        \"w\": 1200,\n        \"h\": 680,\n        \"fx\": 600.0,\n        \"fy\": 600.0,\n        \"cx\": 599.5,\n        \"cy\": 339.5,\n        \"mw\": 0,\n        \"mh\": 0\n    },\n    \"vis\": {\n        \"vis_device\": \"cuda:0\",\n        \"n_vis_iter\": 500,\n        \"n_bins_fine_vis\": 10,\n        \"im_vis_reduce\": 10,\n        \"grid_dim\": 256,\n        \"live_vis\": 1,\n        \"live_voxel_size\": 0.005\n    }\n}\n"
  },
  {
    "path": "configs/ScanNet/config_scannet0000_iMAP.json",
    "content": "{\n    \"dataset\": {\n        \"live\": 0,\n        \"path\": \"/home/xin/data/ScanNet/NICESLAM/scene0000_00\",\n        \"format\": \"ScanNet\",\n        \"keep_alive\": 20\n    },\n    \"optimizer\": {\n        \"args\":{\n            \"lr\": 0.001,\n            \"weight_decay\": 0.013,\n            \"pose_lr\": 0.001\n        }\n    },\n    \"trainer\": {\n        \"imap_mode\": 1,\n        \"do_bg\": 0,\n        \"n_models\": 1,\n        \"train_device\": \"cuda:0\",\n        \"data_device\": \"cuda:0\",\n        \"training_strategy\": \"vmap\",\n        \"epochs\": 1000000,\n        \"scale\": 1000.0\n    },\n    \"render\": {\n        \"depth_range\": [0.0, 6.0],\n        \"n_bins\": 9,\n        \"n_bins_cam2surface\": 5,\n        \"n_bins_cam2surface_bg\": 5,\n        \"iters_per_frame\": 20,\n        \"n_per_optim\": 2400,\n        \"n_per_optim_bg\": 1200\n    },\n    \"model\": {\n        \"n_unidir_funcs\": 5,\n        \"obj_scale\": 3.0,\n        \"bg_scale\": 15.0,\n        \"color_scaling\": 5.0,\n        \"opacity_scaling\": 10.0,\n        \"gt_scene\": 1,\n        \"surface_eps\": 0.1,\n        \"other_eps\": 0.05,\n        \"keyframe_buffer_size\": 20,\n        \"keyframe_step\": 50,\n        \"keyframe_step_bg\": 50,\n        \"window_size\": 5,\n        \"window_size_bg\": 10,\n        \"hidden_layers_block\": 1,\n        \"hidden_feature_size\": 256,\n        \"hidden_feature_size_bg\": 128\n    },\n    \"camera\": {\n        \"w\": 640,\n        \"h\": 480,\n        \"mw\": 10,\n        \"mh\": 10\n    },\n    \"vis\": {\n        \"vis_device\": \"cuda:0\",\n        \"n_vis_iter\": 500,\n        \"n_bins_fine_vis\": 10,\n        \"im_vis_reduce\": 10,\n        \"grid_dim\": 256,\n        \"live_vis\": 1,\n        \"live_voxel_size\": 0.005\n    }\n}\n"
  },
  {
    "path": "configs/ScanNet/config_scannet0000_vMAP.json",
    "content": "{\n    \"dataset\": {\n        \"live\": 0,\n        \"path\": \"/home/xin/data/ScanNet/NICESLAM/scene0000_00\",\n        \"format\": \"ScanNet\",\n        \"keep_alive\": 20\n    },\n    \"optimizer\": {\n        \"args\":{\n            \"lr\": 0.001,\n            \"weight_decay\": 0.013,\n            \"pose_lr\": 0.001\n        }\n    },\n    \"trainer\": {\n        \"imap_mode\": 0,\n        \"do_bg\": 1,\n        \"n_models\": 100,\n        \"train_device\": \"cuda:0\",\n        \"data_device\": \"cuda:0\",\n        \"training_strategy\": \"vmap\",\n        \"epochs\": 1000000,\n        \"scale\": 1000.0\n    },\n    \"render\": {\n        \"depth_range\": [0.0, 6.0],\n        \"n_bins\": 9,\n        \"n_bins_cam2surface\": 1,\n        \"n_bins_cam2surface_bg\": 5,\n        \"iters_per_frame\": 20,\n        \"n_per_optim\": 120,\n        \"n_per_optim_bg\": 1200\n    },\n    \"model\": {\n        \"n_unidir_funcs\": 5,\n        \"obj_scale\": 3.0,\n        \"bg_scale\": 10.0,\n        \"color_scaling\": 5.0,\n        \"opacity_scaling\": 10.0,\n        \"gt_scene\": 1,\n        \"surface_eps\": 0.1,\n        \"other_eps\": 0.05,\n        \"keyframe_buffer_size\": 20,\n        \"keyframe_step\": 25,\n        \"keyframe_step_bg\": 50,\n        \"window_size\": 5,\n        \"window_size_bg\": 10,\n        \"hidden_layers_block\": 1,\n        \"hidden_feature_size\": 32,\n        \"hidden_feature_size_bg\": 128\n    },\n    \"camera\": {\n        \"w\": 640,\n        \"h\": 480,\n        \"mw\": 10,\n        \"mh\": 10\n    },\n    \"vis\": {\n        \"vis_device\": \"cuda:0\",\n        \"n_vis_iter\": 10000000,\n        \"n_bins_fine_vis\": 10,\n        \"im_vis_reduce\": 10,\n        \"grid_dim\": 256,\n        \"live_vis\": 1,\n        \"live_voxel_size\": 0.005\n    }\n}\n"
  },
  {
    "path": "configs/ScanNet/config_scannet0024_iMAP.json",
    "content": "{\n    \"dataset\": {\n        \"live\": 0,\n        \"path\": \"/home/xin/data/ScanNet/obj-imap/scene0024_00\",\n        \"format\": \"ScanNet\",\n        \"keep_alive\": 20\n    },\n    \"optimizer\": {\n        \"args\":{\n            \"lr\": 0.001,\n            \"weight_decay\": 0.013,\n            \"pose_lr\": 0.001\n        }\n    },\n    \"trainer\": {\n        \"imap_mode\": 1,\n        \"do_bg\": 0,\n        \"n_models\": 1,\n        \"train_device\": \"cuda:0\",\n        \"data_device\": \"cuda:0\",\n        \"training_strategy\": \"vmap\",\n        \"epochs\": 1000000,\n        \"scale\": 1000.0\n    },\n    \"render\": {\n        \"depth_range\": [0.0, 6.0],\n        \"n_bins\": 9,\n        \"n_bins_cam2surface\": 5,\n        \"n_bins_cam2surface_bg\": 5,\n        \"iters_per_frame\": 20,\n        \"n_per_optim\": 2400,\n        \"n_per_optim_bg\": 1200\n    },\n    \"model\": {\n        \"n_unidir_funcs\": 5,\n        \"obj_scale\": 2.0,\n        \"bg_scale\": 5.0,\n        \"color_scaling\": 5.0,\n        \"opacity_scaling\": 10.0,\n        \"gt_scene\": 1,\n        \"surface_eps\": 0.1,\n        \"other_eps\": 0.05,\n        \"keyframe_buffer_size\": 20,\n        \"keyframe_step\": 50,\n        \"keyframe_step_bg\": 50,\n        \"window_size\": 5,\n        \"window_size_bg\": 10,\n        \"hidden_layers_block\": 1,\n        \"hidden_feature_size\": 256,\n        \"hidden_feature_size_bg\": 128\n    },\n    \"camera\": {\n        \"w\": 640,\n        \"h\": 480,\n        \"mw\": 10,\n        \"mh\": 10\n    },\n    \"vis\": {\n        \"vis_device\": \"cuda:0\",\n        \"n_vis_iter\": 500,\n        \"n_bins_fine_vis\": 10,\n        \"im_vis_reduce\": 10,\n        \"grid_dim\": 256,\n        \"live_vis\": 1,\n        \"live_voxel_size\": 0.005\n    }\n}\n"
  },
  {
    "path": "configs/ScanNet/config_scannet0024_vMAP.json",
    "content": "{\n    \"dataset\": {\n        \"live\": 0,\n        \"path\": \"/home/xin/data/ScanNet/obj-imap/scene0024_00\",\n        \"format\": \"ScanNet\",\n        \"keep_alive\": 20\n    },\n    \"optimizer\": {\n        \"args\":{\n            \"lr\": 0.001,\n            \"weight_decay\": 0.013,\n            \"pose_lr\": 0.001\n        }\n    },\n    \"trainer\": {\n        \"imap_mode\": 0,\n        \"do_bg\": 1,\n        \"n_models\": 100,\n        \"train_device\": \"cuda:0\",\n        \"data_device\": \"cuda:0\",\n        \"training_strategy\": \"vmap\",\n        \"epochs\": 1000000,\n        \"scale\": 1000.0\n    },\n    \"render\": {\n        \"depth_range\": [0.0, 6.0],\n        \"n_bins\": 9,\n        \"n_bins_cam2surface\": 1,\n        \"n_bins_cam2surface_bg\": 5,\n        \"iters_per_frame\": 20,\n        \"n_per_optim\": 120,\n        \"n_per_optim_bg\": 1200\n    },\n    \"model\": {\n        \"n_unidir_funcs\": 5,\n        \"obj_scale\": 3.0,\n        \"bg_scale\": 10.0,\n        \"color_scaling\": 5.0,\n        \"opacity_scaling\": 10.0,\n        \"gt_scene\": 1,\n        \"surface_eps\": 0.1,\n        \"other_eps\": 0.05,\n        \"keyframe_buffer_size\": 20,\n        \"keyframe_step\": 25,\n        \"keyframe_step_bg\": 50,\n        \"window_size\": 5,\n        \"window_size_bg\": 10,\n        \"hidden_layers_block\": 1,\n        \"hidden_feature_size\": 32,\n        \"hidden_feature_size_bg\": 128\n    },\n    \"camera\": {\n        \"w\": 640,\n        \"h\": 480,\n        \"mw\": 10,\n        \"mh\": 10\n    },\n    \"vis\": {\n        \"vis_device\": \"cuda:0\",\n        \"n_vis_iter\": 10000000,\n        \"n_bins_fine_vis\": 10,\n        \"im_vis_reduce\": 10,\n        \"grid_dim\": 256,\n        \"live_vis\": 1,\n        \"live_voxel_size\": 0.005\n    }\n}\n"
  },
  {
    "path": "data_generation/README.md",
    "content": "## Replica Data Generation\n\n### Download Replica Dataset\nDownload 3D models and info files from [Replica](https://github.com/facebookresearch/Replica-Dataset)\n\n### 3D Object Mesh Extraction\nChange the input path in `./data_generation/extract_inst_obj.py` and run\n```angular2html\npython ./data_generation/extract_inst_obj.py\n```\n\n### Camera Trajectory Generation\nPlease refer to [Semantic-NeRF](https://github.com/Harry-Zhi/semantic_nerf/issues/25#issuecomment-1340595427) for more details. The random trajectory generation only works for single room scene. For multiple rooms scene, collision checking is needed. Welcome contributions.\n\n### Rendering 2D Images\nGiven camera trajectory t_wc (change pose_file in configs), we use [Habitat-Sim](https://github.com/facebookresearch/habitat-sim) to render RGB, Depth, Semantic and Instance images.\n\n#### Install Habitat-Sim 0.2.1\nWe recommend to use conda to install habitat-sim 0.2.1.\n```angular2html\nconda create -n habitat python=3.8.12 cmake=3.14.0\nconda activate habitat\nconda install habitat-sim=0.2.1 withbullet -c conda-forge -c aihabitat \nconda install numba=0.54.1\n```\n\n#### Run rendering with configs\n```angular2html\npython ./data_generation/habitat_renderer.py --config ./data_generation/replica_render_config_vMAP.yaml \n```\nNote that to get HDR img, use mesh.ply not semantic_mesh.ply. Change path in configs. And copy rgb folder to replace previous high exposure rgb.\n```angular2html\npython ./data_generation/habitat_renderer.py --config ./data_generation/replica_render_config_vMAP.yaml \n```"
  },
  {
    "path": "data_generation/extract_inst_obj.py",
    "content": "# reference https://github.com/facebookresearch/Replica-Dataset/issues/17#issuecomment-538757418\n\nfrom plyfile import *\nimport numpy as np\nimport trimesh\n\n\n# path_in = 'path/to/mesh_semantic.ply'\npath_in = '/home/xin/data/vmap/room_0_debug/habitat/mesh_semantic.ply'\n\nprint(\"Reading input...\")\nmesh = trimesh.load(path_in)\n# mesh.show()\nfile_in = PlyData.read(path_in)\nvertices_in = file_in.elements[0]\nfaces_in = file_in.elements[1]\n\nprint(\"Filtering data...\")\nobjects = {}\nsub_mesh_indices = {}\nfor i, f in enumerate(faces_in):\n     object_id = f[1]\n     if not object_id in objects:\n         objects[object_id] = []\n         sub_mesh_indices[object_id] = []\n     objects[object_id].append((f[0],))\n     sub_mesh_indices[object_id].append(i)\n     sub_mesh_indices[object_id].append(i+faces_in.data.shape[0])\n\n\nprint(\"Writing data...\")\nfor object_id, faces in objects.items():\n    path_out = path_in + f\"_{object_id}.ply\"\n    # print(\"sub_mesh_indices[object_id] \", sub_mesh_indices[object_id])\n    obj_mesh = mesh.submesh([sub_mesh_indices[object_id]], append=True)\n    in_n = len(sub_mesh_indices[object_id])\n    out_n = obj_mesh.faces.shape[0]\n    # print(\"obj id \", object_id)\n    # print(\"in_n \", in_n)\n    # print(\"out_n \", out_n)\n    # print(\"faces \", len(faces))\n    # assert in_n == out_n\n    obj_mesh.export(path_out)\n    # faces_out = PlyElement.describe(np.array(faces, dtype=[('vertex_indices', 'O')]), 'face')\n    # print(\"faces out \", len(PlyData([vertices_in, faces_out]).elements[1].data))\n    # PlyData([vertices_in, faces_out]).write(path_out+\"_cmp.ply\")\n\n"
  },
  {
    "path": "data_generation/habitat_renderer.py",
    "content": "#!/usr/bin/env python3\nimport os, sys, argparse\nos.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\nimport cv2\nimport logging\nimport habitat_sim as hs\nimport numpy as np\nimport quaternion\nimport yaml\nimport json\nfrom typing import Any, Dict, List, Tuple, Union\nfrom imgviz import label_colormap\nfrom PIL import Image\nimport matplotlib.pyplot as plt\nimport transformation\nimport imgviz\nfrom datetime import datetime\nimport time\nfrom settings import make_cfg\n\n# Custom type definitions\nConfig = Dict[str, Any]\nObservation = hs.sensor.Observation\nSim = hs.Simulator\n\ndef init_habitat(config) :\n    \"\"\"Initialize the Habitat simulator with sensors and scene file\"\"\"\n    _cfg = make_cfg(config)\n    sim = Sim(_cfg)\n    sim_cfg = hs.SimulatorConfiguration()\n    sim_cfg.gpu_device_id = 0\n    # Note: all sensors must have the same resolution\n    camera_resolution = [config[\"height\"], config[\"width\"]]\n    sensors = {\n        \"color_sensor\": {\n            \"sensor_type\": hs.SensorType.COLOR,\n            \"resolution\": camera_resolution,\n            \"position\": [0.0, config[\"sensor_height\"], 0.0],\n        },\n        \"depth_sensor\": {\n            \"sensor_type\": hs.SensorType.DEPTH,\n            \"resolution\": camera_resolution,\n            \"position\": [0.0, config[\"sensor_height\"], 0.0],\n        },\n        \"semantic_sensor\": {\n            \"sensor_type\": hs.SensorType.SEMANTIC,\n            \"resolution\": camera_resolution,\n            \"position\": [0.0, config[\"sensor_height\"], 0.0],\n        },\n    }\n\n    sensor_specs = []\n    for sensor_uuid, sensor_params in sensors.items():\n        if config[sensor_uuid]:\n            sensor_spec = hs.SensorSpec()\n            sensor_spec.uuid = sensor_uuid\n            sensor_spec.sensor_type = sensor_params[\"sensor_type\"]\n            sensor_spec.resolution = sensor_params[\"resolution\"]\n            sensor_spec.position = sensor_params[\"position\"]\n\n            sensor_specs.append(sensor_spec)\n\n    # Here you can specify the amount of displacement in a forward action and the turn angle\n    agent_cfg = hs.agent.AgentConfiguration()\n    agent_cfg.sensor_specifications = sensor_specs\n    agent_cfg.action_space = {\n        \"move_forward\": hs.agent.ActionSpec(\n            \"move_forward\", hs.agent.ActuationSpec(amount=0.25)\n        ),\n        \"turn_left\": hs.agent.ActionSpec(\n            \"turn_left\", hs.agent.ActuationSpec(amount=30.0)\n        ),\n        \"turn_right\": hs.agent.ActionSpec(\n            \"turn_right\", hs.agent.ActuationSpec(amount=30.0)\n        ),\n    }\n\n    hs_cfg = hs.Configuration(sim_cfg, [agent_cfg])\n    # sim = Sim(hs_cfg)\n\n    if config[\"enable_semantics\"]: # extract instance to class mapping function\n        assert os.path.exists(config[\"instance2class_mapping\"])\n        with open(config[\"instance2class_mapping\"], \"r\") as f:\n            annotations = json.load(f)\n        instance_id_to_semantic_label_id = np.array(annotations[\"id_to_label\"])\n        num_classes = len(annotations[\"classes\"])\n        label_colour_map = label_colormap()\n        config[\"instance2semantic\"] = instance_id_to_semantic_label_id\n        config[\"classes\"] = annotations[\"classes\"]\n        config[\"objects\"] = annotations[\"objects\"]\n\n        config[\"num_classes\"] = num_classes\n        config[\"label_colour_map\"] = label_colormap()\n        
config[\"instance_colour_map\"] = label_colormap(500)\n\n\n    # add camera intrinsic\n    # hfov = float(agent_cfg.sensor_specifications[0].parameters['hfov']) * np.pi / 180.\n    # https://aihabitat.org/docs/habitat-api/view-transform-warp.html\n    # config['K'] = K\n    # config['K'] = np.array([[fx, 0.0, 0.0], [0.0, fx, 0.0], [0.0, 0.0, 1.0]],\n    #                        dtype=np.float64)\n\n    # hfov = float(agent_cfg.sensor_specifications[0].parameters['hfov'])\n    # fx = 1.0 / np.tan(hfov / 2.0)\n    # config['K'] = np.array([[fx, 0.0, 0.0], [0.0, fx, 0.0], [0.0, 0.0, 1.0]],\n    #                        dtype=np.float64)\n\n    # Get the intrinsic camera parameters\n\n\n    logging.info('Habitat simulator initialized')\n\n    return sim, hs_cfg, config\n\ndef save_renders(save_path, observation, enable_semantic, suffix=\"\"):\n    save_path_rgb = os.path.join(save_path, \"rgb\")\n    save_path_depth = os.path.join(save_path, \"depth\")\n    save_path_sem_class = os.path.join(save_path, \"semantic_class\")\n    save_path_sem_instance = os.path.join(save_path, \"semantic_instance\")\n\n    if not os.path.exists(save_path_rgb):\n        os.makedirs(save_path_rgb)\n    if not os.path.exists(save_path_depth):\n        os.makedirs(save_path_depth)\n    if not os.path.exists(save_path_sem_class):\n        os.makedirs(save_path_sem_class)\n    if not os.path.exists(save_path_sem_instance):\n        os.makedirs(save_path_sem_instance)\n\n    cv2.imwrite(os.path.join(save_path_rgb, \"rgb{}.png\".format(suffix)), observation[\"color_sensor\"][:,:,::-1])  # change from RGB to BGR for opencv write\n    cv2.imwrite(os.path.join(save_path_depth, \"depth{}.png\".format(suffix)), observation[\"depth_sensor_mm\"])\n\n    if enable_semantic:\n        cv2.imwrite(os.path.join(save_path_sem_class, \"semantic_class{}.png\".format(suffix)), observation[\"semantic_class\"])\n        cv2.imwrite(os.path.join(save_path_sem_class, \"vis_sem_class{}.png\".format(suffix)), observation[\"vis_sem_class\"][:,:,::-1])\n\n        cv2.imwrite(os.path.join(save_path_sem_instance, \"semantic_instance{}.png\".format(suffix)), observation[\"semantic_instance\"])\n        cv2.imwrite(os.path.join(save_path_sem_instance, \"vis_sem_instance{}.png\".format(suffix)), observation[\"vis_sem_instance\"][:,:,::-1])\n\n\ndef render(sim, config):\n    \"\"\"Return the sensor observations and ground truth pose\"\"\"\n    observation = sim.get_sensor_observations()\n\n    # process rgb imagem change from RGBA to RGB\n    observation['color_sensor'] = observation['color_sensor'][..., 0:3]\n    rgb_img = observation['color_sensor']\n\n    # process depth\n    depth_mm = (observation['depth_sensor'].copy()*1000).astype(np.uint16)  # change meters to mm\n    observation['depth_sensor_mm'] = depth_mm\n\n    # process semantics\n    if config['enable_semantics']:\n\n        # Assuming the scene has no more than 65534 objects\n        observation['semantic_instance'] = np.clip(observation['semantic_sensor'].astype(np.uint16), 0, 65535)\n        # observation['semantic_instance'][observation['semantic_instance']==12]=0 # mask out certain instance\n        # Convert instance IDs to class IDs\n\n\n        # observation['semantic_classes'] = np.zeros(observation['semantic'].shape, dtype=np.uint8)\n        # TODO make this conversion more efficient\n        semantic_class = config[\"instance2semantic\"][observation['semantic_instance']]\n        semantic_class[semantic_class < 0] = 0\n\n        vis_sem_class = 
config[\"label_colour_map\"][semantic_class]\n        vis_sem_instance = config[\"instance_colour_map\"][observation['semantic_instance']]  # may cause error when having more than 255 instances in the scene\n\n        observation['semantic_class'] = semantic_class.astype(np.uint8)\n        observation[\"vis_sem_class\"] = vis_sem_class.astype(np.uint8)\n        observation[\"vis_sem_instance\"] = vis_sem_instance.astype(np.uint8)\n\n        # del observation[\"semantic_sensor\"]\n\n    # Get the camera ground truth pose (T_HC) in the habitat frame from the\n    # position and orientation\n    t_HC = sim.get_agent(0).get_state().position\n    q_HC = sim.get_agent(0).get_state().rotation\n    T_HC = transformation.combine_pose(t_HC, q_HC)\n\n    observation['T_HC'] = T_HC\n    observation['T_WC'] = transformation.Thc_to_Twc(T_HC)\n\n    return observation\n\ndef set_agent_position(sim, pose):\n    # Move the agent\n    R = pose[:3, :3]\n    orientation_quat = quaternion.from_rotation_matrix(R)\n    t = pose[:3, 3]\n    position = t\n\n    orientation = [orientation_quat.x, orientation_quat.y, orientation_quat.z, orientation_quat.w]\n    agent = sim.get_agent(0)\n    agent_state = hs.agent.AgentState(position, orientation)\n    # agent.set_state(agent_state, reset_sensors=False)\n    agent.set_state(agent_state)\n\ndef main():\n    parser = argparse.ArgumentParser(description='Render Colour, Depth, Semantic, Instance labeling from Habitat-Simultation.')\n    parser.add_argument('--config_file', type=str,\n                        default=\"./data_generation/replica_render_config_vMAP.yaml\",\n                        help='the path to custom config file.')\n    args = parser.parse_args()\n\n    \"\"\"Initialize the config dict and Habitat simulator\"\"\"\n    # Read YAML file\n    with open(args.config_file, 'r') as f:\n        config = yaml.safe_load(f)\n\n    config[\"save_path\"] = os.path.join(config[\"save_path\"])\n    if not os.path.exists(config[\"save_path\"]):\n        os.makedirs(config[\"save_path\"])\n\n    T_wc = np.loadtxt(config[\"pose_file\"]).reshape(-1, 4, 4)\n    Ts_cam2world = T_wc\n\n    print(\"-----Initialise and Set Habitat-Sim-----\")\n    sim, hs_cfg, config = init_habitat(config)\n    # Set agent state\n    sim.initialize_agent(config[\"default_agent\"])\n\n    \"\"\"Set agent state\"\"\"\n    print(\"-----Render Images from Habitat-Sim-----\")\n    with open(os.path.join(config[\"save_path\"], 'render_config.yaml'), 'w') as outfile:\n            yaml.dump(config, outfile, default_flow_style=False)\n    start_time = time.time()\n    total_render_num = Ts_cam2world.shape[0]\n    for i in range(total_render_num):\n        if i % 100 == 0 :\n            print(\"Rendering Process: {}/{}\".format(i, total_render_num))\n        set_agent_position(sim, transformation.Twc_to_Thc(Ts_cam2world[i]))\n\n        # replica mode\n        observation = render(sim, config)\n        save_renders(config[\"save_path\"], observation, config[\"enable_semantics\"], suffix=\"_{}\".format(i))\n\n    end_time = time.time()\n    print(\"-----Finish Habitat Rendering, Showing Trajectories.-----\")\n    print(\"Average rendering time per image is {} seconds.\".format((end_time-start_time)/Ts_cam2world.shape[0]))\n\nif __name__ == \"__main__\":\n    main()\n\n\n"
  },
  {
    "path": "data_generation/replica_render_config_vMAP.yaml",
    "content": "# Agent settings\ndefault_agent: 0\ngpu_id: 0\nwidth: 1200 #1280\nheight: 680 #960\nsensor_height: 0\n\ncolor_sensor: true  # RGB sensor\nsemantic_sensor: true # Semantic sensor\ndepth_sensor: true  # Depth sensor\nenable_semantics: true\n\n# room_0\nscene_file: \"/home/xin/data/vmap/room_0/habitat/mesh_semantic.ply\"\ninstance2class_mapping: \"/home/xin/data/vmap/room_0/habitat/info_semantic.json\"\nsave_path: \"/home/xin/data/vmap/room_0/vmap/00/\"\npose_file: \"/home/xin/data/vmap/room_0/vmap/00/traj_w_c.txt\"\n## HDR texture\n## issue https://github.com/facebookresearch/Replica-Dataset/issues/41#issuecomment-566251467\n#scene_file: \"/home/xin/data/vmap/room_0/mesh.ply\"\n#instance2class_mapping: \"/home/xin/data/vmap/room_0/habitat/info_semantic.json\"\n#save_path: \"/home/xin/data/vmap/room_0/vmap/00/\"\n#pose_file: \"/home/xin/data/vmap/room_0/vmap/00/traj_w_c.txt\""
  },
  {
    "path": "data_generation/settings.py",
    "content": "# Copyright (c) Facebook, Inc. and its affiliates.\n# This source code is licensed under the MIT license found in the\n# LICENSE file in the root directory of this source tree.\n\nimport habitat_sim\nimport habitat_sim.agent\n\ndefault_sim_settings = {\n    # settings shared by example.py and benchmark.py\n    \"max_frames\": 1000,\n    \"width\": 640,\n    \"height\": 480,\n    \"default_agent\": 0,\n    \"sensor_height\": 1.5,\n    \"hfov\": 90,\n    \"color_sensor\": True,  # RGB sensor (default: ON)\n    \"semantic_sensor\": False,  # semantic sensor (default: OFF)\n    \"depth_sensor\": False,  # depth sensor (default: OFF)\n    \"ortho_rgba_sensor\": False,  # Orthographic RGB sensor (default: OFF)\n    \"ortho_depth_sensor\": False,  # Orthographic depth sensor (default: OFF)\n    \"ortho_semantic_sensor\": False,  # Orthographic semantic sensor (default: OFF)\n    \"fisheye_rgba_sensor\": False,\n    \"fisheye_depth_sensor\": False,\n    \"fisheye_semantic_sensor\": False,\n    \"equirect_rgba_sensor\": False,\n    \"equirect_depth_sensor\": False,\n    \"equirect_semantic_sensor\": False,\n    \"seed\": 1,\n    \"silent\": False,  # do not print log info (default: OFF)\n    # settings exclusive to example.py\n    \"save_png\": False,  # save the pngs to disk (default: OFF)\n    \"print_semantic_scene\": False,\n    \"print_semantic_mask_stats\": False,\n    \"compute_shortest_path\": False,\n    \"compute_action_shortest_path\": False,\n    \"scene\": \"data/scene_datasets/habitat-test-scenes/skokloster-castle.glb\",\n    \"test_scene_data_url\": \"http://dl.fbaipublicfiles.com/habitat/habitat-test-scenes.zip\",\n    \"goal_position\": [5.047, 0.199, 11.145],\n    \"enable_physics\": False,\n    \"enable_gfx_replay_save\": False,\n    \"physics_config_file\": \"./data/default.physics_config.json\",\n    \"num_objects\": 10,\n    \"test_object_index\": 0,\n    \"frustum_culling\": True,\n}\n\n# build SimulatorConfiguration\ndef make_cfg(settings):\n    sim_cfg = habitat_sim.SimulatorConfiguration()\n    if \"frustum_culling\" in settings:\n        sim_cfg.frustum_culling = settings[\"frustum_culling\"]\n    else:\n        sim_cfg.frustum_culling = False\n    if \"enable_physics\" in settings:\n        sim_cfg.enable_physics = settings[\"enable_physics\"]\n    if \"physics_config_file\" in settings:\n        sim_cfg.physics_config_file = settings[\"physics_config_file\"]\n    # if not settings[\"silent\"]:\n    #     print(\"sim_cfg.physics_config_file = \" + sim_cfg.physics_config_file)\n    if \"scene_light_setup\" in settings:\n        sim_cfg.scene_light_setup = settings[\"scene_light_setup\"]\n    sim_cfg.gpu_device_id = 0\n    if not hasattr(sim_cfg, \"scene_id\"):\n        raise RuntimeError(\n            \"Error: Please upgrade habitat-sim. 
SimulatorConfig API version mismatch\"\n        )\n    sim_cfg.scene_id = settings[\"scene_file\"]\n\n    # define default sensor parameters (see src/esp/Sensor/Sensor.h)\n    sensor_specs = []\n\n    def create_camera_spec(**kw_args):\n        camera_sensor_spec = habitat_sim.CameraSensorSpec()\n        camera_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR\n        camera_sensor_spec.resolution = [settings[\"height\"], settings[\"width\"]]\n        camera_sensor_spec.position = [0, settings[\"sensor_height\"], 0]\n        for k in kw_args:\n            setattr(camera_sensor_spec, k, kw_args[k])\n        return camera_sensor_spec\n\n    if settings[\"color_sensor\"]:\n        color_sensor_spec = create_camera_spec(\n            uuid=\"color_sensor\",\n            # hfov=settings[\"hfov\"],\n            sensor_type=habitat_sim.SensorType.COLOR,\n            sensor_subtype=habitat_sim.SensorSubType.PINHOLE,\n        )\n        sensor_specs.append(color_sensor_spec)\n\n    if settings[\"depth_sensor\"]:\n        depth_sensor_spec = create_camera_spec(\n            uuid=\"depth_sensor\",\n            # hfov=settings[\"hfov\"],\n            sensor_type=habitat_sim.SensorType.DEPTH,\n            channels=1,\n            sensor_subtype=habitat_sim.SensorSubType.PINHOLE,\n        )\n        sensor_specs.append(depth_sensor_spec)\n\n    if settings[\"semantic_sensor\"]:\n        semantic_sensor_spec = create_camera_spec(\n            uuid=\"semantic_sensor\",\n            # hfov=settings[\"hfov\"],\n            sensor_type=habitat_sim.SensorType.SEMANTIC,\n            channels=1,\n            sensor_subtype=habitat_sim.SensorSubType.PINHOLE,\n        )\n        sensor_specs.append(semantic_sensor_spec)\n\n    # if settings[\"ortho_rgba_sensor\"]:\n    #     ortho_rgba_sensor_spec = create_camera_spec(\n    #         uuid=\"ortho_rgba_sensor\",\n    #         sensor_type=habitat_sim.SensorType.COLOR,\n    #         sensor_subtype=habitat_sim.SensorSubType.ORTHOGRAPHIC,\n    #     )\n    #     sensor_specs.append(ortho_rgba_sensor_spec)\n    #\n    # if settings[\"ortho_depth_sensor\"]:\n    #     ortho_depth_sensor_spec = create_camera_spec(\n    #         uuid=\"ortho_depth_sensor\",\n    #         sensor_type=habitat_sim.SensorType.DEPTH,\n    #         channels=1,\n    #         sensor_subtype=habitat_sim.SensorSubType.ORTHOGRAPHIC,\n    #     )\n    #     sensor_specs.append(ortho_depth_sensor_spec)\n    #\n    # if settings[\"ortho_semantic_sensor\"]:\n    #     ortho_semantic_sensor_spec = create_camera_spec(\n    #         uuid=\"ortho_semantic_sensor\",\n    #         sensor_type=habitat_sim.SensorType.SEMANTIC,\n    #         channels=1,\n    #         sensor_subtype=habitat_sim.SensorSubType.ORTHOGRAPHIC,\n    #     )\n    #     sensor_specs.append(ortho_semantic_sensor_spec)\n\n    # TODO Figure out how to implement copying of specs\n    def create_fisheye_spec(**kw_args):\n        fisheye_sensor_spec = habitat_sim.FisheyeSensorDoubleSphereSpec()\n        fisheye_sensor_spec.uuid = \"fisheye_sensor\"\n        fisheye_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR\n        fisheye_sensor_spec.sensor_model_type = (\n            habitat_sim.FisheyeSensorModelType.DOUBLE_SPHERE\n        )\n\n        # The default value (alpha, xi) is set to match the lens \"GoPro\" found in Table 3 of this paper:\n        # Vladyslav Usenko, Nikolaus Demmel and Daniel Cremers: The Double Sphere\n        # Camera Model, The International Conference on 3D Vision (3DV), 2018\n        # You can 
find the intrinsic parameters for the other lenses in the same table as well.\n        fisheye_sensor_spec.xi = -0.27\n        fisheye_sensor_spec.alpha = 0.57\n        fisheye_sensor_spec.focal_length = [364.84, 364.86]\n\n        fisheye_sensor_spec.resolution = [settings[\"height\"], settings[\"width\"]]\n        # The default principal_point_offset is the middle of the image\n        fisheye_sensor_spec.principal_point_offset = None\n        # default: fisheye_sensor_spec.principal_point_offset = [i/2 for i in fisheye_sensor_spec.resolution]\n        fisheye_sensor_spec.position = [0, settings[\"sensor_height\"], 0]\n        for k in kw_args:\n            setattr(fisheye_sensor_spec, k, kw_args[k])\n        return fisheye_sensor_spec\n\n    # if settings[\"fisheye_rgba_sensor\"]:\n    #     fisheye_rgba_sensor_spec = create_fisheye_spec(uuid=\"fisheye_rgba_sensor\")\n    #     sensor_specs.append(fisheye_rgba_sensor_spec)\n    # if settings[\"fisheye_depth_sensor\"]:\n    #     fisheye_depth_sensor_spec = create_fisheye_spec(\n    #         uuid=\"fisheye_depth_sensor\",\n    #         sensor_type=habitat_sim.SensorType.DEPTH,\n    #         channels=1,\n    #     )\n    #     sensor_specs.append(fisheye_depth_sensor_spec)\n    # if settings[\"fisheye_semantic_sensor\"]:\n    #     fisheye_semantic_sensor_spec = create_fisheye_spec(\n    #         uuid=\"fisheye_semantic_sensor\",\n    #         sensor_type=habitat_sim.SensorType.SEMANTIC,\n    #         channels=1,\n    #     )\n    #     sensor_specs.append(fisheye_semantic_sensor_spec)\n\n    def create_equirect_spec(**kw_args):\n        equirect_sensor_spec = habitat_sim.EquirectangularSensorSpec()\n        equirect_sensor_spec.uuid = \"equirect_rgba_sensor\"\n        equirect_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR\n        equirect_sensor_spec.resolution = [settings[\"height\"], settings[\"width\"]]\n        equirect_sensor_spec.position = [0, settings[\"sensor_height\"], 0]\n        for k in kw_args:\n            setattr(equirect_sensor_spec, k, kw_args[k])\n        return equirect_sensor_spec\n\n    # if settings[\"equirect_rgba_sensor\"]:\n    #     equirect_rgba_sensor_spec = create_equirect_spec(uuid=\"equirect_rgba_sensor\")\n    #     sensor_specs.append(equirect_rgba_sensor_spec)\n    #\n    # if settings[\"equirect_depth_sensor\"]:\n    #     equirect_depth_sensor_spec = create_equirect_spec(\n    #         uuid=\"equirect_depth_sensor\",\n    #         sensor_type=habitat_sim.SensorType.DEPTH,\n    #         channels=1,\n    #     )\n    #     sensor_specs.append(equirect_depth_sensor_spec)\n    #\n    # if settings[\"equirect_semantic_sensor\"]:\n    #     equirect_semantic_sensor_spec = create_equirect_spec(\n    #         uuid=\"equirect_semantic_sensor\",\n    #         sensor_type=habitat_sim.SensorType.SEMANTIC,\n    #         channels=1,\n    #     )\n    #     sensor_specs.append(equirect_semantic_sensor_spec)\n\n    # create agent specifications\n    agent_cfg = habitat_sim.agent.AgentConfiguration()\n    agent_cfg.sensor_specifications = sensor_specs\n    agent_cfg.action_space = {\n        \"move_forward\": habitat_sim.agent.ActionSpec(\n            \"move_forward\", habitat_sim.agent.ActuationSpec(amount=0.25)\n        ),\n        \"turn_left\": habitat_sim.agent.ActionSpec(\n            \"turn_left\", habitat_sim.agent.ActuationSpec(amount=10.0)\n        ),\n        \"turn_right\": habitat_sim.agent.ActionSpec(\n            \"turn_right\", habitat_sim.agent.ActuationSpec(amount=10.0)\n        
),\n    }\n\n    # override action space to no-op to test physics\n    if sim_cfg.enable_physics:\n        agent_cfg.action_space = {\n            \"move_forward\": habitat_sim.agent.ActionSpec(\n                \"move_forward\", habitat_sim.agent.ActuationSpec(amount=0.0)\n            )\n        }\n\n    return habitat_sim.Configuration(sim_cfg, [agent_cfg])\n"
  },
  {
    "path": "data_generation/transformation.py",
    "content": "import numpy as np\nimport quaternion\nimport trimesh\n\ndef habitat_world_transformations():\n    import habitat_sim\n    # Transforms between the habitat frame H (y-up) and the world frame W (z-up).\n    T_wh = np.identity(4)\n\n    # https://stackoverflow.com/questions/1171849/finding-quaternion-representing-the-rotation-from-one-vector-to-another\n    T_wh[0:3, 0:3] = quaternion.as_rotation_matrix(habitat_sim.utils.common.quat_from_two_vectors(\n            habitat_sim.geo.GRAVITY, np.array([0.0, 0.0, -1.0])))\n\n    T_hw = np.linalg.inv(T_wh)\n\n    return T_wh, T_hw\n\ndef opencv_to_opengl_camera(transform=None):\n    if transform is None:\n        transform = np.eye(4)\n    return transform @ trimesh.transformations.rotation_matrix(\n        np.deg2rad(180), [1, 0, 0]\n    )\n\ndef opengl_to_opencv_camera(transform=None):\n    if transform is None:\n        transform = np.eye(4)\n    return transform @ trimesh.transformations.rotation_matrix(\n        np.deg2rad(-180), [1, 0, 0]\n    )\n\ndef Twc_to_Thc(T_wc):  # opencv-camera to world transformation ---> habitat-caemra to habitat world transformation\n    T_wh, T_hw = habitat_world_transformations()\n    T_hc = T_hw @ T_wc @ opengl_to_opencv_camera()\n    return T_hc\n\n\ndef Thc_to_Twc(T_hc):  # habitat-caemra to habitat world transformation --->  opencv-camera to world transformation\n    T_wh, T_hw = habitat_world_transformations()\n    T_wc = T_wh @ T_hc @ opencv_to_opengl_camera()\n    return T_wc\n\n\ndef combine_pose(t: np.array, q: quaternion.quaternion) -> np.array:\n    T = np.identity(4)\n    T[0:3, 3] = t\n    T[0:3, 0:3] = quaternion.as_rotation_matrix(q)\n    return T"
  },
  {
    "path": "dataset.py",
    "content": "import imgviz\nfrom torch.utils.data import Dataset, DataLoader\nimport torch\nimport numpy as np\nimport cv2\nimport os\nfrom utils import enlarge_bbox, get_bbox2d, get_bbox2d_batch, box_filter\nimport glob\nfrom torchvision import transforms\nimport image_transforms\nimport open3d\nimport time\n\ndef next_live_data(track_to_map_IDT, inited):\n    while True:\n        if track_to_map_IDT.empty():\n            if inited:\n                return None  # no new frame, use kf buffer\n            else:   # blocking until get the first frame\n                continue\n        else:\n            Buffer_data = track_to_map_IDT.get(block=False)\n            break\n\n\n    if Buffer_data is not None:\n        image, depth, T, obj, bbox_dict, kf_id = Buffer_data\n        del Buffer_data\n        T_obj = torch.eye(4)\n        sample = {\"image\": image, \"depth\": depth, \"T\": T, \"T_obj\": T_obj,\n                  \"obj\": obj, \"bbox_dict\": bbox_dict, \"frame_id\": kf_id}\n\n        return sample\n    else:\n        print(\"getting nothing?\")\n        exit(-1)\n        # return None\n\ndef init_loader(cfg, multi_worker=True):\n    if cfg.dataset_format == \"Replica\":\n        dataset = Replica(cfg)\n    elif cfg.dataset_format == \"ScanNet\":\n        dataset = ScanNet(cfg)\n    else:\n        print(\"Dataset format {} not found\".format(cfg.dataset_format))\n        exit(-1)\n\n    # init dataloader\n    if multi_worker:\n        # multi worker loader\n        dataloader = DataLoader(dataset, batch_size=None, shuffle=False, sampler=None,\n                                batch_sampler=None, num_workers=4, collate_fn=None,\n                                pin_memory=True, drop_last=False, timeout=0,\n                                worker_init_fn=None, generator=None, prefetch_factor=2,\n                                persistent_workers=True)\n    else:\n        # single worker loader\n        dataloader = DataLoader(dataset, batch_size=None, shuffle=False, sampler=None,\n                                     batch_sampler=None, num_workers=0)\n\n    return dataloader\n\nclass Replica(Dataset):\n    def __init__(self, cfg):\n        self.imap_mode = cfg.imap_mode\n        self.root_dir = cfg.dataset_dir\n        traj_file = os.path.join(self.root_dir, \"traj_w_c.txt\")\n        self.Twc = np.loadtxt(traj_file, delimiter=\" \").reshape([-1, 4, 4])\n        self.depth_transform = transforms.Compose(\n            [image_transforms.DepthScale(cfg.depth_scale),\n             image_transforms.DepthFilter(cfg.max_depth)])\n\n        # background semantic classes: undefined--1, undefined-0 beam-5 blinds-12 curtain-30 ceiling-31 floor-40 pillar-60 vent-92 wall-93 wall-plug-95 window-97 rug-98\n        self.background_cls_list = [5,12,30,31,40,60,92,93,95,97,98,79]\n        # Not sure: door-37 handrail-43 lamp-47 pipe-62 rack-66 shower-stall-73 stair-77 switch-79 wall-cabinet-94 picture-59\n        self.bbox_scale = 0.2  # 1 #1.5 0.9== s=1/9, s=0.2\n\n    def __len__(self):\n        return len(os.listdir(os.path.join(self.root_dir, \"depth\")))\n\n    def __getitem__(self, idx):\n        bbox_dict = {}\n        rgb_file = os.path.join(self.root_dir, \"rgb\", \"rgb_\" + str(idx) + \".png\")\n        depth_file = os.path.join(self.root_dir, \"depth\", \"depth_\" + str(idx) + \".png\")\n        inst_file = os.path.join(self.root_dir, \"semantic_instance\", \"semantic_instance_\" + str(idx) + \".png\")\n        obj_file = os.path.join(self.root_dir, \"semantic_class\", \"semantic_class_\" + 
str(idx) + \".png\")\n        depth = cv2.imread(depth_file, -1).astype(np.float32).transpose(1,0)\n        image = cv2.imread(rgb_file).astype(np.uint8)\n        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).transpose(1,0,2)\n        obj = cv2.imread(obj_file, cv2.IMREAD_UNCHANGED).astype(np.int32).transpose(1,0)   # uint16 -> int32\n        inst = cv2.imread(inst_file, cv2.IMREAD_UNCHANGED).astype(np.int32).transpose(1,0)  # uint16 -> int32\n\n        bbox_scale = self.bbox_scale\n\n        if self.imap_mode:\n            obj = np.zeros_like(obj)\n        else:\n            obj_ = np.zeros_like(obj)\n            inst_list = []\n            batch_masks = []\n            for inst_id in np.unique(inst):\n                inst_mask = inst == inst_id\n                # if np.sum(inst_mask) <= 2000: # too small    20  400\n                #     continue\n                sem_cls = np.unique(obj[inst_mask])  # sem label, only interested obj\n                assert sem_cls.shape[0] != 0\n                if sem_cls in self.background_cls_list:\n                    continue\n                obj_mask = inst == inst_id\n                batch_masks.append(obj_mask)\n                inst_list.append(inst_id)\n            if len(batch_masks) > 0:\n                batch_masks = torch.from_numpy(np.stack(batch_masks))\n                cmins, cmaxs, rmins, rmaxs = get_bbox2d_batch(batch_masks)\n\n                for i in range(batch_masks.shape[0]):\n                    w = rmaxs[i] - rmins[i]\n                    h = cmaxs[i] - cmins[i]\n                    if w <= 10 or h <= 10:  # too small   todo\n                        continue\n                    bbox_enlarged = enlarge_bbox([rmins[i], cmins[i], rmaxs[i], cmaxs[i]], scale=bbox_scale,\n                                                 w=obj.shape[1], h=obj.shape[0])\n                    # inst_list.append(inst_id)\n                    inst_id = inst_list[i]\n                    obj_[batch_masks[i]] = 1\n                    # bbox_dict.update({int(inst_id): torch.from_numpy(np.array(bbox_enlarged).reshape(-1))}) # batch format\n                    bbox_dict.update({inst_id: torch.from_numpy(np.array(\n                        [bbox_enlarged[1], bbox_enlarged[3], bbox_enlarged[0], bbox_enlarged[2]]))})  # bbox order\n\n            inst[obj_ == 0] = 0  # for background\n            obj = inst\n\n        bbox_dict.update({0: torch.from_numpy(np.array([int(0), int(obj.shape[0]), 0, int(obj.shape[1])]))})  # bbox order\n\n        T = self.Twc[idx]   # could change to ORB-SLAM pose or else\n        T_obj = np.eye(4)   # obj pose, if dynamic\n        sample = {\"image\": image, \"depth\": depth, \"T\": T, \"T_obj\": T_obj,\n                  \"obj\": obj, \"bbox_dict\": bbox_dict, \"frame_id\": idx}\n\n        if image is None or depth is None:\n            print(rgb_file)\n            print(depth_file)\n            raise ValueError\n\n        if self.depth_transform:\n            sample[\"depth\"] = self.depth_transform(sample[\"depth\"])\n\n        return sample\n\nclass ScanNet(Dataset):\n    def __init__(self, cfg):\n        self.imap_mode = cfg.imap_mode\n        self.root_dir = cfg.dataset_dir\n        self.color_paths = sorted(glob.glob(os.path.join(\n            self.root_dir, 'color', '*.jpg')), key=lambda x: int(os.path.basename(x)[:-4]))\n        self.depth_paths = sorted(glob.glob(os.path.join(\n            self.root_dir, 'depth', '*.png')), key=lambda x: int(os.path.basename(x)[:-4]))\n        self.inst_paths = sorted(glob.glob(os.path.join(\n    
        self.root_dir, 'instance-filt', '*.png')), key=lambda x: int(os.path.basename(x)[:-4])) # instance-filt\n        self.sem_paths = sorted(glob.glob(os.path.join(\n            self.root_dir, 'label-filt', '*.png')), key=lambda x: int(os.path.basename(x)[:-4]))  # label-filt\n        self.load_poses(os.path.join(self.root_dir, 'pose'))\n        self.n_img = len(self.color_paths)\n        self.depth_transform = transforms.Compose(\n            [image_transforms.DepthScale(cfg.depth_scale),\n             image_transforms.DepthFilter(cfg.max_depth)])\n        # self.rgb_transform = rgb_transform\n        self.W = cfg.W\n        self.H = cfg.H\n        self.fx = cfg.fx\n        self.fy = cfg.fy\n        self.cx = cfg.cx\n        self.cy = cfg.cy\n        self.edge = cfg.mw\n        self.intrinsic_open3d = open3d.camera.PinholeCameraIntrinsic(\n            width=self.W,\n            height=self.H,\n            fx=self.fx,\n            fy=self.fy,\n            cx=self.cx,\n            cy=self.cy,\n        )\n\n        self.min_pixels = 1500\n        # from scannetv2-labels.combined.tsv\n        #1-wall, 3-floor, 16-window, 41-ceiling, 232-light switch   0-unknown? 21-pillar 161-doorframe, shower walls-128, curtain-21, windowsill-141\n        self.background_cls_list = [-1, 0, 1, 3, 16, 41, 232, 21, 161, 128, 21]\n        self.bbox_scale = 0.2\n        self.inst_dict = {}\n\n    def load_poses(self, path):\n        self.poses = []\n        pose_paths = sorted(glob.glob(os.path.join(path, '*.txt')),\n                            key=lambda x: int(os.path.basename(x)[:-4]))\n        for pose_path in pose_paths:\n            with open(pose_path, \"r\") as f:\n                lines = f.readlines()\n            ls = []\n            for line in lines:\n                l = list(map(float, line.split(' ')))\n                ls.append(l)\n            c2w = np.array(ls).reshape(4, 4)\n            self.poses.append(c2w)\n\n    def __len__(self):\n        return self.n_img\n\n    def __getitem__(self, index):\n        bbox_scale = self.bbox_scale\n        color_path = self.color_paths[index]\n        depth_path = self.depth_paths[index]\n        inst_path = self.inst_paths[index]\n        sem_path = self.sem_paths[index]\n        color_data = cv2.imread(color_path).astype(np.uint8)\n        color_data = cv2.cvtColor(color_data, cv2.COLOR_BGR2RGB)\n        depth_data = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED).astype(np.float32)\n        depth_data = np.nan_to_num(depth_data, nan=0.)\n        T = None\n        if self.poses is not None:\n            T = self.poses[index]\n            if np.any(np.isinf(T)):\n                if index + 1 == self.__len__():\n                    print(\"pose inf!\")\n                    return None\n                return self.__getitem__(index + 1)\n\n        H, W = depth_data.shape\n        color_data = cv2.resize(color_data, (W, H), interpolation=cv2.INTER_LINEAR)\n        if self.edge:\n            edge = self.edge # crop image edge, there are invalid value on the edge of the color image\n            color_data = color_data[edge:-edge, edge:-edge]\n            depth_data = depth_data[edge:-edge, edge:-edge]\n        if self.depth_transform:\n            depth_data = self.depth_transform(depth_data)\n        bbox_dict = {}\n        if self.imap_mode:\n            inst_data = np.zeros_like(depth_data).astype(np.int32)\n        else:\n            inst_data = cv2.imread(inst_path, cv2.IMREAD_UNCHANGED)\n            inst_data = cv2.resize(inst_data, (W, H), 
interpolation=cv2.INTER_NEAREST).astype(np.int32)\n            sem_data = cv2.imread(sem_path, cv2.IMREAD_UNCHANGED)#.astype(np.int32)\n            sem_data = cv2.resize(sem_data, (W, H), interpolation=cv2.INTER_NEAREST)\n            if self.edge:\n                edge = self.edge\n                inst_data = inst_data[edge:-edge, edge:-edge]\n                sem_data = sem_data[edge:-edge, edge:-edge]\n            inst_data += 1  # shift from 0->1 , 0 is for background\n\n            # box filter\n            track_start = time.time()\n            masks = []\n            classes = []\n            # convert to list of arrays\n            obj_ids = np.unique(inst_data)\n            for obj_id in obj_ids:\n                mask = inst_data == obj_id\n                sem_cls = np.unique(sem_data[mask])\n                if sem_cls in self.background_cls_list:\n                    inst_data[mask] = 0     # set to background\n                    continue\n                masks.append(mask)\n                classes.append(obj_id)\n            T_CW = np.linalg.inv(T)\n            inst_data = box_filter(masks, classes, depth_data, self.inst_dict, self.intrinsic_open3d, T_CW, min_pixels=self.min_pixels)\n\n            merged_obj_ids = np.unique(inst_data)\n            for obj_id in merged_obj_ids:\n                mask = inst_data == obj_id\n                bbox2d = get_bbox2d(mask, bbox_scale=bbox_scale)\n                if bbox2d is None:\n                    inst_data[mask] = 0 # set to bg\n                else:\n                    min_x, min_y, max_x, max_y = bbox2d\n                    bbox_dict.update({int(obj_id): torch.from_numpy(np.array([min_x, max_x, min_y, max_y]).reshape(-1))})  # batch format\n            bbox_time = time.time()\n            # print(\"bbox time \", bbox_time - filter_time)\n            cv2.imshow(\"inst\", imgviz.label2rgb(inst_data))\n            cv2.waitKey(1)\n            print(\"frame {} track time {}\".format(index, bbox_time-track_start))\n\n        bbox_dict.update({0: torch.from_numpy(np.array([int(0), int(inst_data.shape[1]), 0, int(inst_data.shape[0])]))})  # bbox order\n        # wrap data to frame dict\n        T_obj = np.identity(4)\n        sample = {\"image\": color_data.transpose(1,0,2), \"depth\": depth_data.transpose(1,0), \"T\": T, \"T_obj\": T_obj}\n        if color_data is None or depth_data is None:\n            print(color_path)\n            print(depth_path)\n            raise ValueError\n\n        sample.update({\"obj\": inst_data.transpose(1,0)})\n        sample.update({\"bbox_dict\": bbox_dict})\n        return sample\n"
  },
  {
    "path": "embedding.py",
    "content": "import torch\nimport numpy as np\n\ndef positional_encoding(\n    tensor,\n    B_layer=None,\n    num_encoding_functions=6,\n    scale=10.\n):\n    if B_layer is not None:\n        embedding_gauss = B_layer(tensor / scale)\n        embedding_gauss = torch.sin(embedding_gauss)\n        embedding = embedding_gauss\n    else:\n        frequency_bands = 2.0 ** torch.linspace(\n            0.0,\n            num_encoding_functions - 1,\n            num_encoding_functions,\n            dtype=tensor.dtype,\n            device=tensor.device,\n        )\n\n        n_repeat = num_encoding_functions * 2 + 1\n        embedding = tensor[..., None, :].repeat(1, 1, n_repeat, 1) / scale\n        even_idx = np.arange(1, num_encoding_functions + 1) * 2\n        odd_idx = even_idx - 1\n\n        frequency_bands = frequency_bands[None, None, :, None]\n\n        embedding[:, :, even_idx, :] = torch.cos(\n            frequency_bands * embedding[:, :, even_idx, :])\n        embedding[:, :, odd_idx, :] = torch.sin(\n            frequency_bands * embedding[:, :, odd_idx, :])\n\n        n_dim = tensor.shape[-1]\n        embedding = embedding.view(\n            embedding.shape[0], embedding.shape[1], n_repeat * n_dim)\n        # print(\"embedding \", embedding.shape)\n        embedding = embedding.squeeze(0)\n\n    return embedding\n\nclass UniDirsEmbed(torch.nn.Module):\n    def __init__(self, min_deg=0, max_deg=2, scale=2.):\n        super(UniDirsEmbed, self).__init__()\n        self.min_deg = min_deg\n        self.max_deg = max_deg\n        self.n_freqs = max_deg - min_deg + 1\n        self.tensor_scale = torch.tensor(scale, requires_grad=False)\n\n        dirs = torch.tensor([\n        0.8506508, 0, 0.5257311,\n        0.809017, 0.5, 0.309017,\n        0.5257311, 0.8506508, 0,\n        1, 0, 0,\n        0.809017, 0.5, -0.309017,\n        0.8506508, 0, -0.5257311,\n        0.309017, 0.809017, -0.5,\n        0, 0.5257311, -0.8506508,\n        0.5, 0.309017, -0.809017,\n        0, 1, 0,\n        -0.5257311, 0.8506508, 0,\n        -0.309017, 0.809017, -0.5,\n        0, 0.5257311, 0.8506508,\n        -0.309017, 0.809017, 0.5,\n        0.309017, 0.809017, 0.5,\n        0.5, 0.309017, 0.809017,\n        0.5, -0.309017, 0.809017,\n        0, 0, 1,\n        -0.5, 0.309017, 0.809017,\n        -0.809017, 0.5, 0.309017,\n        -0.809017, 0.5, -0.309017\n        ]).reshape(-1, 3)\n\n        self.B_layer = torch.nn.Linear(3, 21, bias=False)\n        self.B_layer.weight.data = dirs\n\n        frequency_bands = 2.0 ** torch.linspace(self.min_deg, self.max_deg, self.n_freqs)\n        self.register_buffer(\"frequency_bands\", frequency_bands, persistent=False)\n        self.register_buffer(\"scale\", self.tensor_scale, persistent=True)\n\n    def forward(self, x):\n        tensor = x / self.scale   # functorch needs buffer, otherwise changed\n        proj = self.B_layer(tensor)\n        proj_bands = proj[..., None, :] * self.frequency_bands[None, None, :, None]\n        xb = proj_bands.view(list(proj.shape[:-1]) + [-1])\n        # embedding = torch.sin(torch.cat([xb, xb + 0.5 * np.pi], dim=-1))\n        embedding = torch.sin(xb * np.pi)\n        embedding = torch.cat([tensor] + [embedding], dim=-1)\n        # print(\"emb size \", embedding.shape)\n        return embedding"
  },
  {
    "path": "environment.yml",
    "content": "name: vmap\nchannels:\n  - pytorch\n  - defaults\ndependencies:\n  - _libgcc_mutex=0.1=main\n  - _openmp_mutex=5.1=1_gnu\n  - blas=1.0=mkl\n  - brotlipy=0.7.0=py38h27cfd23_1003\n  - bzip2=1.0.8=h7b6447c_0\n  - ca-certificates=2022.10.11=h06a4308_0\n  - certifi=2022.9.24=py38h06a4308_0\n  - cffi=1.15.1=py38h5eee18b_2\n  - charset-normalizer=2.0.4=pyhd3eb1b0_0\n  - cryptography=38.0.1=py38h9ce1e76_0\n  - cudatoolkit=11.3.1=h2bc3f7f_2\n  - ffmpeg=4.3=hf484d3e_0\n  - freetype=2.12.1=h4a9f257_0\n  - giflib=5.2.1=h7b6447c_0\n  - gmp=6.2.1=h295c915_3\n  - gnutls=3.6.15=he1e5248_0\n  - idna=3.4=py38h06a4308_0\n  - intel-openmp=2021.4.0=h06a4308_3561\n  - jpeg=9e=h7f8727e_0\n  - lame=3.100=h7b6447c_0\n  - lcms2=2.12=h3be6417_0\n  - ld_impl_linux-64=2.38=h1181459_1\n  - lerc=3.0=h295c915_0\n  - libdeflate=1.8=h7f8727e_5\n  - libffi=3.4.2=h6a678d5_6\n  - libgcc-ng=11.2.0=h1234567_1\n  - libgomp=11.2.0=h1234567_1\n  - libiconv=1.16=h7f8727e_2\n  - libidn2=2.3.2=h7f8727e_0\n  - libpng=1.6.37=hbc83047_0\n  - libstdcxx-ng=11.2.0=h1234567_1\n  - libtasn1=4.16.0=h27cfd23_0\n  - libtiff=4.4.0=hecacb30_2\n  - libunistring=0.9.10=h27cfd23_0\n  - libwebp=1.2.4=h11a3e52_0\n  - libwebp-base=1.2.4=h5eee18b_0\n  - lz4-c=1.9.3=h295c915_1\n  - mkl=2021.4.0=h06a4308_640\n  - mkl-service=2.4.0=py38h7f8727e_0\n  - mkl_fft=1.3.1=py38hd3c417c_0\n  - mkl_random=1.2.2=py38h51133e4_0\n  - ncurses=6.3=h5eee18b_3\n  - nettle=3.7.3=hbbd107a_1\n  - numpy=1.23.4=py38h14f4228_0\n  - numpy-base=1.23.4=py38h31eccc5_0\n  - openh264=2.1.1=h4ff587b_0\n  - openssl=1.1.1s=h7f8727e_0\n  - pillow=9.2.0=py38hace64e9_1\n  - pip=22.2.2=py38h06a4308_0\n  - pycparser=2.21=pyhd3eb1b0_0\n  - pyopenssl=22.0.0=pyhd3eb1b0_0\n  - pysocks=1.7.1=py38h06a4308_0\n  - python=3.8.15=h7a1cb2a_2\n  - pytorch=1.12.1=py3.8_cuda11.3_cudnn8.3.2_0\n  - pytorch-mutex=1.0=cuda\n  - readline=8.2=h5eee18b_0\n  - requests=2.28.1=py38h06a4308_0\n  - setuptools=65.5.0=py38h06a4308_0\n  - six=1.16.0=pyhd3eb1b0_1\n  - sqlite=3.40.0=h5082296_0\n  - tk=8.6.12=h1ccaba5_0\n  - torchaudio=0.12.1=py38_cu113\n  - torchvision=0.13.1=py38_cu113\n  - typing_extensions=4.3.0=py38h06a4308_0\n  - urllib3=1.26.12=py38h06a4308_0\n  - wheel=0.37.1=pyhd3eb1b0_0\n  - xz=5.2.6=h5eee18b_0\n  - zlib=1.2.13=h5eee18b_0\n  - zstd=1.5.2=ha4553b6_0\n  - pip:\n    - antlr4-python3-runtime==4.9.3\n    - bidict==0.22.0\n    - click==8.1.3\n    - dash==2.7.0\n    - dash-core-components==2.0.0\n    - dash-html-components==2.0.0\n    - dash-table==5.0.0\n    - entrypoints==0.4\n    - flask==2.2.2\n    - functorch==0.2.0\n    - fvcore==0.1.5.post20221122\n    - h5py==3.7.0\n    - imageio==2.22.4\n    - importlib-metadata==5.1.0\n    - iopath==0.1.10\n    - itsdangerous==2.1.2\n    - lpips==0.1.4\n    - nbformat==5.5.0\n    - omegaconf==2.2.3\n    - open3d==0.16.0\n    - pexpect==4.8.0\n    - plotly==5.11.0\n    - pycocotools==2.0.6\n    - pyquaternion==0.9.9\n    - scikit-image==0.19.3\n    - tenacity==8.1.0\n    - termcolor==2.1.1\n    - timm==0.6.12\n    - werkzeug==2.2.2\n    - yacs==0.1.8\n    - zipp==3.11.0\n\n"
  },
  {
    "path": "image_transforms.py",
    "content": "import cv2\nimport numpy as np\n\n\nclass BGRtoRGB(object):\n    \"\"\"bgr format to rgb\"\"\"\n\n    def __call__(self, image):\n        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n        return image\n\n\nclass DepthScale(object):\n    \"\"\"scale depth to meters\"\"\"\n\n    def __init__(self, scale):\n        self.scale = scale\n\n    def __call__(self, depth):\n        depth = depth.astype(np.float32)\n        return depth * self.scale\n\n\nclass DepthFilter(object):\n    \"\"\"scale depth to meters\"\"\"\n\n    def __init__(self, max_depth):\n        self.max_depth = max_depth\n\n    def __call__(self, depth):\n        far_mask = depth > self.max_depth\n        depth[far_mask] = 0.\n        return depth\n\n\nclass Undistort(object):\n    \"\"\"scale depth to meters\"\"\"\n\n    def __init__(self,\n                 w, h,\n                 fx, fy, cx, cy,\n                 k1, k2, k3, k4, k5, k6,\n                 p1, p2,\n                 interpolation):\n        self.interpolation = interpolation\n        K = np.array([[fx, 0., cx],\n                      [0., fy, cy],\n                      [0., 0., 1.]])\n\n        self.map1x, self.map1y = cv2.initUndistortRectifyMap(\n            K,\n            np.array([k1, k2, p1, p2, k3, k4, k5, k6]),\n            np.eye(3),\n            K,\n            (w, h),\n            cv2.CV_32FC1)\n\n    def __call__(self, im):\n        im = cv2.remap(im, self.map1x, self.map1y, self.interpolation)\n        return im\n"
  },
  {
    "path": "loss.py",
    "content": "import torch\nimport render_rays\nimport torch.nn.functional as F\n\ndef step_batch_loss(alpha, color, gt_depth, gt_color, sem_labels, mask_depth, z_vals,\n                    color_scaling=5.0, opacity_scaling=10.0):\n    \"\"\"\n    apply depth where depth are valid                                       -> mask_depth\n    apply depth, color loss on this_obj & unkown_obj == (~other_obj)        -> mask_obj\n    apply occupancy/opacity loss on this_obj & other_obj == (~unknown_obj)  -> mask_sem\n\n    output:\n    loss for training\n    loss_all for per sample, could be used for active sampling, replay buffer\n    \"\"\"\n    mask_obj = sem_labels != 0\n    mask_obj = mask_obj.detach()\n    mask_sem = sem_labels != 2\n    mask_sem = mask_sem.detach()\n\n    alpha = alpha.squeeze(dim=-1)\n    color = color.squeeze(dim=-1)\n\n    occupancy = render_rays.occupancy_activation(alpha)\n    termination = render_rays.occupancy_to_termination(occupancy, is_batch=True)  # shape [num_batch, num_ray, points_per_ray]\n\n    render_depth = render_rays.render(termination, z_vals)\n    diff_sq = (z_vals - render_depth[..., None]) ** 2\n    var = render_rays.render(termination, diff_sq).detach()  # must detach here!\n    render_color = render_rays.render(termination[..., None], color, dim=-2)\n    render_opacity = torch.sum(termination, dim=-1)     # similar to obj-nerf opacity loss\n\n    # 2D depth loss: only on valid depth & mask\n    # [mask_depth & mask_obj]\n    # loss_all = torch.zeros_like(render_depth)\n    loss_depth_raw = render_rays.render_loss(render_depth, gt_depth, loss=\"L1\", normalise=False)\n    loss_depth = torch.mul(loss_depth_raw, mask_depth & mask_obj)   # keep dim but set invalid element be zero\n    # loss_all += loss_depth\n    loss_depth = render_rays.reduce_batch_loss(loss_depth, var=var, avg=True, mask=mask_depth & mask_obj)   # apply var as imap\n\n    # 2D color loss: only on obj mask\n    # [mask_obj]\n    loss_col_raw = render_rays.render_loss(render_color, gt_color, loss=\"L1\", normalise=False)\n    loss_col = torch.mul(loss_col_raw.sum(-1), mask_obj)\n    # loss_all += loss_col / 3. * color_scaling\n    loss_col = render_rays.reduce_batch_loss(loss_col, var=None, avg=True, mask=mask_obj)\n\n    # 2D occupancy/opacity loss: apply except unknown area\n    # [mask_sem]\n    # loss_opacity_raw = F.mse_loss(torch.clamp(render_opacity, 0, 1), mask_obj.float().detach()) # encourage other_obj to be empty, while this_obj to be solid\n    # print(\"opacity max \", torch.max(render_opacity.max()))\n    # print(\"opacity min \", torch.max(render_opacity.min()))\n    loss_opacity_raw = render_rays.render_loss(render_opacity, mask_obj.float(), loss=\"L1\", normalise=False)\n    loss_opacity = torch.mul(loss_opacity_raw, mask_sem)  # but ignore -1 unkown area e.g., mask edges\n    # loss_all += loss_opacity * opacity_scaling\n    loss_opacity = render_rays.reduce_batch_loss(loss_opacity, var=None, avg=True, mask=mask_sem)   # todo var\n\n    # loss for bp\n    l_batch = loss_depth + loss_col * color_scaling + loss_opacity * opacity_scaling\n    loss = l_batch.sum()\n\n    return loss, None       # return loss, loss_all.detach()\n"
  },
  {
    "path": "metric/eval_3D_obj.py",
    "content": "import numpy as np\nfrom tqdm import tqdm\nimport trimesh\nfrom metrics import accuracy, completion, completion_ratio\nimport os\nimport json\n\ndef calc_3d_metric(mesh_rec, mesh_gt, N=200000):\n    \"\"\"\n    3D reconstruction metric.\n    \"\"\"\n    metrics = [[] for _ in range(4)]\n    transform, extents = trimesh.bounds.oriented_bounds(mesh_gt)\n    extents = extents / 0.9  # enlarge 0.9\n    box = trimesh.creation.box(extents=extents, transform=np.linalg.inv(transform))\n    mesh_rec = mesh_rec.slice_plane(box.facets_origin, -box.facets_normal)\n    if mesh_rec.vertices.shape[0] == 0:\n        print(\"no mesh found\")\n        return\n    rec_pc = trimesh.sample.sample_surface(mesh_rec, N)\n    rec_pc_tri = trimesh.PointCloud(vertices=rec_pc[0])\n\n    gt_pc = trimesh.sample.sample_surface(mesh_gt, N)\n    gt_pc_tri = trimesh.PointCloud(vertices=gt_pc[0])\n    accuracy_rec = accuracy(gt_pc_tri.vertices, rec_pc_tri.vertices)\n    completion_rec = completion(gt_pc_tri.vertices, rec_pc_tri.vertices)\n    completion_ratio_rec = completion_ratio(gt_pc_tri.vertices, rec_pc_tri.vertices, 0.05)\n    completion_ratio_rec_1 = completion_ratio(gt_pc_tri.vertices, rec_pc_tri.vertices, 0.01)\n\n    # accuracy_rec *= 100  # convert to cm\n    # completion_rec *= 100  # convert to cm\n    # completion_ratio_rec *= 100  # convert to %\n    # print('accuracy: ', accuracy_rec)\n    # print('completion: ', completion_rec)\n    # print('completion ratio: ', completion_ratio_rec)\n    # print(\"completion_ratio_rec_1cm \", completion_ratio_rec_1)\n    metrics[0].append(accuracy_rec)\n    metrics[1].append(completion_rec)\n    metrics[2].append(completion_ratio_rec_1)\n    metrics[3].append(completion_ratio_rec)\n    return metrics\n\ndef get_gt_bg_mesh(gt_dir, background_cls_list):\n    with open(os.path.join(gt_dir, \"info_semantic.json\")) as f:\n        label_obj_list = json.load(f)[\"objects\"]\n\n    bg_meshes = []\n    for obj in label_obj_list:\n        if int(obj[\"class_id\"]) in background_cls_list:\n            obj_file = os.path.join(gt_dir, \"mesh_semantic.ply_\" + str(int(obj[\"id\"])) + \".ply\")\n            obj_mesh = trimesh.load(obj_file)\n            bg_meshes.append(obj_mesh)\n\n    bg_mesh = trimesh.util.concatenate(bg_meshes)\n    return bg_mesh\n\ndef get_obj_ids(obj_dir):\n    files = os.listdir(obj_dir)\n    obj_ids = []\n    for f in files:\n        obj_id = f.split(\"obj\")[1][:-1]\n        if obj_id == '':\n            continue\n        obj_ids.append(int(obj_id))\n    return obj_ids\n\n\nif __name__ == \"__main__\":\n    background_cls_list = [5, 12, 30, 31, 40, 60, 92, 93, 95, 97, 98, 79]\n    exp_name = [\"room0\", \"room1\", \"room2\", \"office0\", \"office1\", \"office2\", \"office3\", \"office4\"]\n    data_dir = \"/home/xin/data/vmap/\"\n    log_dir = \"../logs/iMAP/\"\n    # log_dir = \"../logs/vMAP/\"\n\n    for exp in tqdm(exp_name):\n        gt_dir = os.path.join(data_dir, exp[:-1]+\"_\"+exp[-1]+\"/habitat\")\n        exp_dir = os.path.join(log_dir, exp)\n        mesh_dir = os.path.join(exp_dir, \"scene_mesh\")\n        output_path = os.path.join(exp_dir, \"eval_mesh\")\n        os.makedirs(output_path, exist_ok=True)\n        metrics_3D = [[] for _ in range(4)]\n\n        # get obj ids\n        # obj_ids = np.loadtxt()    # todo use a pre-defined obj list or use vMAP results\n        obj_ids = get_obj_ids(mesh_dir.replace(\"iMAP\", \"vMAP\"))\n        for obj_id in tqdm(obj_ids):\n            if obj_id == 0: # for bg\n                N = 200000\n  
              mesh_gt = get_gt_bg_mesh(gt_dir, background_cls_list)\n            else:   # for obj\n                N = 10000\n                obj_file = os.path.join(gt_dir, \"mesh_semantic.ply_\" + str(obj_id) + \".ply\")\n                mesh_gt = trimesh.load(obj_file)\n\n            if \"vMAP\" in exp_dir:\n                rec_meshfile = os.path.join(mesh_dir, \"frame_1999_obj\"+str(obj_id)+\".obj\")\n            elif \"iMAP\" in exp_dir:\n                rec_meshfile = os.path.join(mesh_dir, \"frame_1999_obj0.obj\")\n            else:\n                print(\"Not Implement\")\n                exit(-1)\n\n            mesh_rec = trimesh.load(rec_meshfile)\n            # mesh_rec.invert()   # niceslam mesh face needs invert\n            metrics = calc_3d_metric(mesh_rec, mesh_gt, N=N)  # for objs use 10k, for scene use 200k points\n            if metrics is None:\n                continue\n            np.save(output_path + '/metric_obj{}.npy'.format(obj_id), np.array(metrics))\n            metrics_3D[0].append(metrics[0])    # acc\n            metrics_3D[1].append(metrics[1])    # comp\n            metrics_3D[2].append(metrics[2])    # comp ratio 1cm\n            metrics_3D[3].append(metrics[3])    # comp ratio 5cm\n        metrics_3D = np.array(metrics_3D)\n        np.save(output_path + '/metrics_3D_obj.npy', metrics_3D)\n        print(\"metrics 3D obj \\n Acc | Comp | Comp Ratio 1cm | Comp Ratio 5cm \\n\", metrics_3D.mean(axis=1))\n        print(\"-----------------------------------------\")\n        print(\"finish exp \", exp)"
  },
  {
    "path": "metric/eval_3D_scene.py",
    "content": "import numpy as np\nfrom tqdm import tqdm\nimport trimesh\nfrom metrics import accuracy, completion, completion_ratio\nimport os\n\ndef calc_3d_metric(mesh_rec, mesh_gt, N=200000):\n    \"\"\"\n    3D reconstruction metric.\n    \"\"\"\n    metrics = [[] for _ in range(4)]\n    rec_pc = trimesh.sample.sample_surface(mesh_rec, N)\n    rec_pc_tri = trimesh.PointCloud(vertices=rec_pc[0])\n\n    gt_pc = trimesh.sample.sample_surface(mesh_gt, N)\n    gt_pc_tri = trimesh.PointCloud(vertices=gt_pc[0])\n    accuracy_rec = accuracy(gt_pc_tri.vertices, rec_pc_tri.vertices)\n    completion_rec = completion(gt_pc_tri.vertices, rec_pc_tri.vertices)\n    completion_ratio_rec = completion_ratio(gt_pc_tri.vertices, rec_pc_tri.vertices, 0.05)\n    completion_ratio_rec_1 = completion_ratio(gt_pc_tri.vertices, rec_pc_tri.vertices, 0.01)\n\n    # accuracy_rec *= 100  # convert to cm\n    # completion_rec *= 100  # convert to cm\n    # completion_ratio_rec *= 100  # convert to %\n    # print('accuracy: ', accuracy_rec)\n    # print('completion: ', completion_rec)\n    # print('completion ratio: ', completion_ratio_rec)\n    # print(\"completion_ratio_rec_1cm \", completion_ratio_rec_1)\n    metrics[0].append(accuracy_rec)\n    metrics[1].append(completion_rec)\n    metrics[2].append(completion_ratio_rec_1)\n    metrics[3].append(completion_ratio_rec)\n    return metrics\n\n\nif __name__ == \"__main__\":\n    exp_name = [\"room0\", \"room1\", \"room2\", \"office0\", \"office1\", \"office2\", \"office3\", \"office4\"]\n    data_dir = \"/home/xin/data/vmap/\"\n    # log_dir = \"../logs/iMAP/\"\n    log_dir = \"../logs/vMAP/\"\n\n    for exp in tqdm(exp_name):\n        gt_dir = os.path.join(data_dir, exp[:-1]+\"_\"+exp[-1]+\"/habitat\")\n        exp_dir = os.path.join(log_dir, exp)\n        mesh_dir = os.path.join(exp_dir, \"scene_mesh\")\n        output_path = os.path.join(exp_dir, \"eval_mesh\")\n        os.makedirs(output_path, exist_ok=True)\n        if \"vMAP\" in exp_dir:\n            mesh_list = os.listdir(mesh_dir)\n            if \"frame_1999_scene.obj\" in mesh_list:\n                rec_meshfile = os.path.join(mesh_dir, \"frame_1999_scene.obj\")\n            else: # compose obj into scene mesh\n                scene_meshes = []\n                for f in mesh_list:\n                    _, f_type = os.path.splitext(f)\n                    if f_type == \".obj\" or f_type == \".ply\":\n                        obj_mesh = trimesh.load(os.path.join(mesh_dir, f))\n                        scene_meshes.append(obj_mesh)\n                scene_mesh = trimesh.util.concatenate(scene_meshes)\n                scene_mesh.export(os.path.join(mesh_dir, \"frame_1999_scene.obj\"))\n                rec_meshfile = os.path.join(mesh_dir, \"frame_1999_scene.obj\")\n        elif \"iMAP\" in exp_dir: # obj0 is the scene mesh\n            rec_meshfile = os.path.join(mesh_dir, \"frame_1999_obj0.obj\")\n        else:\n            print(\"Not Implement\")\n            exit(-1)\n        gt_mesh_files = os.listdir(gt_dir)\n        gt_mesh_file = os.path.join(gt_dir, \"../mesh.ply\")\n        mesh_rec = trimesh.load(rec_meshfile)\n        # mesh_rec.invert()   # niceslam mesh face needs invert\n        metrics_3D = [[] for _ in range(4)]\n        mesh_gt = trimesh.load(gt_mesh_file)\n        metrics = calc_3d_metric(mesh_rec, mesh_gt, N=200000)  # for objs use 10k, for scene use 200k points\n        metrics_3D[0].append(metrics[0])    # acc\n        metrics_3D[1].append(metrics[1])    # comp\n        
metrics_3D[2].append(metrics[2])    # comp ratio 1cm\n        metrics_3D[3].append(metrics[3])    # comp ratio 5cm\n        metrics_3D = np.array(metrics_3D)\n        np.save(output_path + '/metrics_3D_scene.npy', metrics_3D)\n        print(\"metrics 3D scene \\n Acc | Comp | Comp Ratio 1cm | Comp Ratio 5cm \\n \", metrics_3D.mean(axis=1))\n        print(\"-----------------------------------------\")\n        print(\"finish exp \", exp)"
  },
  {
    "path": "metric/metrics.py",
    "content": "import numpy as np\nfrom scipy.spatial import cKDTree as KDTree\n\ndef completion_ratio(gt_points, rec_points, dist_th=0.01):\n    gen_points_kd_tree = KDTree(rec_points)\n    one_distances, one_vertex_ids = gen_points_kd_tree.query(gt_points)\n    completion = np.mean((one_distances < dist_th).astype(np.float))\n    return completion\n\n\ndef accuracy(gt_points, rec_points):\n    gt_points_kd_tree = KDTree(gt_points)\n    two_distances, two_vertex_ids = gt_points_kd_tree.query(rec_points)\n    gen_to_gt_chamfer = np.mean(two_distances)\n    return gen_to_gt_chamfer\n\n\ndef completion(gt_points, rec_points):\n    gt_points_kd_tree = KDTree(rec_points)\n    one_distances, two_vertex_ids = gt_points_kd_tree.query(gt_points)\n    gt_to_gen_chamfer = np.mean(one_distances)\n    return gt_to_gen_chamfer\n\n\ndef chamfer(gt_points, rec_points):\n    # one direction\n    gen_points_kd_tree = KDTree(rec_points)\n    one_distances, one_vertex_ids = gen_points_kd_tree.query(gt_points)\n    gt_to_gen_chamfer = np.mean(one_distances)\n\n    # other direction\n    gt_points_kd_tree = KDTree(gt_points)\n    two_distances, two_vertex_ids = gt_points_kd_tree.query(rec_points)\n    gen_to_gt_chamfer = np.mean(two_distances)\n\n    return (gt_to_gen_chamfer + gen_to_gt_chamfer) / 2.\n\n"
  },
  {
    "path": "model.py",
    "content": "import torch\nimport torch.nn as nn\n\ndef init_weights(m, init_fn=torch.nn.init.xavier_normal_):\n    if type(m) == torch.nn.Linear:\n        init_fn(m.weight)\n\n\ndef fc_block(in_f, out_f):\n    return torch.nn.Sequential(\n        torch.nn.Linear(in_f, out_f),\n        torch.nn.ReLU(out_f)\n    )\n\n\nclass OccupancyMap(torch.nn.Module):\n    def __init__(\n        self,\n        emb_size1,\n        emb_size2,\n        hidden_size=256,\n        do_color=True,\n        hidden_layers_block=1\n    ):\n        super(OccupancyMap, self).__init__()\n        self.do_color = do_color\n        self.embedding_size1 = emb_size1\n        self.in_layer = fc_block(self.embedding_size1, hidden_size)\n\n        hidden1 = [fc_block(hidden_size, hidden_size)\n                   for _ in range(hidden_layers_block)]\n        self.mid1 = torch.nn.Sequential(*hidden1)\n        # self.embedding_size2 = 21*(5+1)+3 - self.embedding_size # 129-66=63 32\n        self.embedding_size2 = emb_size2\n        self.cat_layer = fc_block(\n            hidden_size + self.embedding_size1, hidden_size)\n\n        # self.cat_layer = fc_block(\n        #     hidden_size , hidden_size)\n\n        hidden2 = [fc_block(hidden_size, hidden_size)\n                   for _ in range(hidden_layers_block)]\n        self.mid2 = torch.nn.Sequential(*hidden2)\n\n        self.out_alpha = torch.nn.Linear(hidden_size, 1)\n\n        if self.do_color:\n            self.color_linear = fc_block(self.embedding_size2 + hidden_size, hidden_size)\n            self.out_color = torch.nn.Linear(hidden_size, 3)\n\n        # self.relu = torch.nn.functional.relu\n        self.sigmoid = torch.sigmoid\n\n    def forward(self, x,\n                noise_std=None,\n                do_alpha=True,\n                do_color=True,\n                do_cat=True):\n        fc1 = self.in_layer(x[...,:self.embedding_size1])\n        fc2 = self.mid1(fc1)\n        # fc3 = self.cat_layer(fc2)\n        if do_cat:\n            fc2_x = torch.cat((fc2, x[...,:self.embedding_size1]), dim=-1)\n            fc3 = self.cat_layer(fc2_x)\n        else:\n            fc3 = fc2\n        fc4 = self.mid2(fc3)\n\n        alpha = None\n        if do_alpha:\n            raw = self.out_alpha(fc4)   # todo ignore noise\n            if noise_std is not None:\n                noise = torch.randn(raw.shape, device=x.device) * noise_std\n                raw = raw + noise\n\n            # alpha = self.relu(raw) * scale    # nerf\n            alpha = raw * 10. #self.scale     # unisurf\n\n        color = None\n        if self.do_color and do_color:\n            fc4_cat = self.color_linear(torch.cat((fc4, x[..., self.embedding_size1:]), dim=-1))\n            raw_color = self.out_color(fc4_cat)\n            color = self.sigmoid(raw_color)\n\n        return alpha, color\n\n\n\n"
  },
  {
    "path": "render_rays.py",
    "content": "import torch\nimport numpy as np\n\ndef occupancy_activation(alpha, distances=None):\n    # occ = 1.0 - torch.exp(-alpha * distances)\n    occ = torch.sigmoid(alpha)    # unisurf\n\n    return occ\n\ndef alpha_to_occupancy(depths, dirs, alpha, add_last=False):\n    interval_distances = depths[..., 1:] - depths[..., :-1]\n    if add_last:\n        last_distance = torch.empty(\n            (depths.shape[0], 1),\n            device=depths.device,\n            dtype=depths.dtype).fill_(0.1)\n        interval_distances = torch.cat(\n            [interval_distances, last_distance], dim=-1)\n\n    dirs_norm = torch.norm(dirs, dim=-1)\n    interval_distances = interval_distances * dirs_norm[:, None]\n    occ = occupancy_activation(alpha, interval_distances)\n\n    return occ\n\ndef occupancy_to_termination(occupancy, is_batch=False):\n    if is_batch:\n        first = torch.ones(list(occupancy.shape[:2]) + [1], device=occupancy.device)\n        free_probs = (1. - occupancy + 1e-10)[:, :, :-1]\n    else:\n        first = torch.ones([occupancy.shape[0], 1], device=occupancy.device)\n        free_probs = (1. - occupancy + 1e-10)[:, :-1]\n    free_probs = torch.cat([first, free_probs], dim=-1)\n    term_probs = occupancy * torch.cumprod(free_probs, dim=-1)\n\n    # using escape probability\n    # occupancy = occupancy[:, :-1]\n    # first = torch.ones([occupancy.shape[0], 1], device=occupancy.device)\n    # free_probs = (1. - occupancy + 1e-10)\n    # free_probs = torch.cat([first, free_probs], dim=-1)\n    # last = torch.ones([occupancy.shape[0], 1], device=occupancy.device)\n    # occupancy = torch.cat([occupancy, last], dim=-1)\n    # term_probs = occupancy * torch.cumprod(free_probs, dim=-1)\n\n    return term_probs\n\ndef render(termination, vals, dim=-1):\n    weighted_vals = termination * vals\n    render = weighted_vals.sum(dim=dim)\n\n    return render\n\ndef render_loss(render, gt, loss=\"L1\", normalise=False):\n    residual = render - gt\n    if loss == \"L2\":\n        loss_mat = residual ** 2\n    elif loss == \"L1\":\n        loss_mat = torch.abs(residual)\n    else:\n        print(\"loss type {} not implemented!\".format(loss))\n\n    if normalise:\n        loss_mat = loss_mat / gt\n\n    return loss_mat\n\ndef reduce_batch_loss(loss_mat, var=None, avg=True, mask=None, loss_type=\"L1\"):\n    mask_num = torch.sum(mask, dim=-1)\n    if (mask_num == 0).any():   # no valid sample, return 0 loss\n        loss = torch.zeros_like(loss_mat)\n        if avg:\n            loss = torch.mean(loss, dim=-1)\n        return loss\n    if var is not None:\n        eps = 1e-4\n        if loss_type == \"L2\":\n            information = 1.0 / (var + eps)\n        elif loss_type == \"L1\":\n            information = 1.0 / (torch.sqrt(var) + eps)\n\n        loss_weighted = loss_mat * information\n    else:\n        loss_weighted = loss_mat\n\n    if avg:\n        if mask is not None:\n            loss = (torch.sum(loss_weighted, dim=-1)/(torch.sum(mask, dim=-1)+1e-10))\n            if (loss > 100000).any():\n                print(\"loss explode\")\n                exit(-1)\n        else:\n            loss = torch.mean(loss_weighted, dim=-1).sum()\n    else:\n        loss = loss_weighted\n\n    return loss\n\ndef make_3D_grid(occ_range=[-1., 1.], dim=256, device=\"cuda:0\", transform=None, scale=None):\n    t = torch.linspace(occ_range[0], occ_range[1], steps=dim, device=device)\n    grid = torch.meshgrid(t, t, t)\n    grid_3d = torch.cat(\n        (grid[0][..., None],\n         grid[1][..., 
None],\n         grid[2][..., None]), dim=3\n    )\n\n    if scale is not None:\n        grid_3d = grid_3d * scale\n    if transform is not None:\n        R1 = transform[None, None, None, 0, :3]\n        R2 = transform[None, None, None, 1, :3]\n        R3 = transform[None, None, None, 2, :3]\n\n        grid1 = (R1 * grid_3d).sum(-1, keepdim=True)\n        grid2 = (R2 * grid_3d).sum(-1, keepdim=True)\n        grid3 = (R3 * grid_3d).sum(-1, keepdim=True)\n        grid_3d = torch.cat([grid1, grid2, grid3], dim=-1)\n\n        trans = transform[None, None, None, :3, 3]\n        grid_3d = grid_3d + trans\n\n    return grid_3d\n"
  },
  {
    "path": "train.py",
    "content": "import time\nimport loss\nfrom vmap import *\nimport utils\nimport open3d\nimport dataset\nimport vis\nfrom functorch import vmap\nimport argparse\nfrom cfg import Config\nimport shutil\n\nif __name__ == \"__main__\":\n    #############################################\n    # init config\n    torch.backends.cudnn.enabled = True\n    torch.backends.cudnn.benchmark = True\n\n    # setting params\n    parser = argparse.ArgumentParser(description='Model training for single GPU')\n    parser.add_argument('--logdir', default='./logs/debug',\n                        type=str)\n    parser.add_argument('--config',\n                        default='./configs/Replica/config_replica_room0_vMAP.json',\n                        type=str)\n    parser.add_argument('--save_ckpt',\n                        default=False,\n                        type=bool)\n    args = parser.parse_args()\n\n    log_dir = args.logdir\n    config_file = args.config\n    save_ckpt = args.save_ckpt\n    os.makedirs(log_dir, exist_ok=True)  # saving logs\n    shutil.copy(config_file, log_dir)\n    cfg = Config(config_file)       # config params\n    n_sample_per_step = cfg.n_per_optim\n    n_sample_per_step_bg = cfg.n_per_optim_bg\n\n    # param for vis\n    vis3d = open3d.visualization.Visualizer()\n    vis3d.create_window(window_name=\"3D mesh vis\",\n                        width=cfg.W,\n                        height=cfg.H,\n                        left=600, top=50)\n    view_ctl = vis3d.get_view_control()\n    view_ctl.set_constant_z_far(10.)\n\n    # set camera\n    cam_info = cameraInfo(cfg)\n    intrinsic_open3d = open3d.camera.PinholeCameraIntrinsic(\n        width=cfg.W,\n        height=cfg.H,\n        fx=cfg.fx,\n        fy=cfg.fy,\n        cx=cfg.cx,\n        cy=cfg.cy)\n\n    # init obj_dict\n    obj_dict = {}   # only objs\n    vis_dict = {}   # including bg\n\n    # init for training\n    AMP = False\n    if AMP:\n        scaler = torch.cuda.amp.GradScaler()  # amp https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/\n    optimiser = torch.optim.AdamW([torch.autograd.Variable(torch.tensor(0))], lr=cfg.learning_rate, weight_decay=cfg.weight_decay)\n\n    #############################################\n    # init data stream\n    if not cfg.live_mode:\n        # load dataset\n        dataloader = dataset.init_loader(cfg)\n        dataloader_iterator = iter(dataloader)\n        dataset_len = len(dataloader)\n    else:\n        dataset_len = 1000000\n        # # init ros node\n        # torch.multiprocessing.set_start_method('spawn')  # spawn\n        # import ros_nodes\n        # track_to_map_Buffer = torch.multiprocessing.Queue(maxsize=5)\n        # # track_to_vis_T_WC = torch.multiprocessing.Queue(maxsize=1)\n        # kfs_que = torch.multiprocessing.Queue(maxsize=5)  # to store one more buffer\n        # track_p = torch.multiprocessing.Process(target=ros_nodes.Tracking,\n        #                                              args=(\n        #                                              (cfg), (track_to_map_Buffer), (None),\n        #                                              (kfs_que), (True),))\n        # track_p.start()\n\n\n    # init vmap\n    fc_models, pe_models = [], []\n    scene_bg = None\n\n    for frame_id in tqdm(range(dataset_len)):\n        print(\"*********************************************\")\n        # get new frame data\n        with performance_measure(f\"getting next data\"):\n            if not cfg.live_mode:\n                # 
get data from dataloader\n                sample = next(dataloader_iterator)\n            else:\n                pass\n\n        if sample is not None:  # new frame\n            last_frame_time = time.time()\n            with performance_measure(f\"Appending data\"):\n                rgb = sample[\"image\"].to(cfg.data_device)\n                depth = sample[\"depth\"].to(cfg.data_device)\n                twc = sample[\"T\"].to(cfg.data_device)\n                bbox_dict = sample[\"bbox_dict\"]\n                if \"frame_id\" in sample.keys():\n                    live_frame_id = sample[\"frame_id\"]\n                else:\n                    live_frame_id = frame_id\n                if not cfg.live_mode:\n                    inst = sample[\"obj\"].to(cfg.data_device)\n                    obj_ids = torch.unique(inst)\n                else:\n                    inst_data_dict = sample[\"obj\"]\n                    obj_ids = inst_data_dict.keys()\n                # append new frame info to objs in current view\n                for obj_id in obj_ids:\n                    if obj_id == -1:    # unsured area\n                        continue\n                    obj_id = int(obj_id)\n                    # convert inst mask to state\n                    if not cfg.live_mode:\n                        state = torch.zeros_like(inst, dtype=torch.uint8, device=cfg.data_device)\n                        state[inst == obj_id] = 1\n                        state[inst == -1] = 2\n                    else:\n                        inst_mask = inst_data_dict[obj_id].permute(1,0)\n                        label_list = torch.unique(inst_mask).tolist()\n                        state = torch.zeros_like(inst_mask, dtype=torch.uint8, device=cfg.data_device)\n                        state[inst_mask == obj_id] = 1\n                        state[inst_mask == -1] = 2\n                    bbox = bbox_dict[obj_id]\n                    if obj_id in vis_dict.keys():\n                        scene_obj = vis_dict[obj_id]\n                        scene_obj.append_keyframe(rgb, depth, state, bbox, twc, live_frame_id)\n                    else: # init scene_obj\n                        if len(obj_dict.keys()) >= cfg.max_n_models:\n                            print(\"models full!!!! 
current num \", len(obj_dict.keys()))\n                            continue\n                        print(\"init new obj \", obj_id)\n                        if cfg.do_bg and obj_id == 0:   # todo param\n                            scene_bg = sceneObject(cfg, obj_id, rgb, depth, state, bbox, twc, live_frame_id)\n                            # scene_bg.init_obj_center(intrinsic_open3d, depth, state, twc)\n                            optimiser.add_param_group({\"params\": scene_bg.trainer.fc_occ_map.parameters(), \"lr\": cfg.learning_rate, \"weight_decay\": cfg.weight_decay})\n                            optimiser.add_param_group({\"params\": scene_bg.trainer.pe.parameters(), \"lr\": cfg.learning_rate, \"weight_decay\": cfg.weight_decay})\n                            vis_dict.update({obj_id: scene_bg})\n                        else:\n                            scene_obj = sceneObject(cfg, obj_id, rgb, depth, state, bbox, twc, live_frame_id)\n                            # scene_obj.init_obj_center(intrinsic_open3d, depth, state, twc)\n                            obj_dict.update({obj_id: scene_obj})\n                            vis_dict.update({obj_id: scene_obj})\n                            # params = [scene_obj.trainer.fc_occ_map.parameters(), scene_obj.trainer.pe.parameters()]\n                            optimiser.add_param_group({\"params\": scene_obj.trainer.fc_occ_map.parameters(), \"lr\": cfg.learning_rate, \"weight_decay\": cfg.weight_decay})\n                            optimiser.add_param_group({\"params\": scene_obj.trainer.pe.parameters(), \"lr\": cfg.learning_rate, \"weight_decay\": cfg.weight_decay})\n                            if cfg.training_strategy == \"vmap\":\n                                update_vmap_model = True\n                                fc_models.append(obj_dict[obj_id].trainer.fc_occ_map)\n                                pe_models.append(obj_dict[obj_id].trainer.pe)\n\n                        # ###################################\n                        # # measure trainable params in total\n                        # total_params = 0\n                        # obj_k = obj_dict[obj_id]\n                        # for p in obj_k.trainer.fc_occ_map.parameters():\n                        #     if p.requires_grad:\n                        #         total_params += p.numel()\n                        # for p in obj_k.trainer.pe.parameters():\n                        #     if p.requires_grad:\n                        #         total_params += p.numel()\n                        # print(\"total param \", total_params)\n\n        # dynamically add vmap\n        with performance_measure(f\"add vmap\"):\n            if cfg.training_strategy == \"vmap\" and update_vmap_model == True:\n                fc_model, fc_param, fc_buffer = utils.update_vmap(fc_models, optimiser)\n                pe_model, pe_param, pe_buffer = utils.update_vmap(pe_models, optimiser)\n                update_vmap_model = False\n\n\n        ##################################################################\n        # training data preperation, get training data for all objs\n        Batch_N_gt_depth = []\n        Batch_N_gt_rgb = []\n        Batch_N_depth_mask = []\n        Batch_N_obj_mask = []\n        Batch_N_input_pcs = []\n        Batch_N_sampled_z = []\n\n        with performance_measure(f\"Sampling over {len(obj_dict.keys())} objects,\"):\n            if cfg.do_bg and scene_bg is not None:\n                gt_rgb, gt_depth, valid_depth_mask, obj_mask, input_pcs, sampled_z \\\n                
    = scene_bg.get_training_samples(cfg.n_iter_per_frame * cfg.win_size_bg, cfg.n_samples_per_frame_bg,\n                                                    cam_info.rays_dir_cache)\n                bg_gt_depth = gt_depth.reshape([gt_depth.shape[0] * gt_depth.shape[1]])\n                bg_gt_rgb = gt_rgb.reshape([gt_rgb.shape[0] * gt_rgb.shape[1], gt_rgb.shape[2]])\n                bg_valid_depth_mask = valid_depth_mask\n                bg_obj_mask = obj_mask\n                bg_input_pcs = input_pcs.reshape(\n                    [input_pcs.shape[0] * input_pcs.shape[1], input_pcs.shape[2], input_pcs.shape[3]])\n                bg_sampled_z = sampled_z.reshape([sampled_z.shape[0] * sampled_z.shape[1], sampled_z.shape[2]])\n\n            for obj_id, obj_k in obj_dict.items():\n                gt_rgb, gt_depth, valid_depth_mask, obj_mask, input_pcs, sampled_z \\\n                    = obj_k.get_training_samples(cfg.n_iter_per_frame * cfg.win_size, cfg.n_samples_per_frame,\n                                                 cam_info.rays_dir_cache)\n                # merge first two dims, sample_per_frame*num_per_frame\n                Batch_N_gt_depth.append(gt_depth.reshape([gt_depth.shape[0] * gt_depth.shape[1]]))\n                Batch_N_gt_rgb.append(gt_rgb.reshape([gt_rgb.shape[0] * gt_rgb.shape[1], gt_rgb.shape[2]]))\n                Batch_N_depth_mask.append(valid_depth_mask)\n                Batch_N_obj_mask.append(obj_mask)\n                Batch_N_input_pcs.append(input_pcs.reshape([input_pcs.shape[0] * input_pcs.shape[1], input_pcs.shape[2], input_pcs.shape[3]]))\n                Batch_N_sampled_z.append(sampled_z.reshape([sampled_z.shape[0] * sampled_z.shape[1], sampled_z.shape[2]]))\n\n                # # vis sampled points in open3D\n                # # sampled pcs\n                # pc = open3d.geometry.PointCloud()\n                # pc.points = open3d.utility.Vector3dVector(input_pcs.cpu().numpy().reshape(-1,3))\n                # open3d.visualization.draw_geometries([pc])\n                # rgb_np = rgb.cpu().numpy().astype(np.uint8).transpose(1,0,2)\n                # # print(\"rgb \", rgb_np.shape)\n                # # print(rgb_np)\n                # # cv2.imshow(\"rgb\", rgb_np)\n                # # cv2.waitKey(1)\n                # depth_np = depth.cpu().numpy().astype(np.float32).transpose(1,0)\n                # twc_np = twc.cpu().numpy()\n                # rgbd = open3d.geometry.RGBDImage.create_from_color_and_depth(\n                #     open3d.geometry.Image(rgb_np),\n                #     open3d.geometry.Image(depth_np),\n                #     depth_trunc=max_depth,\n                #     depth_scale=1,\n                #     convert_rgb_to_intensity=False,\n                # )\n                # T_CW = np.linalg.inv(twc_np)\n                # # input image pc\n                # input_pc = open3d.geometry.PointCloud.create_from_rgbd_image(\n                #     image=rgbd,\n                #     intrinsic=intrinsic_open3d,\n                #     extrinsic=T_CW)\n                # input_pc.points = open3d.utility.Vector3dVector(np.array(input_pc.points) - obj_k.obj_center.cpu().numpy())\n                # open3d.visualization.draw_geometries([pc, input_pc])\n\n\n        ####################################################\n        # training\n        assert len(Batch_N_input_pcs) > 0\n        # move data to GPU  (n_obj, n_iter_per_frame, win_size*num_per_frame, 3)\n        with performance_measure(f\"stacking and moving to gpu: \"):\n\n            
Batch_N_input_pcs = torch.stack(Batch_N_input_pcs).to(cfg.training_device)\n            Batch_N_gt_depth = torch.stack(Batch_N_gt_depth).to(cfg.training_device)\n            Batch_N_gt_rgb = torch.stack(Batch_N_gt_rgb).to(cfg.training_device) / 255. # todo\n            Batch_N_depth_mask = torch.stack(Batch_N_depth_mask).to(cfg.training_device)\n            Batch_N_obj_mask = torch.stack(Batch_N_obj_mask).to(cfg.training_device)\n            Batch_N_sampled_z = torch.stack(Batch_N_sampled_z).to(cfg.training_device)\n            if cfg.do_bg:\n                bg_input_pcs = bg_input_pcs.to(cfg.training_device)\n                bg_gt_depth = bg_gt_depth.to(cfg.training_device)\n                bg_gt_rgb = bg_gt_rgb.to(cfg.training_device) / 255.\n                bg_valid_depth_mask = bg_valid_depth_mask.to(cfg.training_device)\n                bg_obj_mask = bg_obj_mask.to(cfg.training_device)\n                bg_sampled_z = bg_sampled_z.to(cfg.training_device)\n\n        with performance_measure(f\"Training over {len(obj_dict.keys())} objects,\"):\n            for iter_step in range(cfg.n_iter_per_frame):\n                data_idx = slice(iter_step*n_sample_per_step, (iter_step+1)*n_sample_per_step)\n                batch_input_pcs = Batch_N_input_pcs[:, data_idx, ...]\n                batch_gt_depth = Batch_N_gt_depth[:, data_idx, ...]\n                batch_gt_rgb = Batch_N_gt_rgb[:, data_idx, ...]\n                batch_depth_mask = Batch_N_depth_mask[:, data_idx, ...]\n                batch_obj_mask = Batch_N_obj_mask[:, data_idx, ...]\n                batch_sampled_z = Batch_N_sampled_z[:, data_idx, ...]\n                if cfg.training_strategy == \"forloop\":\n                    # for loop training\n                    batch_alpha = []\n                    batch_color = []\n                    for k, obj_id in enumerate(obj_dict.keys()):\n                        obj_k = obj_dict[obj_id]\n                        embedding_k = obj_k.trainer.pe(batch_input_pcs[k])\n                        alpha_k, color_k = obj_k.trainer.fc_occ_map(embedding_k)\n                        batch_alpha.append(alpha_k)\n                        batch_color.append(color_k)\n\n                    batch_alpha = torch.stack(batch_alpha)\n                    batch_color = torch.stack(batch_color)\n                elif cfg.training_strategy == \"vmap\":\n                    # batched training\n                    batch_embedding = vmap(pe_model)(pe_param, pe_buffer, batch_input_pcs)\n                    batch_alpha, batch_color = vmap(fc_model)(fc_param, fc_buffer, batch_embedding)\n                    # print(\"batch alpha \", batch_alpha.shape)\n                else:\n                    print(\"training strategy {} is not implemented \".format(cfg.training_strategy))\n                    exit(-1)\n\n\n            # step loss\n            # with performance_measure(f\"Batch LOSS\"):\n                batch_loss, _ = loss.step_batch_loss(batch_alpha, batch_color,\n                                     batch_gt_depth.detach(), batch_gt_rgb.detach(),\n                                     batch_obj_mask.detach(), batch_depth_mask.detach(),\n                                     batch_sampled_z.detach())\n\n                if cfg.do_bg:\n                    bg_data_idx = slice(iter_step * n_sample_per_step_bg, (iter_step + 1) * n_sample_per_step_bg)\n                    bg_embedding = scene_bg.trainer.pe(bg_input_pcs[bg_data_idx, ...])\n                    bg_alpha, bg_color = scene_bg.trainer.fc_occ_map(bg_embedding)\n   
                 bg_loss, _ = loss.step_batch_loss(bg_alpha[None, ...], bg_color[None, ...],\n                                                     bg_gt_depth[None, bg_data_idx, ...].detach(), bg_gt_rgb[None, bg_data_idx].detach(),\n                                                     bg_obj_mask[None, bg_data_idx, ...].detach(), bg_valid_depth_mask[None, bg_data_idx, ...].detach(),\n                                                     bg_sampled_z[None, bg_data_idx, ...].detach())\n                    batch_loss += bg_loss\n\n            # with performance_measure(f\"Backward\"):\n                if AMP:\n                    scaler.scale(batch_loss).backward()\n                    scaler.step(optimiser)\n                    scaler.update()\n                else:\n                    batch_loss.backward()\n                    optimiser.step()\n                optimiser.zero_grad(set_to_none=True)\n                # print(\"loss \", batch_loss.item())\n\n        # update each origin model params\n        # todo find a better way    # https://github.com/pytorch/functorch/issues/280\n        with performance_measure(f\"updating vmap param\"):\n            if cfg.training_strategy == \"vmap\":\n                with torch.no_grad():\n                    for model_id, (obj_id, obj_k) in enumerate(obj_dict.items()):\n                        for i, param in enumerate(obj_k.trainer.fc_occ_map.parameters()):\n                            param.copy_(fc_param[i][model_id])\n                        for i, param in enumerate(obj_k.trainer.pe.parameters()):\n                            param.copy_(pe_param[i][model_id])\n\n\n        ####################################################################\n        # live vis mesh\n        if (((frame_id % cfg.n_vis_iter) == 0 or frame_id == dataset_len-1) or\n            (cfg.live_mode and time.time()-last_frame_time>cfg.keep_live_time)) and frame_id >= 10:\n            vis3d.clear_geometries()\n            for obj_id, obj_k in vis_dict.items():\n                bound = obj_k.get_bound(intrinsic_open3d)\n                if bound is None:\n                    print(\"get bound failed obj \", obj_id)\n                    continue\n                adaptive_grid_dim = int(np.minimum(np.max(bound.extent)//cfg.live_voxel_size+1, cfg.grid_dim))\n                mesh = obj_k.trainer.meshing(bound, obj_k.obj_center, grid_dim=adaptive_grid_dim)\n                if mesh is None:\n                    print(\"meshing failed obj \", obj_id)\n                    continue\n\n                # save to dir\n                obj_mesh_output = os.path.join(log_dir, \"scene_mesh\")\n                os.makedirs(obj_mesh_output, exist_ok=True)\n                mesh.export(os.path.join(obj_mesh_output, \"frame_{}_obj{}.obj\".format(frame_id, str(obj_id))))\n\n                # live vis\n                open3d_mesh = vis.trimesh_to_open3d(mesh)\n                vis3d.add_geometry(open3d_mesh)\n                vis3d.add_geometry(bound)\n                # update vis3d\n                vis3d.poll_events()\n                vis3d.update_renderer()\n\n        if False:    # follow cam\n            cam = view_ctl.convert_to_pinhole_camera_parameters()\n            T_CW_np = np.linalg.inv(twc.cpu().numpy())\n            cam.extrinsic = T_CW_np\n            view_ctl.convert_from_pinhole_camera_parameters(cam)\n            vis3d.poll_events()\n            vis3d.update_renderer()\n\n        with performance_measure(\"saving ckpt\"):\n            if save_ckpt and ((((frame_id % cfg.n_vis_iter) == 
0 or frame_id == dataset_len - 1) or\n                               (cfg.live_mode and time.time() - last_frame_time > cfg.keep_live_time)) and frame_id >= 10):\n                for obj_id, obj_k in vis_dict.items():\n                    ckpt_dir = os.path.join(log_dir, \"ckpt\", str(obj_id))\n                    os.makedirs(ckpt_dir, exist_ok=True)\n                    bound = obj_k.get_bound(intrinsic_open3d)   # update bound\n                    obj_k.save_checkpoints(ckpt_dir, frame_id)\n                # save current cam pose\n                cam_dir = os.path.join(log_dir, \"cam_pose\")\n                os.makedirs(cam_dir, exist_ok=True)\n                torch.save({\"twc\": twc,}, os.path.join(cam_dir, \"twc_frame_{}\".format(frame_id) + \".pth\"))\n\n\n"
  },
  {
    "path": "trainer.py",
    "content": "import torch\nimport model\nimport embedding\nimport render_rays\nimport numpy as np\nimport vis\nfrom tqdm import tqdm\n\nclass Trainer:\n    def __init__(self, cfg):\n        self.obj_id = cfg.obj_id\n        self.device = cfg.training_device\n        self.hidden_feature_size = cfg.hidden_feature_size #32 for obj  # 256 for iMAP, 128 for seperate bg\n        self.obj_scale = cfg.obj_scale # 10 for bg and iMAP\n        self.n_unidir_funcs = cfg.n_unidir_funcs\n        self.emb_size1 = 21*(3+1)+3\n        self.emb_size2 = 21*(5+1)+3 - self.emb_size1\n\n        self.load_network()\n\n        if self.obj_id == 0:\n            self.bound_extent = 0.995\n        else:\n            self.bound_extent = 0.9\n\n    def load_network(self):\n        self.fc_occ_map = model.OccupancyMap(\n            self.emb_size1,\n            self.emb_size2,\n            hidden_size=self.hidden_feature_size\n        )\n        self.fc_occ_map.apply(model.init_weights).to(self.device)\n        self.pe = embedding.UniDirsEmbed(max_deg=self.n_unidir_funcs, scale=self.obj_scale).to(self.device)\n\n    def meshing(self, bound, obj_center, grid_dim=256):\n        occ_range = [-1., 1.]\n        range_dist = occ_range[1] - occ_range[0]\n        scene_scale_np = bound.extent / (range_dist * self.bound_extent)\n        scene_scale = torch.from_numpy(scene_scale_np).float().to(self.device)\n        transform_np = np.eye(4, dtype=np.float32)\n        transform_np[:3, 3] = bound.center\n        transform_np[:3, :3] = bound.R\n        # transform_np = np.linalg.inv(transform_np)  #\n        transform = torch.from_numpy(transform_np).to(self.device)\n        grid_pc = render_rays.make_3D_grid(occ_range=occ_range, dim=grid_dim, device=self.device,\n                                           scale=scene_scale, transform=transform).view(-1, 3)\n        grid_pc -= obj_center.to(grid_pc.device)\n        ret = self.eval_points(grid_pc)\n        if ret is None:\n            return None\n\n        occ, _ = ret\n        mesh = vis.marching_cubes(occ.view(grid_dim, grid_dim, grid_dim).cpu().numpy())\n        if mesh is None:\n            print(\"marching cube failed\")\n            return None\n\n        # Transform to [-1, 1] range\n        mesh.apply_translation([-0.5, -0.5, -0.5])\n        mesh.apply_scale(2)\n\n        # Transform to scene coordinates\n        mesh.apply_scale(scene_scale_np)\n        mesh.apply_transform(transform_np)\n\n        vertices_pts = torch.from_numpy(np.array(mesh.vertices)).float().to(self.device)\n        ret = self.eval_points(vertices_pts)\n        if ret is None:\n            return None\n        _, color = ret\n        mesh_color = color * 255\n        vertex_colors = mesh_color.detach().squeeze(0).cpu().numpy().astype(np.uint8)\n        mesh.visual.vertex_colors = vertex_colors\n\n        return mesh\n\n    def eval_points(self, points, chunk_size=100000):\n        # 256^3 = 16777216\n        alpha, color = [], []\n        n_chunks = int(np.ceil(points.shape[0] / chunk_size))\n        with torch.no_grad():\n            for k in tqdm(range(n_chunks)): # 2s/it 1000000 pts\n                chunk_idx = slice(k * chunk_size, (k + 1) * chunk_size)\n                embedding_k = self.pe(points[chunk_idx, ...])\n                alpha_k, color_k = self.fc_occ_map(embedding_k)\n                alpha.extend(alpha_k.detach().squeeze())\n                color.extend(color_k.detach().squeeze())\n        alpha = torch.stack(alpha)\n        color = torch.stack(color)\n\n        occ = 
render_rays.occupancy_activation(alpha).detach()\n        if occ.max() == 0:\n            print(\"no occ\")\n            return None\n        return (occ, color)\n"
  },
  {
    "path": "utils.py",
    "content": "import cv2\nimport imgviz\nimport numpy as np\nimport torch\nfrom functorch import combine_state_for_ensemble\nimport open3d\nimport queue\nimport copy\nimport torch.utils.dlpack\n\nclass BoundingBox():\n    def __init__(self):\n        super(BoundingBox, self).__init__()\n        self.extent = None\n        self.R = None\n        self.center = None\n        self.points3d = None    # (8,3)\n\ndef bbox_open3d2bbox(bbox_o3d):\n    bbox = BoundingBox()\n    bbox.extent = bbox_o3d.extent\n    bbox.R = bbox_o3d.R\n    bbox.center = bbox_o3d.center\n    return bbox\n\ndef bbox_bbox2open3d(bbox):\n    bbox_o3d = open3d.geometry.OrientedBoundingBox(bbox.center, bbox.R, bbox.extent)\n    return bbox_o3d\n\ndef update_vmap(models, optimiser):\n    fmodel, params, buffers = combine_state_for_ensemble(models)\n    [p.requires_grad_() for p in params]\n    optimiser.add_param_group({\"params\": params})  # imap b l\n    return (fmodel, params, buffers)\n\ndef enlarge_bbox(bbox, scale, w, h):\n    assert scale >= 0\n    # print(bbox)\n    min_x, min_y, max_x, max_y = bbox\n    margin_x = int(0.5 * scale * (max_x - min_x))\n    margin_y = int(0.5 * scale * (max_y - min_y))\n    if margin_y == 0 or margin_x == 0:\n        return None\n    # assert margin_x != 0\n    # assert margin_y != 0\n    min_x -= margin_x\n    max_x += margin_x\n    min_y -= margin_y\n    max_y += margin_y\n\n    min_x = np.clip(min_x, 0, w-1)\n    min_y = np.clip(min_y, 0, h-1)\n    max_x = np.clip(max_x, 0, w-1)\n    max_y = np.clip(max_y, 0, h-1)\n\n    bbox_enlarged = [int(min_x), int(min_y), int(max_x), int(max_y)]\n    return bbox_enlarged\n\ndef get_bbox2d(obj_mask, bbox_scale=1.0):\n    contours, hierarchy = cv2.findContours(obj_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[\n                          -2:]\n    # # Find the index of the largest contour\n    # areas = [cv2.contourArea(c) for c in contours]\n    # max_index = np.argmax(areas)\n    # cnt = contours[max_index]\n    # Concatenate all contours\n    if len(contours) == 0:\n        return None\n    cnt = np.concatenate(contours)\n    x, y, w, h = cv2.boundingRect(cnt)  # todo if multiple contours, choose the outmost one?\n    # x, y, w, h = cv2.boundingRect(contours)\n    bbox_enlarged = enlarge_bbox([x, y, x + w, y + h], scale=bbox_scale, w=obj_mask.shape[1], h=obj_mask.shape[0])\n    return bbox_enlarged\n\ndef get_bbox2d_batch(img):\n    b,h,w = img.shape[:3]\n    rows = torch.any(img, axis=2)\n    cols = torch.any(img, axis=1)\n    rmins = torch.argmax(rows.float(), dim=1)\n    rmaxs = h - torch.argmax(rows.float().flip(dims=[1]), dim=1)\n    cmins = torch.argmax(cols.float(), dim=1)\n    cmaxs = w - torch.argmax(cols.float().flip(dims=[1]), dim=1)\n\n    return rmins, rmaxs, cmins, cmaxs\n\ndef get_latest_queue(q):\n    message = None\n    while(True):\n        try:\n            message_latest = q.get(block=False)\n            if message is not None:\n                del message\n            message = message_latest\n\n        except queue.Empty:\n            break\n\n    return message\n\n# for association/tracking\nclass InstData:\n    def __init__(self):\n        super(InstData, self).__init__()\n        self.bbox3D = None\n        self.inst_id = None     # instance\n        self.class_id = None    # semantic\n        self.pc_sample = None\n        self.merge_cnt = 0  # merge times counting\n        self.cmp_cnt = 0\n\n\ndef box_filter(masks, classes, depth, inst_dict, intrinsic_open3d, T_CW, min_pixels=500, 
voxel_size=0.01):\n    bbox3d_scale = 1.0  # 1.05\n    inst_data = np.zeros_like(depth, dtype=np.int)\n    for i in range(len(masks)):\n        diff_mask = None\n        inst_mask = masks[i]\n        inst_id = classes[i]\n        if inst_id == 0:\n            continue\n        inst_depth = np.copy(depth)\n        inst_depth[~inst_mask] = 0.  # inst_mask\n        # proj_time = time.time()\n        inst_pc = unproject_pointcloud(inst_depth, intrinsic_open3d, T_CW)\n        # print(\"proj time \", time.time()-proj_time)\n        if len(inst_pc.points) <= 10:  # too small\n            inst_data[inst_mask] = 0  # set to background\n            continue\n        if inst_id in inst_dict.keys():\n            candidate_inst = inst_dict[inst_id]\n            # iou_time = time.time()\n            IoU, indices = check_inside_ratio(inst_pc, candidate_inst.bbox3D)\n            # print(\"iou time \", time.time()-iou_time)\n            # if indices empty\n            candidate_inst.cmp_cnt += 1\n            if len(indices) >= 1:\n                candidate_inst.pc += inst_pc.select_by_index(indices)  # only merge pcs inside scale*bbox\n                # todo check indices follow valid depth\n                valid_depth_mask = np.zeros_like(inst_depth, dtype=np.bool)\n                valid_pc_mask = valid_depth_mask[inst_depth!=0]\n                valid_pc_mask[indices] = True\n                valid_depth_mask[inst_depth != 0] = valid_pc_mask\n                valid_mask = valid_depth_mask\n                diff_mask = np.zeros_like(inst_mask)\n                # uv_opencv, _ = cv2.projectPoints(np.array(inst_pc.select_by_index(indices).points), T_CW[:3, :3],\n                #                                  T_CW[:3, 3], intrinsic_open3d.intrinsic_matrix[:3, :3], None)\n                # uv = np.round(uv_opencv).squeeze().astype(int)\n                # u = uv[:, 0].reshape(-1, 1)\n                # v = uv[:, 1].reshape(-1, 1)\n                # vu = np.concatenate([v, u], axis=-1)\n                # valid_mask = np.zeros_like(inst_mask)\n                # valid_mask[tuple(vu.T)] = True\n                # # cv2.imshow(\"valid\", (inst_depth!=0).astype(np.uint8)*255)\n                # # cv2.waitKey(1)\n                diff_mask[(inst_depth != 0) & (~valid_mask)] = True\n                # cv2.imshow(\"diff_mask\", diff_mask.astype(np.uint8) * 255)\n                # cv2.waitKey(1)\n            else:   # merge all for scannet\n                # print(\"too few pcs obj \", inst_id)\n                inst_data[inst_mask] = -1\n                continue\n            # downsample_time = time.time()\n            # adapt_voxel_size = np.maximum(np.max(candidate_inst.bbox3D.extent)/100, 0.1)\n            candidate_inst.pc = candidate_inst.pc.voxel_down_sample(voxel_size) # adapt_voxel_size\n            # candidate_inst.pc = candidate_inst.pc.farthest_point_down_sample(500)\n            # candidate_inst.pc = candidate_inst.pc.random_down_sample(np.minimum(len(candidate_inst.pc.points)/500.,1))\n            # print(\"downsample time \", time.time() - downsample_time)  # 0.03s even\n            # bbox_time = time.time()\n            try:\n                candidate_inst.bbox3D = open3d.geometry.OrientedBoundingBox.create_from_points(candidate_inst.pc.points)\n            except RuntimeError:\n                # print(\"too few pcs obj \", inst_id)\n                inst_data[inst_mask] = -1\n                continue\n            # enlarge\n            candidate_inst.bbox3D.scale(bbox3d_scale, 
candidate_inst.bbox3D.get_center())\n        else:   # new inst\n            # init new inst and new sem\n            new_inst = InstData()\n            new_inst.inst_id = inst_id\n            smaller_mask = cv2.erode(inst_mask.astype(np.uint8), np.ones((5, 5)), iterations=3).astype(bool)\n            if np.sum(smaller_mask) < min_pixels:\n                # print(\"too few pcs obj \", inst_id)\n                inst_data[inst_mask] = 0\n                continue\n            inst_depth_small = depth.copy()\n            inst_depth_small[~smaller_mask] = 0\n            inst_pc_small = unproject_pointcloud(inst_depth_small, intrinsic_open3d, T_CW)\n            new_inst.pc = inst_pc_small\n            new_inst.pc = new_inst.pc.voxel_down_sample(voxel_size)\n            try:\n                inst_bbox3D = open3d.geometry.OrientedBoundingBox.create_from_points(new_inst.pc.points)\n            except RuntimeError:\n                # print(\"too few pcs obj \", inst_id)\n                inst_data[inst_mask] = 0\n                continue\n            # scale up\n            inst_bbox3D.scale(bbox3d_scale, inst_bbox3D.get_center())\n            new_inst.bbox3D = inst_bbox3D\n            # update inst_dict\n            inst_dict.update({inst_id: new_inst})  # init new sem\n\n        # update inst_data\n        inst_data[inst_mask] = inst_id\n        if diff_mask is not None:\n            inst_data[diff_mask] = -1  # unsure area\n\n    return inst_data\n\ndef load_matrix_from_txt(path, shape=(4, 4)):\n    with open(path) as f:\n        txt = f.readlines()\n    txt = ''.join(txt).replace('\\n', ' ')\n    matrix = [float(v) for v in txt.split()]\n    return np.array(matrix).reshape(shape)\n\ndef check_mask_order(obj_masks, depth_np, obj_ids):\n    print(len(obj_masks))\n    print(len(obj_ids))\n\n    assert len(obj_masks) == len(obj_ids)\n    depth = torch.from_numpy(depth_np)\n    obj_masked_modified = copy.deepcopy(obj_masks[:])\n    for i in range(len(obj_masks) - 1):\n\n        mask1 = obj_masks[i].float()\n        mask1_ = obj_masked_modified[i].float()\n        for j in range(i + 1, len(obj_masks)):\n            mask2 = obj_masks[j].float()\n            mask2_ = obj_masked_modified[j].float()\n            # case 1: if they don't intersect we don't touch them\n            if ((mask1 + mask2) == 2).sum() == 0:\n                continue\n            # case 2: the entire object 1 is inside of object 2, we say object 1 is in front of object 2:\n            elif (((mask1 + mask2) == 2).float() - mask1).sum() == 0:\n                mask2_ -= mask1_\n            # case 3: the entire object 2 is inside of object 1, we say object 2 is in front of object 1:\n            elif (((mask1 + mask2) == 2).float() - mask2).sum() == 0:\n                mask1_ -= mask2_\n            # case 4: use depth to check object order:\n            else:\n                # object 1 is closer\n                if (depth * mask1).sum() / mask1.sum() > (depth * mask2).sum() / mask2.sum():\n                    mask2_ -= ((mask1 + mask2) == 2).float()\n                # object 2 is closer\n                if (depth * mask1).sum() / mask1.sum() < (depth * mask2).sum() / mask2.sum():\n                    mask1_ -= ((mask1 + mask2) == 2).float()\n\n    final_mask = torch.zeros_like(depth, dtype=torch.int)\n    # instance_labels = {}\n    for i in range(len(obj_masked_modified)):\n        final_mask = final_mask.masked_fill(obj_masked_modified[i] > 0, obj_ids[i])\n        # instance_labels[i] = obj_ids[i].item()\n    return 
final_mask.cpu().numpy()\n\n\ndef unproject_pointcloud(depth, intrinsic_open3d, T_CW):\n    # depth, mask, intrinsic, extrinsic -> point clouds\n    pc_sample = open3d.geometry.PointCloud.create_from_depth_image(depth=open3d.geometry.Image(depth),\n                                                                   intrinsic=intrinsic_open3d,\n                                                                   extrinsic=T_CW,\n                                                                   depth_scale=1.0,\n                                                                   project_valid_depth_only=True)\n    return pc_sample\n\ndef check_inside_ratio(pc, bbox3D):\n    #  pc, bbox3d -> inside ratio\n    indices = bbox3D.get_point_indices_within_bounding_box(pc.points)\n    assert len(pc.points) > 0\n    ratio = len(indices) / len(pc.points)\n    # print(\"ratio \", ratio)\n    return ratio, indices\n\ndef track_instance(masks, classes, depth, inst_list, sem_dict, intrinsic_open3d, T_CW, IoU_thresh=0.5, voxel_size=0.1,\n                   min_pixels=2000, erode=True, clip_features=None, class_names=None):\n    device = masks.device\n    inst_data_dict = {}\n    inst_data_dict.update({0: torch.zeros(depth.shape, dtype=torch.int, device=device)})\n    inst_ids = []\n    bbox3d_scale = 1.0  # todo 1.0\n    min_extent = 0.05\n    depth = torch.from_numpy(depth).to(device)\n    for i in range(len(masks)):\n        inst_data = torch.zeros(depth.shape, dtype=torch.int, device=device)\n        smaller_mask = cv2.erode(masks[i].detach().cpu().numpy().astype(np.uint8), np.ones((5, 5)), iterations=3).astype(bool)\n        inst_depth_small = depth.detach().cpu().numpy()\n        inst_depth_small[~smaller_mask] = 0\n        inst_pc_small = unproject_pointcloud(inst_depth_small, intrinsic_open3d, T_CW)\n        diff_mask = None\n        if np.sum(smaller_mask) <= min_pixels:  # too small    20  400 # todo use sem to set background\n            inst_data[masks[i]] = 0  # set to background\n            continue\n        inst_pc_voxel = inst_pc_small.voxel_down_sample(voxel_size)\n        if len(inst_pc_voxel.points) <= 10:  # too small    20  400 # todo use sem to set background\n            inst_data[masks[i]] = 0  # set to background\n            continue\n        is_merged = False\n        inst_id = None\n        inst_mask = masks[i] #smaller_mask #masks[i]    # todo only\n        inst_class = classes[i]\n        inst_depth = depth.detach().cpu().numpy()\n        inst_depth[~masks[i].detach().cpu().numpy()] = 0.  # inst_mask\n        inst_pc = unproject_pointcloud(inst_depth, intrinsic_open3d, T_CW)\n        sem_inst_list = []\n        if clip_features is not None: # check similar sems based on clip feature distance\n            sem_thr = 200 #300. for table #320.  
# 260.\n            for sem_exist in sem_dict.keys():\n                if torch.abs(clip_features[class_names[inst_class]] - clip_features[class_names[sem_exist]]).sum() < sem_thr:\n                    sem_inst_list.extend(sem_dict[sem_exist])\n        else:   # no clip features, only do strictly sem check\n            if inst_class in sem_dict.keys():\n                sem_inst_list.extend(sem_dict[inst_class])\n\n        for candidate_inst in sem_inst_list:\n    # if True:  # only consider 3D bbox, merge them if they are spatial together\n            IoU, indices = check_inside_ratio(inst_pc, candidate_inst.bbox3D)\n            candidate_inst.cmp_cnt += 1\n            if IoU > IoU_thresh:\n                # merge inst to candidate\n                is_merged = True\n                candidate_inst.merge_cnt += 1\n                candidate_inst.pc += inst_pc.select_by_index(indices)\n                # inst_uv = inst_pc.select_by_index(indices).project_to_depth_image(masks[i].shape[1], masks[i].shape[0], intrinsic_open3d, T_CW, depth_scale=1.0, depth_max=10.0)\n                # # inst_uv = torch.utils.dlpack.from_dlpack(uv_opencv.as_tensor().to_dlpack())\n                # valid_mask = inst_uv.squeeze() > 0.  # shape --> H, W\n                # diff_mask = (inst_depth > 0.) & (~valid_mask)\n                diff_mask = torch.zeros_like(inst_mask)\n                uv_opencv, _ = cv2.projectPoints(np.array(inst_pc.select_by_index(indices).points), T_CW[:3, :3],\n                                                 T_CW[:3, 3], intrinsic_open3d.intrinsic_matrix[:3,:3], None)\n                uv = np.round(uv_opencv).squeeze().astype(int)\n                u = uv[:, 0].reshape(-1, 1)\n                v = uv[:, 1].reshape(-1, 1)\n                vu = np.concatenate([v, u], axis=-1)\n                valid_mask = np.zeros(inst_mask.shape, dtype=np.bool)\n                valid_mask[tuple(vu.T)] = True\n                diff_mask[(inst_depth!=0) & (~valid_mask)] = True\n                # downsample pcs\n                candidate_inst.pc = candidate_inst.pc.voxel_down_sample(voxel_size)\n                # candidate_inst.pc.random_down_sample(np.minimum(500//len(candidate_inst.pc.points),1))\n                candidate_inst.bbox3D = open3d.geometry.OrientedBoundingBox.create_from_points(candidate_inst.pc.points)\n                # enlarge\n                candidate_inst.bbox3D.scale(bbox3d_scale, candidate_inst.bbox3D.get_center())\n                candidate_inst.bbox3D.extent = np.maximum(candidate_inst.bbox3D.extent, min_extent) # at least bigger than min_extent\n                inst_id = candidate_inst.inst_id\n                break\n            # if candidate_inst.cmp_cnt >= 20 and candidate_inst.merge_cnt <= 5:\n            #     sem_inst_list.remove(candidate_inst)\n\n        if not is_merged:\n            # init new inst and new sem\n            new_inst = InstData()\n            new_inst.inst_id = len(inst_list) + 1\n            new_inst.class_id = inst_class\n\n            new_inst.pc = inst_pc_small\n            new_inst.pc = new_inst.pc.voxel_down_sample(voxel_size)\n            inst_bbox3D = open3d.geometry.OrientedBoundingBox.create_from_points(new_inst.pc.points)\n            # scale up\n            inst_bbox3D.scale(bbox3d_scale, inst_bbox3D.get_center())\n            inst_bbox3D.extent = np.maximum(inst_bbox3D.extent, min_extent)\n            new_inst.bbox3D = inst_bbox3D\n            inst_list.append(new_inst)\n            inst_id = new_inst.inst_id\n            # update sem_dict\n            
if inst_class in sem_dict.keys():\n                sem_dict[inst_class].append(new_inst)   # append new inst to exist sem\n            else:\n                sem_dict.update({inst_class: [new_inst]})   # init new sem\n        # update inst_data\n        inst_data[inst_mask] = inst_id\n        if diff_mask is not None:\n            inst_data[diff_mask] = -1   # unsure area\n        if inst_id not in inst_ids:\n            inst_data_dict.update({inst_id: inst_data})\n        else:\n            continue\n            # idx = inst_ids.index(inst_id)\n            # inst_data_list[idx] = inst_data_list[idx] & torch.from_numpy(inst_data) # merge them? todo\n    # return inst_data\n    mask_bg = torch.stack(list(inst_data_dict.values())).sum(0) != 0\n    inst_data_dict.update({0: mask_bg.int()})\n    return inst_data_dict\n"
  },
  {
    "path": "vis.py",
    "content": "import skimage.measure\nimport trimesh\nimport open3d as o3d\nimport numpy as np\n\ndef marching_cubes(occupancy, level=0.5):\n    try:\n        vertices, faces, vertex_normals, _ = skimage.measure.marching_cubes(    #marching_cubes_lewiner(    #marching_cubes(\n            occupancy, level=level, gradient_direction='ascent')\n    except (RuntimeError, ValueError):\n       return None\n\n    dim = occupancy.shape[0]\n    vertices = vertices / (dim - 1)\n    mesh = trimesh.Trimesh(vertices=vertices,\n                           vertex_normals=vertex_normals,\n                           faces=faces)\n\n    return mesh\n\ndef trimesh_to_open3d(src):\n    dst = o3d.geometry.TriangleMesh()\n    dst.vertices = o3d.utility.Vector3dVector(src.vertices)\n    dst.triangles = o3d.utility.Vector3iVector(src.faces)\n    vertex_colors = src.visual.vertex_colors[:, :3].astype(np.float) / 255.0\n    dst.vertex_colors = o3d.utility.Vector3dVector(vertex_colors)\n    dst.compute_vertex_normals()\n\n    return dst"
  },
  {
    "path": "vmap.py",
    "content": "import random\nimport numpy as np\nimport torch\nfrom time import perf_counter_ns\nfrom tqdm import tqdm\nimport trainer\nimport open3d\nimport trimesh\nimport scipy\nfrom bidict import bidict\nimport copy\nimport os\n\nimport utils\n\n\nclass performance_measure:\n\n    def __init__(self, name) -> None:\n        self.name = name\n\n    def __enter__(self):\n        self.start_time = perf_counter_ns()\n\n    def __exit__(self, type, value, tb):\n        self.end_time = perf_counter_ns()\n        self.exec_time = self.end_time - self.start_time\n\n        print(f\"{self.name} excution time: {(self.exec_time)/1000000:.2f} ms\")\n\ndef origin_dirs_W(T_WC, dirs_C):\n\n    assert T_WC.shape[0] == dirs_C.shape[0]\n    assert T_WC.shape[1:] == (4, 4)\n    assert dirs_C.shape[2] == 3\n\n    dirs_W = (T_WC[:, None, :3, :3] @ dirs_C[..., None]).squeeze()\n\n    origins = T_WC[:, :3, -1]\n\n    return origins, dirs_W\n\n\n# @torch.jit.script\ndef stratified_bins(min_depth, max_depth, n_bins, n_rays, type=torch.float32, device = \"cuda:0\"):\n    # type: (Tensor, Tensor, int, int) -> Tensor\n\n    bin_limits_scale = torch.linspace(0, 1, n_bins+1, dtype=type, device=device)\n\n    if not torch.is_tensor(min_depth):\n        min_depth = torch.ones(n_rays, dtype=type, device=device) * min_depth\n    \n    if not torch.is_tensor(max_depth):\n        max_depth = torch.ones(n_rays, dtype=type, device=device) * max_depth\n\n    depth_range = max_depth - min_depth\n  \n    lower_limits_scale = depth_range[..., None] * bin_limits_scale + min_depth[..., None]\n    lower_limits_scale = lower_limits_scale[:, :-1]\n\n    assert lower_limits_scale.shape == (n_rays, n_bins)\n\n    bin_length_scale = depth_range / n_bins\n    increments_scale = torch.rand(\n        n_rays, n_bins, device=device,\n        dtype=torch.float32) * bin_length_scale[..., None]\n\n    z_vals_scale = lower_limits_scale + increments_scale\n\n    assert z_vals_scale.shape == (n_rays, n_bins)\n\n    return z_vals_scale\n\n# @torch.jit.script\ndef normal_bins_sampling(depth, n_bins, n_rays, delta, device = \"cuda:0\"):\n    # type: (Tensor, int, int, float) -> Tensor\n\n    # device = \"cpu\"\n    # bins = torch.normal(0.0, delta / 3., size=[n_rays, n_bins], devi\n        # self.keyframes_batch = torch.empty(self.n_keyframes,ce=device).sort().values\n    bins = torch.empty(n_rays, n_bins, dtype=torch.float32, device=device).normal_(mean=0.,std=delta / 3.).sort().values\n    bins = torch.clip(bins, -delta, delta)\n    z_vals = depth[:, None] + bins\n\n    assert z_vals.shape == (n_rays, n_bins)\n\n    return z_vals\n\n\nclass sceneObject:\n    \"\"\"\n    object instance mapping,\n    updating keyframes, get training samples, optimizing MLP map\n    \"\"\"\n\n    def __init__(self, cfg, obj_id, rgb:torch.tensor, depth:torch.tensor, mask:torch.tensor, bbox_2d:torch.tensor, t_wc:torch.tensor, live_frame_id) -> None:\n        self.do_bg = cfg.do_bg\n        self.obj_id = obj_id\n        self.data_device = cfg.data_device\n        self.training_device = cfg.training_device\n\n        assert rgb.shape[:2] == depth.shape\n        assert rgb.shape[:2] == mask.shape\n        assert bbox_2d.shape == (4,)\n        assert t_wc.shape == (4, 4,)\n\n        if self.do_bg and self.obj_id == 0: # do seperate bg\n            self.obj_scale = cfg.bg_scale\n            self.hidden_feature_size = cfg.hidden_feature_size_bg\n            self.n_bins_cam2surface = cfg.n_bins_cam2surface_bg\n            self.keyframe_step = cfg.keyframe_step_bg\n        
else:\n            self.obj_scale = cfg.obj_scale\n            self.hidden_feature_size = cfg.hidden_feature_size\n            self.n_bins_cam2surface = cfg.n_bins_cam2surface\n            self.keyframe_step = cfg.keyframe_step\n\n        self.frames_width = rgb.shape[0]\n        self.frames_height = rgb.shape[1]\n\n        self.min_bound = cfg.min_depth\n        self.max_bound = cfg.max_depth\n        self.n_bins = cfg.n_bins\n        self.n_unidir_funcs = cfg.n_unidir_funcs\n\n        self.surface_eps = cfg.surface_eps\n        self.stop_eps = cfg.stop_eps\n\n        self.n_keyframes = 1  # Number of keyframes\n        self.kf_pointer = None\n        self.keyframe_buffer_size = cfg.keyframe_buffer_size\n        self.kf_id_dict = bidict({live_frame_id:0})\n        self.kf_buffer_full = False\n        self.frame_cnt = 0  # number of frames taken in\n        self.lastest_kf_queue = []\n\n        self.bbox = torch.empty(  # obj bounding bounding box in the frame\n            self.keyframe_buffer_size,\n            4,\n            device=self.data_device)  # [u low, u high, v low, v high]\n        self.bbox[0] = bbox_2d\n\n        # RGB + pixel state batch\n        self.rgb_idx = slice(0, 3)\n        self.state_idx = slice(3, 4)\n        self.rgbs_batch = torch.empty(self.keyframe_buffer_size,\n                                      self.frames_width,\n                                      self.frames_height,\n                                      4,\n                                      dtype=torch.uint8,\n                                      device=self.data_device)\n\n        # Pixel states:\n        self.other_obj = 0  # pixel doesn't belong to obj\n        self.this_obj = 1  # pixel belong to obj \n        self.unknown_obj = 2  # pixel state is unknown\n\n        # Initialize first frame rgb and pixel state\n        self.rgbs_batch[0, :, :, self.rgb_idx] = rgb\n        self.rgbs_batch[0, :, :, self.state_idx] = mask[..., None]\n\n        self.depth_batch = torch.empty(self.keyframe_buffer_size,\n                                       self.frames_width,\n                                       self.frames_height,\n                                       dtype=torch.float32,\n                                       device=self.data_device)\n\n        # Initialize first frame's depth \n        self.depth_batch[0] = depth\n        self.t_wc_batch = torch.empty(\n            self.keyframe_buffer_size, 4, 4,\n            dtype=torch.float32,\n            device=self.data_device)  # world to camera transform\n\n        # Initialize first frame's world2cam transform\n        self.t_wc_batch[0] = t_wc\n\n        # neural field map\n        trainer_cfg = copy.deepcopy(cfg)\n        trainer_cfg.obj_id = self.obj_id\n        trainer_cfg.hidden_feature_size = self.hidden_feature_size\n        trainer_cfg.obj_scale = self.obj_scale\n        self.trainer = trainer.Trainer(trainer_cfg)\n\n        # 3D boundary\n        self.bbox3d = None\n        self.pc = []\n\n        # init  obj local frame\n        # self.obj_center = self.init_obj_center(intrinsic, depth, mask, t_wc)\n        self.obj_center = torch.tensor(0.0) # shouldn't make any difference because of frequency embedding\n\n\n    def init_obj_center(self, intrinsic_open3d, depth, mask, t_wc):\n        obj_depth = depth.cpu().clone()\n        obj_depth[mask!=self.this_obj] = 0\n        T_CW = np.linalg.inv(t_wc.cpu().numpy())\n        pc_obj_init = open3d.geometry.PointCloud.create_from_depth_image(\n            
depth=open3d.geometry.Image(np.asarray(obj_depth.permute(1,0).numpy(), order=\"C\")),\n            intrinsic=intrinsic_open3d,\n            extrinsic=T_CW,\n            depth_trunc=self.max_bound,\n            depth_scale=1.0)\n        obj_center = torch.from_numpy(np.mean(pc_obj_init.points, axis=0)).float()\n        return obj_center\n\n    # @profile\n    def append_keyframe(self, rgb:torch.tensor, depth:torch.tensor, mask:torch.tensor, bbox_2d:torch.tensor, t_wc:torch.tensor, frame_id:np.uint8=1):\n        assert rgb.shape[:2] == depth.shape\n        assert rgb.shape[:2] == mask.shape\n        assert bbox_2d.shape == (4,)\n        assert t_wc.shape == (4, 4,)\n        assert self.n_keyframes <= self.keyframe_buffer_size - 1\n        assert rgb.dtype == torch.uint8\n        assert mask.dtype == torch.uint8\n        assert depth.dtype == torch.float32\n\n        # every kf_step choose one kf\n        is_kf = (self.frame_cnt % self.keyframe_step == 0) or self.n_keyframes == 1\n        # print(\"---------------------\")\n        # print(\"self.kf_id_dict \", self.kf_id_dict)\n        # print(\"live frame id \", frame_id)\n        # print(\"n_frames \", self.n_keyframes)\n        if self.n_keyframes == self.keyframe_buffer_size - 1:  # kf buffer full, need to prune\n            self.kf_buffer_full = True\n            if self.kf_pointer is None:\n                self.kf_pointer = self.n_keyframes\n\n            self.rgbs_batch[self.kf_pointer, :, :, self.rgb_idx] = rgb\n            self.rgbs_batch[self.kf_pointer, :, :, self.state_idx] = mask[..., None]\n            self.depth_batch[self.kf_pointer, ...] = depth\n            self.t_wc_batch[self.kf_pointer, ...] = t_wc\n            self.bbox[self.kf_pointer, ...] = bbox_2d\n            self.kf_id_dict.inv[self.kf_pointer] = frame_id\n\n            if is_kf:\n                self.lastest_kf_queue.append(self.kf_pointer)\n                pruned_frame_id, pruned_kf_id = self.prune_keyframe()\n                self.kf_pointer = pruned_kf_id\n                print(\"pruned kf id \", self.kf_pointer)\n\n        else:\n            if not is_kf:   # not kf, replace\n                self.rgbs_batch[self.n_keyframes-1, :, :, self.rgb_idx] = rgb\n                self.rgbs_batch[self.n_keyframes-1, :, :, self.state_idx] = mask[..., None]\n                self.depth_batch[self.n_keyframes-1, ...] = depth\n                self.t_wc_batch[self.n_keyframes-1, ...] = t_wc\n                self.bbox[self.n_keyframes-1, ...] = bbox_2d\n                self.kf_id_dict.inv[self.n_keyframes-1] = frame_id\n            else:   # is kf, add new kf\n                self.kf_id_dict[frame_id] = self.n_keyframes\n                self.rgbs_batch[self.n_keyframes, :, :, self.rgb_idx] = rgb\n                self.rgbs_batch[self.n_keyframes, :, :, self.state_idx] = mask[..., None]\n                self.depth_batch[self.n_keyframes, ...] = depth\n                self.t_wc_batch[self.n_keyframes, ...] = t_wc\n                self.bbox[self.n_keyframes, ...] 
= bbox_2d\n                self.lastest_kf_queue.append(self.n_keyframes)\n                self.n_keyframes += 1\n\n        # print(\"self.kf_id_dic \", self.kf_id_dict)\n        self.frame_cnt += 1\n        if len(self.lastest_kf_queue) > 2:  # keep latest two frames\n            self.lastest_kf_queue = self.lastest_kf_queue[-2:]\n\n    def prune_keyframe(self):\n        # simple strategy to prune, randomly choose\n        key, value = random.choice(list(self.kf_id_dict.items())[:-2])  # do not prune latest two frames\n        return key, value\n\n    def get_bound(self, intrinsic_open3d):\n        # get 3D boundary from posed depth img   todo update sparse pc when append frame\n        pcs = open3d.geometry.PointCloud()\n        for kf_id in range(self.n_keyframes):\n            mask = self.rgbs_batch[kf_id, :, :, self.state_idx].squeeze() == self.this_obj\n            depth = self.depth_batch[kf_id].cpu().clone()\n            twc = self.t_wc_batch[kf_id].cpu().numpy()\n            depth[~mask] = 0\n            depth = depth.permute(1,0).numpy().astype(np.float32)\n            T_CW = np.linalg.inv(twc)\n            pc = open3d.geometry.PointCloud.create_from_depth_image(depth=open3d.geometry.Image(np.asarray(depth, order=\"C\")), intrinsic=intrinsic_open3d, extrinsic=T_CW)\n            # self.pc += pc\n            pcs += pc\n\n        # # get minimal oriented 3d bbox\n        # try:\n        #     bbox3d = open3d.geometry.OrientedBoundingBox.create_from_points(pcs.points)\n        # except RuntimeError:\n        #     print(\"too few pcs obj \")\n        #     return None\n        # trimesh has a better minimal bbox implementation than open3d\n        try:\n            transform, extents = trimesh.bounds.oriented_bounds(np.array(pcs.points))  # pc\n            transform = np.linalg.inv(transform)\n        except scipy.spatial._qhull.QhullError:\n            print(\"too few pcs obj \")\n            return None\n\n        for i in range(extents.shape[0]):\n            extents[i] = np.maximum(extents[i], 0.10)  # at least rendering 10cm\n        bbox = utils.BoundingBox()\n        bbox.center = transform[:3, 3]\n        bbox.R = transform[:3, :3]\n        bbox.extent = extents\n        bbox3d = open3d.geometry.OrientedBoundingBox(bbox.center, bbox.R, bbox.extent)\n\n        min_extent = 0.05\n        bbox3d.extent = np.maximum(min_extent, bbox3d.extent)\n        bbox3d.color = (255,0,0)\n        self.bbox3d = utils.bbox_open3d2bbox(bbox_o3d=bbox3d)\n        # self.pc = []\n        print(\"obj \", self.obj_id)\n        print(\"bound \", bbox3d)\n        print(\"kf id dict \", self.kf_id_dict)\n        # open3d.visualization.draw_geometries([bbox3d, pcs])\n        return bbox3d\n\n\n\n    def get_training_samples(self, n_frames, n_samples, cached_rays_dir):\n        # Sample pixels\n        if self.n_keyframes > 2: # make sure latest 2 frames are sampled    todo if kf pruned, this is not the latest frame\n            keyframe_ids = torch.randint(low=0,\n                                         high=self.n_keyframes,\n                                         size=(n_frames - 2,),\n                                         dtype=torch.long,\n                                         device=self.data_device)\n            # if self.kf_buffer_full:\n            # latest_frame_ids = list(self.kf_id_dict.values())[-2:]\n            latest_frame_ids = self.lastest_kf_queue[-2:]\n            keyframe_ids = torch.cat([keyframe_ids,\n                                          torch.tensor(latest_frame_ids, 
device=keyframe_ids.device)])\n            # print(\"latest_frame_ids\", latest_frame_ids)\n            # else:   # sample last 2 frames\n            #     keyframe_ids = torch.cat([keyframe_ids,\n            #                               torch.tensor([self.n_keyframes-2, self.n_keyframes-1], device=keyframe_ids.device)])\n        else:\n            keyframe_ids = torch.randint(low=0,\n                                         high=self.n_keyframes,\n                                         size=(n_frames,),\n                                         dtype=torch.long,\n                                         device=self.data_device)\n        keyframe_ids = torch.unsqueeze(keyframe_ids, dim=-1)\n        idx_w = torch.rand(n_frames, n_samples, device=self.data_device)\n        idx_h = torch.rand(n_frames, n_samples, device=self.data_device)\n\n        # resizing idx_w and idx_h to be in the bbox range\n        idx_w = idx_w * (self.bbox[keyframe_ids, 1] - self.bbox[keyframe_ids, 0]) + self.bbox[keyframe_ids, 0]\n        idx_h = idx_h * (self.bbox[keyframe_ids, 3] - self.bbox[keyframe_ids, 2]) + self.bbox[keyframe_ids, 2]\n\n        idx_w = idx_w.long()\n        idx_h = idx_h.long()\n\n        sampled_rgbs = self.rgbs_batch[keyframe_ids, idx_w, idx_h]\n        sampled_depth = self.depth_batch[keyframe_ids, idx_w, idx_h]\n\n        # Get ray directions for sampled pixels\n        sampled_ray_dirs = cached_rays_dir[idx_w, idx_h]\n\n        # Get sampled keyframe poses\n        sampled_twc = self.t_wc_batch[keyframe_ids[:, 0], :, :]\n\n        origins, dirs_w = origin_dirs_W(sampled_twc, sampled_ray_dirs)\n\n        return self.sample_3d_points(sampled_rgbs, sampled_depth, origins, dirs_w)\n\n    def sample_3d_points(self, sampled_rgbs, sampled_depth, origins, dirs_w):\n        \"\"\"\n        3D sampling strategy\n\n        * For pixels with invalid depth:\n            - N+M from minimum bound to max (stratified)\n        \n        * For pixels with valid depth:\n            # Pixel belongs to this object\n                - N from cam to surface (stratified)\n                - M around surface (stratified/normal)\n            # Pixel belongs that don't belong to this object\n                - N from cam to surface (stratified)\n                - M around surface (stratified)\n            # Pixel with unknown state\n                - Do nothing!\n        \"\"\"\n\n        n_bins_cam2surface = self.n_bins_cam2surface\n        n_bins = self.n_bins\n        eps = self.surface_eps\n        other_objs_max_eps = self.stop_eps #0.05   # todo 0.02\n        # print(\"max depth \", torch.max(sampled_depth))\n        sampled_z = torch.zeros(\n            sampled_rgbs.shape[0] * sampled_rgbs.shape[1],\n            n_bins_cam2surface + n_bins,\n            dtype=self.depth_batch.dtype,\n            device=self.data_device)  # shape (N*n_rays, n_bins_cam2surface + n_bins)\n\n        invalid_depth_mask = (sampled_depth <= self.min_bound).view(-1)\n        # max_bound = self.max_bound\n        max_bound = torch.max(sampled_depth)\n        # sampling for points with invalid depth\n        invalid_depth_count = invalid_depth_mask.count_nonzero()\n        if invalid_depth_count:\n            sampled_z[invalid_depth_mask, :] = stratified_bins(\n                self.min_bound, max_bound,\n                n_bins_cam2surface + n_bins, invalid_depth_count,\n                device=self.data_device)\n\n        # sampling for valid depth rays\n        valid_depth_mask = ~invalid_depth_mask\n        valid_depth_count = 
valid_depth_mask.count_nonzero()\n\n\n        if valid_depth_count:\n            # Sample between min bound and depth for all pixels with valid depth\n            sampled_z[valid_depth_mask, :n_bins_cam2surface] = stratified_bins(\n                self.min_bound, sampled_depth.view(-1)[valid_depth_mask]-eps,\n                n_bins_cam2surface, valid_depth_count, device=self.data_device)\n\n            # sampling around depth for this object\n            obj_mask = (sampled_rgbs[..., -1] == self.this_obj).view(-1) & valid_depth_mask # todo obj_mask\n            assert sampled_z.shape[0] == obj_mask.shape[0]\n            obj_count = obj_mask.count_nonzero()\n\n            if obj_count:\n                sampling_method = \"normal\"  # stratified or normal\n                if sampling_method == \"stratified\":\n                    sampled_z[obj_mask, n_bins_cam2surface:] = stratified_bins(\n                        sampled_depth.view(-1)[obj_mask] - eps, sampled_depth.view(-1)[obj_mask] + eps,\n                        n_bins, obj_count, device=self.data_device)\n\n                elif sampling_method == \"normal\":\n                    sampled_z[obj_mask, n_bins_cam2surface:] = normal_bins_sampling(\n                        sampled_depth.view(-1)[obj_mask],\n                        n_bins,\n                        obj_count,\n                        delta=eps,\n                        device=self.data_device)\n\n                else:\n                    raise NotImplementedError(\n                        f\"sampling method not implemented {sampling_method}, \\\n                            stratified and normal sampling only currently implemented.\"\n                    )\n\n            # sampling around depth of other objects\n            other_obj_mask = (sampled_rgbs[..., -1] != self.this_obj).view(-1) & valid_depth_mask\n            other_objs_count = other_obj_mask.count_nonzero()\n            if other_objs_count:\n                sampled_z[other_obj_mask, n_bins_cam2surface:] = stratified_bins(\n                    sampled_depth.view(-1)[other_obj_mask] - eps,\n                    sampled_depth.view(-1)[other_obj_mask] + other_objs_max_eps,\n                    n_bins, other_objs_count, device=self.data_device)\n\n        sampled_z = sampled_z.view(sampled_rgbs.shape[0],\n                                   sampled_rgbs.shape[1],\n                                   -1)  # view as (n_frames, n_samples, n_bins_cam2surface + n_bins)\n        input_pcs = origins[..., None, None, :] + (dirs_w[:, :, None, :] *\n                                                   sampled_z[..., None])\n        input_pcs -= self.obj_center\n        obj_labels = sampled_rgbs[..., -1].view(-1)\n        return sampled_rgbs[..., :3], sampled_depth, valid_depth_mask, obj_labels, input_pcs, sampled_z\n\n    def save_checkpoints(self, path, epoch):\n        obj_id = self.obj_id\n        checkpoint_file = (path + \"/obj_\" + str(obj_id) + \"_frame_\" + str(epoch) + \".pth\")\n\n        torch.save(\n            {\n                \"epoch\": epoch,\n                \"FC_state_dict\": self.trainer.fc_occ_map.state_dict(),\n                \"PE_state_dict\": self.trainer.pe.state_dict(),\n                \"obj_id\": self.obj_id,\n                \"bbox\": self.bbox3d,\n                \"obj_scale\": self.trainer.obj_scale\n            },\n            checkpoint_file,\n        )\n        # optimiser?\n\n    def load_checkpoints(self, ckpt_file):\n        checkpoint_load_file = (ckpt_file)\n        if not os.path.exists(checkpoint_load_file):\n            
print(\"ckpt not exist \", checkpoint_load_file)\n            return\n        checkpoint = torch.load(checkpoint_load_file)\n        self.trainer.fc_occ_map.load_state_dict(checkpoint[\"FC_state_dict\"])\n        self.trainer.pe.load_state_dict(checkpoint[\"PE_state_dict\"])\n        self.obj_id = checkpoint[\"obj_id\"]\n        self.bbox3d = checkpoint[\"bbox\"]\n        self.trainer.obj_scale = checkpoint[\"obj_scale\"]\n\n        self.trainer.fc_occ_map.to(self.training_device)\n        self.trainer.pe.to(self.training_device)\n\n\nclass cameraInfo:\n\n    def __init__(self, cfg) -> None:\n        self.device = cfg.data_device\n        self.width = cfg.W  # Frame width\n        self.height = cfg.H  # Frame height\n\n        self.fx = cfg.fx\n        self.fy = cfg.fy\n        self.cx = cfg.cx\n        self.cy = cfg.cy\n\n        self.rays_dir_cache = self.get_rays_dirs()\n\n    def get_rays_dirs(self, depth_type=\"z\"):\n        idx_w = torch.arange(end=self.width, device=self.device)\n        idx_h = torch.arange(end=self.height, device=self.device)\n\n        dirs = torch.ones((self.width, self.height, 3), device=self.device)\n\n        dirs[:, :, 0] = ((idx_w - self.cx) / self.fx)[:, None]\n        dirs[:, :, 1] = ((idx_h - self.cy) / self.fy)\n\n        if depth_type == \"euclidean\":\n            raise Exception(\n                \"Get camera rays directions with euclidean depth not yet implemented\"\n            )\n            norm = torch.norm(dirs, dim=-1)\n            dirs = dirs * (1. / norm)[:, :, :, None]\n\n        return dirs\n\n"
  }
]