Repository: scene-verse/SceneVerse Branch: main Commit: 140d172620c3 Files: 161 Total size: 848.7 KB Directory structure: gitextract_y3zcu4ix/ ├── .gitignore ├── DATA.md ├── LICENSE ├── README.md ├── TRAIN.md ├── common/ │ ├── box_utils.py │ ├── dist_utils.py │ ├── io_utils.py │ ├── launch_utils.py │ ├── misc.py │ └── type_utils.py ├── configs/ │ └── final/ │ ├── all_anno.yaml │ ├── all_nomlm.yaml │ ├── all_noobj.yaml │ ├── all_noscene.yaml │ ├── all_pretrain.yaml │ ├── all_pretrain_125.yaml │ ├── all_pretrain_25.yaml │ ├── all_pretrain_50.yaml │ ├── all_pretrain_75.yaml │ ├── all_pretrain_objcap.yaml │ ├── all_pretrain_objcap_notemplate.yaml │ ├── all_pretrain_s3d.yaml │ ├── all_pretrain_unfreeze.yaml │ ├── all_rewrite.yaml │ ├── all_template.yaml │ ├── all_wo_both.yaml │ ├── all_wo_both_125.yaml │ ├── all_wo_both_25.yaml │ ├── all_wo_both_50.yaml │ ├── all_wo_multiscan.yaml │ ├── all_wo_scannet.yaml │ ├── debug.yaml │ ├── finetune/ │ │ ├── multiscan_finetune.yaml │ │ ├── multiscan_woL.yaml │ │ ├── nr3d_finetune.yaml │ │ ├── scannet_woL.yaml │ │ ├── scanqa_finetune.yaml │ │ ├── scanrefer_finetune.yaml │ │ ├── sqa3d_finetune.yaml │ │ └── sr3d_finetune.yaml │ ├── multiscan_only.yaml │ ├── nr3d_only.yaml │ ├── procthor_only.yaml │ ├── s3d_only.yaml │ ├── scanrefer_only.yaml │ ├── scanrefer_only_gttest.yaml │ └── sr3d_only.yaml ├── data/ │ ├── __init__.py │ ├── build.py │ ├── data_utils.py │ └── datasets/ │ ├── __init__.py │ ├── arkitscene.py │ ├── base.py │ ├── constant.py │ ├── data_augmentor.py │ ├── dataset_wrapper.py │ ├── hm.py │ ├── multiscan.py │ ├── procthor.py │ ├── rscan.py │ ├── scannet.py │ ├── scannet_base.py │ ├── scannet_old.py │ └── structure3d.py ├── evaluator/ │ ├── __init__.py │ ├── build.py │ ├── objcls_eval.py │ ├── pretrain_eval.py │ ├── referit3d_eval.py │ ├── scanqa_eval.py │ ├── scanrefer_eval.py │ └── sqa3d_eval.py ├── launch.py ├── model/ │ ├── __init__.py │ ├── build.py │ ├── objcls.py │ └── openvocab.py ├── modules/ │ ├── __init__.py │ ├── build.py │ ├── grounding/ │ │ ├── __init__.py │ │ └── unified_encoder.py │ ├── heads/ │ │ ├── __init__.py │ │ ├── grounding_head.py │ │ ├── pretrain_head.py │ │ └── qa_head.py │ ├── language/ │ │ ├── __init__.py │ │ ├── bert.py │ │ └── clip.py │ ├── layers/ │ │ ├── pointnet.py │ │ └── transformers.py │ ├── third_party/ │ │ ├── __init__.py │ │ └── pointnet2/ │ │ ├── _ext_src/ │ │ │ ├── include/ │ │ │ │ ├── ball_query.h │ │ │ │ ├── cuda_utils.h │ │ │ │ ├── group_points.h │ │ │ │ ├── interpolate.h │ │ │ │ ├── sampling.h │ │ │ │ └── utils.h │ │ │ └── src/ │ │ │ ├── ball_query.cpp │ │ │ ├── ball_query_gpu.cu │ │ │ ├── bindings.cpp │ │ │ ├── group_points.cpp │ │ │ ├── group_points_gpu.cu │ │ │ ├── interpolate.cpp │ │ │ ├── interpolate_gpu.cu │ │ │ ├── sampling.cpp │ │ │ └── sampling_gpu.cu │ │ ├── _version.py │ │ ├── pointnet2_modules.py │ │ ├── pointnet2_test.py │ │ ├── pointnet2_utils.py │ │ ├── pytorch_utils.py │ │ ├── requirements_new.txt │ │ └── setup.py │ ├── utils.py │ ├── vision/ │ │ ├── __init__.py │ │ ├── obj_cls_encoder.py │ │ └── pcd_openvocab_encoder.py │ └── weights.py ├── optim/ │ ├── __init__.py │ ├── build.py │ ├── loss/ │ │ ├── __init__.py │ │ ├── contra_loss.py │ │ └── loss.py │ ├── optimizer/ │ │ ├── __init__.py │ │ └── optim.py │ ├── scheduler.py │ └── utils.py ├── preprocess/ │ ├── README.md │ ├── __init__.py │ ├── arkitscenes.py │ ├── build.py │ ├── multiscan.py │ ├── rscan.py │ ├── sceneverse2hmsemantic.py │ ├── ssg/ │ │ ├── README.md │ │ ├── relationships/ │ │ │ ├── camera.py │ │ │ ├── hanging.py │ │ │ ├── 
init.py │ │ │ ├── multi_objs.py │ │ │ ├── proximity.py │ │ │ └── support.py │ │ ├── ssg_data/ │ │ │ ├── dictionary.py │ │ │ ├── script/ │ │ │ │ └── ObjNode.py │ │ │ └── ssg_visualize.py │ │ ├── ssg_main.py │ │ └── ssg_utils.py │ ├── structured3d.py │ └── utils/ │ ├── __init__.py │ ├── align_utils.py │ ├── constant.py │ └── label_convert.py ├── requirements.txt ├── run.py ├── trainer/ │ ├── __init__.py │ ├── build.py │ ├── debug_trainer.py │ ├── default_trainer.py │ ├── objpretrain_trainer.py │ └── openvocab_trainer.py └── visualize_data.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

================================================
FILE: DATA.md
================================================
## Data

* Note: As some of our users requested the mapping between HM3D object ids in SceneVerse and HM3D-semantics, we have added an additional file ([HM3D_tgtID2objID.zip](assets/HM3D_tgtID2objID.zip)) that provides this mapping. The json file for each scene contains a dictionary of ```{tgt_id: [hm3d_objid, hm3d_label]}```.

### Data Processing
We release data preprocessing examples for 3RScan, MultiScan, ARKitScenes and Structured3D, with more details [here](preprocess/README.md). We also release the [scripts](preprocess/ssg/README.md) for scene graph generation.

### Data Download
We currently host our data on G-drive and ask all applicants to fill out the form [here](https://forms.gle/AXMk7MH6bFXpCqd99). You should see one or multiple zip file segments for each dataset we provide. For datasets with multiple segments (e.g., ARKitScenes), you can unzip the files with:

```shell
# Directories with multiple zip segments
$ ls ARKitScenes/
-> ARKitScenes.zip ARKitScenes.z01

# Unzip from all zip segments
$ cd ARKitScenes/
$ zip -F ARKitScenes.zip --out combined.zip
$ unzip combined.zip
```

After unzipping, the files are organized as:

```shell
ARKitScenes/
|-- scan_data                        # Point cloud data
    |-- instance_id_to_label         # Reorganized instance id to label mapping
    |-- pcd_with_global_alignment    # Aligned scene point clouds
|-- annotations                      # Language annotations
    |-- splits
        |-- train_split.txt          # For all datasets, we provide the training split
        |-- val_split.txt            # For datasets with evaluation sets
    |-- <dataset_name>.json          # For all datasets except ScanNet; language for ScanNet is located at annotations/refer
```

### Data Visualization
For data browsing, we experimented with NVIDIA CUDA 11.8 on Ubuntu 22.04 and require the following setup:

```shell
$ conda create -n sceneverse python=3.9
$ conda activate sceneverse
$ pip install torch==2.2.0 torchvision==0.17.0 --index-url https://download.pytorch.org/whl/cu118
$ pip install numpy open3d
```

We provide a short script for visualizing scene and language data, you can use it with:

```shell
# Visualize scene and instance data
$ python visualize_data.py --root <data_root> --dataset <dataset_name>

# Visualize language data
$ python visualize_data.py --root <data_root> --dataset <dataset_name> --vis_refer
```

As our data contains scenes from existing datasets, please read carefully the terms of use for each dataset we provide in the form.

### Provided Language Types
We list the available data in the current version of SceneVerse in the table below:

| Dataset | Object Caption | Scene Caption | Ref-Annotation | Ref-Pairwise ```rel2``` | Ref-MultiObject ```relm``` | Ref-Star ```star``` | Ref-Chain (Optional) ```chain``` |
|:------------:|:--------------:|:-------------:|:--------------:|:-----------------------:|:--------------------------:|:-------------------:|:--------------------------------:|
| ScanNet | ✅ | ✅ | ScanRefer, Nr3D | ✅ | ✅ | ✅ | ✅ |
| MultiScan | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| ARKitScenes | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| HM3D | ```template``` | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 3RScan | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ |
| Structured3D | ```template``` | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ |
| ProcTHOR | ```template``` | ❌ | ❌ | ```template``` | ```template``` | ```template``` | ❌ |

For the generated object referrals, we provide both the direct template-based generations ```template``` and the LLM-refined versions ```gpt```. Please refer to our supplementary material for the description of the selected ```pair-wise``` / ```multi-object``` / ```star``` types. We also provide the ```chain``` type, which contains language that uses object A to refer to B and then B to refer to the target object C. As we found the ```chain``` type could sometimes lead to unnatural descriptions, we did not discuss it in the main paper. Feel free to inspect and use it in your projects.

For the remaining data, we hope to further refine and update our data in the following weeks, stay tuned!

================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2024 scene-verse

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

================================================
FILE: README.md
================================================

# SceneVerse: Scaling 3D Vision-Language Learning for Grounded Scene Understanding

*SceneVerse teaser figure*
We propose SceneVerse, the first million-scale 3D vision-language dataset with 68K 3D indoor scenes and 2.5M vision-language pairs. We demonstrate the scaling effect by (i) achieving state-of-the-art performance on all existing 3D visual grounding benchmarks and (ii) showcasing zero-shot transfer capabilities with our GPS (Grounded Pre-training for Scenes) model.

## News
- ![](https://img.shields.io/badge/New!-8A2BE2) [2024-12] Our follow-up work on situated question answering on SceneVerse is out, check it out [here](https://msr3d.github.io/)!
- [2024-10] Pre-trained checkpoints are now available, find detailed instructions in [TRAIN.md](TRAIN.md)!
- [2024-09] The scripts for scene graph generation are released.
- [2024-07] Training & inference code as well as preprocessing code is released; checkpoints & logs are on the way!
- [2024-07] Preprocessing code for scenes used in SceneVerse is released.
- [2024-07] SceneVerse is accepted by ECCV 2024! Training and inference code/checkpoints will come shortly, stay tuned!
- [2024-03] We release the data used in SceneVerse. Fill out the [form](https://forms.gle/AXMk7MH6bFXpCqd99) for the download link!
- [2024-01] We release SceneVerse on arXiv. Check out our [paper](https://arxiv.org/abs/2401.09340) and [website](https://scene-verse.github.io/).

## Data
See [DATA.md](DATA.md) for detailed instructions on data download, processing, and visualization. The data inventory is listed below:

| Dataset | Object Caption | Scene Caption | Ref-Annotation | Ref-Pairwise ```rel2``` | Ref-MultiObject ```relm``` | Ref-Star ```star``` | Ref-Chain (Optional) ```chain``` |
|:------------:|:--------------:|:-------------:|:--------------:|:-----------------------:|:--------------------------:|:-------------------:|:--------------------------------:|
| ScanNet | ✅ | ✅ | ScanRefer,
Nr3D | ✅ | ✅ | ✅ | ✅ | | MultiScan | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ARKitScenes | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | HM3D | ```template``` | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 3RScan | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | | Structured3D | ```template``` | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | | ProcTHOR | ```template``` | ❌ | ❌ | ```template``` | ```template``` | ```template``` | ❌ | ## Training and Inference See [TRAIN.md](TRAIN.md) for the inventory of available checkpoints and detailed instructions on training and testing with pre-trained checkpoints. The checkpoint inventory is listed below: | Setting | Description | Corresponding Experiment | Checkpoint based on experiment setting | |----------------------|-------------------------------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | ```pre-trained``` | GPS model pre-trained on SceneVerse | 3D-VL grounding (Tab.2) | [Model](https://drive.google.com/drive/folders/1FDjVaYZxHdMJgxB8stSHfI34Q7crItJc?usp=sharing) | | ```scratch``` | GPS model trained on datasets from scratch | 3D-VL grounding (Tab.2)
SceneVerse-val (Tab. 3) | [ScanRefer](https://drive.google.com/drive/folders/1d7sGm_D7kyj6Fmo0f8b6DPrhWYUCtWVq?usp=sharing), [Sr3D](https://drive.google.com/drive/folders/1bKGgXot8Sc6BB2MWAfW_OGdu0iq0RWZt?usp=sharing), [Nr3D](https://drive.google.com/drive/folders/14K-UaIeg0GHWFoaonIFHTHZZbDotukzV?usp=sharing), [SceneVerse-val](https://drive.google.com/drive/folders/1CeWwLIPEuK0b35I_gbiwu_OiaUEE42jD?usp=drive_link) | | ```fine-tuned``` | GPS model fine-tuned on datasets with grounding heads | 3D-VL grounding (Tab.2) | [ScanRefer](https://drive.google.com/drive/folders/1P5YprjIlBMAl0OQ38jgTDJyuFVIGiCMS?usp=sharing), [Sr3D](https://drive.google.com/drive/folders/1-LMYW6jy5wpqL_KlQQuvuSM7TDyo7M3g?usp=sharing), [Nr3D](https://drive.google.com/drive/folders/1sw-_hhF2__JgGCHE1yfyAQeNZ7jSrID0?usp=sharing) | | ```zero-shot``` | GPS model trained on SceneVerse without data from ScanNet and MultiScan | Zero-shot Transfer (Tab.3) | [Model](https://drive.google.com/drive/folders/11824oiZnaU8ChsNpH8zZKIT2i1PdJWSA?usp=sharing) | | ```zero-shot text``` | GPS | Zero-shot Transfer (Tab.3) | [ScanNet](https://drive.google.com/drive/folders/1TKIhb7xgGzwDiAdvznwTpKzkcJnG7GD0?usp=sharing), [SceneVerse-val](https://drive.google.com/drive/folders/18f65Q6313sa-blLCyspqjZRmWpKJPh3M?usp=sharing) | | ```text-ablation``` | Ablations on the type of language used during pre-training | Ablation on Text (Tab.7) | [Template only](https://drive.google.com/drive/folders/1Xo6FkbThHP3uLUJMblt3zgJiM0n3RbVK?usp=sharing), [Template+LLM](https://drive.google.com/drive/folders/1w9Oi8nWKZXOW3BcA0eiC1bgp7snk8ZKS?usp=sharing) | | ```scene-ablation``` | Ablations on the use of synthetic scenes during pre-training | Ablation on Scene (Tab.8) | [Real only](https://drive.google.com/drive/folders/1WZDf2BS7eG36NgGEdTuChICmVHF377is?usp=sharing), [S3D only](https://drive.google.com/drive/folders/1Zh4QfCs6l67ZeltvzOPZtokKkgkvxATc?usp=sharing), [ProcTHOR only](https://drive.google.com/drive/folders/1H9zm7vYxVn_zd2HYi49Js9R34AHnGi1d?usp=sharing) | | ```model-ablation``` | Ablations on the use of losses during pre-training | Ablation on Model Design (Tab.9) | [Refer only](https://drive.google.com/drive/folders/1yKF8dVPlcbKb-COcfUZbwcqWxt_uvzuc?usp=sharing), [Refer+Obj-lvl](https://drive.google.com/drive/folders/1C5L20UvTQj2my2t0BnqHZPsb_VaXxVjX?usp=sharing), [w/o Scene-lvl](https://drive.google.com/drive/folders/14jR43ils1-jop6K84hu1AqPqU9DcHucx?usp=sharing) | | ```3d-qa``` | Results for QA fine-tuning on ScanQA and SQA3D | 3D-QA Experiments (Tab.5) | [ScanQA](https://drive.google.com/drive/folders/1_Qluyeu-gvfyQSRoPNcPg7qss5IxFRwO?usp=sharing), [SQA3D](https://drive.google.com/drive/folders/1DGVqsqP12Y2Un10UAC5u9HLij0NJVzJC?usp=sharing) | ## BibTex ```bibtex @inproceedings{jia2024sceneverse, title={Sceneverse: Scaling 3d vision-language learning for grounded scene understanding}, author={Jia, Baoxiong and Chen, Yixin and Yu, Huangyue and Wang, Yan and Niu, Xuesong and Liu, Tengyu and Li, Qing and Huang, Siyuan}, booktitle={European Conference on Computer Vision (ECCV)}, year={2024} } ``` ## Acknowledgements We thank the authors from [ScanRefer](https://github.com/daveredrum/ScanRefer), [ScanNet](https://github.com/ScanNet/ScanNet), [3RScan](https://github.com/WaldJohannaU/3RScan), [ReferIt3D](https://github.com/referit3d/referit3d), [Structured3D](https://github.com/bertjiazheng/Structured3D), [HM3D](https://github.com/matterport/habitat-matterport-3dresearch), [ProcTHOR](https://github.com/allenai/procthor), 
[ARKitScenes](https://github.com/apple/ARKitScenes), [MultiScan](https://github.com/smartscenes/multiscan) for open-sourcing their awesome datasets. We also heavily adapted code from [ScanQA](https://github.com/ATR-DBI/ScanQA), [SQA3D](https://github.com/SilongYong/SQA3D), and [3D-VisTA](https://github.com/3d-vista/3D-VisTA) for training and inference.

================================================
FILE: TRAIN.md
================================================
# Training and Inference

## Environment Setup
To install the environment required by SceneVerse, run the following installation steps:

```bash
$ conda create -n sceneverse python=3.9
$ conda activate sceneverse
$ pip install -r requirements.txt
```

Meanwhile, SceneVerse depends on an efficient implementation of PointNet2 located in ```modules```. Remember to install it with:

```bash
$ cd modules/third_party/pointnet2
$ python setup.py install
$ cd ../..
```

## Model Configurations
### 1. Experiment Setup
We provide all experiment configurations in ```configs/final```; the experiment setting is described in the comment at the top of each configuration file. To correctly use the configuration files, you need to change the following fields so that all paths load correctly:
- ```base_dir```: save path for model checkpoints, configurations, and logs.
- ```logger.entity```: we used W&B for logging experiments, change it to your corresponding account.
- ```data.{DATASET}_base``` keys (e.g., ```scan_family_base```, ```rscan_base```): paths to the data of each dataset.
- ```model.vision.args.path```: path to the pre-trained object encoder (PointNet++).
- ```model.vision.args.lang_path```: deprecated, but basically text embeddings of the 607 classes in ScanNet.

You can walk through ```configs/final/all_pretrain.yaml``` and compare it with the other files to see how we controlled the data and objectives used in training.

## Experiments
### 1. Training and Inference
This codebase leverages the [Huggingface Accelerate](https://huggingface.co/docs/accelerate/index) and [Facebook Submitit](https://github.com/facebookincubator/submitit) packages for efficient model training on multi-node clusters. We provide a launcher file ```launch.py``` which supports three ways of launching experiments; the first two are shown below, and the third (a plain Python run for debugging) is described in the Debugging section:

```bash
# Launching using submitit on a SLURM cluster (e.g. 10 hour 1 node 4 GPU experiment with config file $CONFIG)
$ python launch.py --mode submitit --time 10 --qos $QOS --partition $PARTITION --mem_per_gpu 80 \
    --gpu_per_node 4 --config $CONFIG note=$NOTE name=$EXP_NAME

# Launching using accelerate with a multi-gpu instance
$ python launch.py --mode accelerate --gpu_per_node 4 --num_nodes 1 --config $CONFIG note=$NOTE name=$EXP_NAME
```

Basically, ```launch.py``` sets up process(es) to run the main entry point ```run.py``` under multi-GPU settings. You can directly overwrite configurations in the configuration file ```$CONFIG``` by setting property fields with ```=``` after all command line arguments (e.g., ```name=$EXP_NAME```, ```solver.epochs=400```, ```dataloader.batchsize=4```).

For testing and inference, remember to set up the testing data correctly in each configuration file and switch the ```mode``` field in the configuration to ```test``` (i.e., ```mode=test```).

### 2. Debugging
If you want to debug your code without an additional job launcher, you can also directly run the file ```run.py```.
As an example, you can directly run the file for debugging with:

```bash
# Single card direct run for debugging purposes
$ python run.py --config-path ${PROJ_PATH}/configs/final/ --config-name ${EXP_CONFIG_NAME}.yaml \
    num_gpu=1 hydra.run.dir=. hydra.output_subdir=null hydra/job_logging=disabled hydra/hydra_logging=disabled \
    debug.flag=True debug.debug_size=1 dataloader.batchsize=2 debug.hard_debug=True name=Debug_test
```

## Checkpoints
We provide all available checkpoints under the same data directory, in a folder named ```Checkpoints```. Here we provide detailed descriptions of the checkpoints in the table below:

| Setting | Description | Corresponding Experiment | Checkpoint based on experiment setting |
|----------------------|-------------------------------------------------------------------------|-----------------------------------------------------|----------------------------------------|
| ```pre-trained``` | GPS model pre-trained on SceneVerse | 3D-VL grounding (Tab.2) | [Model](https://drive.google.com/drive/folders/1FDjVaYZxHdMJgxB8stSHfI34Q7crItJc?usp=sharing) |
| ```scratch``` | GPS model trained on datasets from scratch | 3D-VL grounding (Tab.2),
SceneVerse-val (Tab. 3) | [ScanRefer](https://drive.google.com/drive/folders/1d7sGm_D7kyj6Fmo0f8b6DPrhWYUCtWVq?usp=sharing), [Sr3D](https://drive.google.com/drive/folders/1bKGgXot8Sc6BB2MWAfW_OGdu0iq0RWZt?usp=sharing), [Nr3D](https://drive.google.com/drive/folders/14K-UaIeg0GHWFoaonIFHTHZZbDotukzV?usp=sharing), [SceneVerse-val](https://drive.google.com/drive/folders/1CeWwLIPEuK0b35I_gbiwu_OiaUEE42jD?usp=drive_link) | | ```fine-tuned``` | GPS model fine-tuned on datasets with grounding heads | 3D-VL grounding (Tab.2) | [ScanRefer](https://drive.google.com/drive/folders/1P5YprjIlBMAl0OQ38jgTDJyuFVIGiCMS?usp=sharing), [Sr3D](https://drive.google.com/drive/folders/1-LMYW6jy5wpqL_KlQQuvuSM7TDyo7M3g?usp=sharing), [Nr3D](https://drive.google.com/drive/folders/1sw-_hhF2__JgGCHE1yfyAQeNZ7jSrID0?usp=sharing) | | ```zero-shot``` | GPS model trained on SceneVerse without data from ScanNet and MultiScan | Zero-shot Transfer (Tab.3) | [Model](https://drive.google.com/drive/folders/11824oiZnaU8ChsNpH8zZKIT2i1PdJWSA?usp=sharing) | | ```zero-shot text``` | GPS | Zero-shot Transfer (Tab.3) | [ScanNet](https://drive.google.com/drive/folders/1TKIhb7xgGzwDiAdvznwTpKzkcJnG7GD0?usp=sharing), [SceneVerse-val](https://drive.google.com/drive/folders/18f65Q6313sa-blLCyspqjZRmWpKJPh3M?usp=sharing) | | ```text-ablation``` | Ablations on the type of language used during pre-training | Ablation on Text (Tab.7) | [Template only](https://drive.google.com/drive/folders/1Xo6FkbThHP3uLUJMblt3zgJiM0n3RbVK?usp=sharing), [Template+LLM](https://drive.google.com/drive/folders/1w9Oi8nWKZXOW3BcA0eiC1bgp7snk8ZKS?usp=sharing) | | ```scene-ablation``` | Ablations on the use of synthetic scenes during pre-training | Ablation on Scene (Tab.8) | [Real only](https://drive.google.com/drive/folders/1WZDf2BS7eG36NgGEdTuChICmVHF377is?usp=sharing), [S3D only](https://drive.google.com/drive/folders/1Zh4QfCs6l67ZeltvzOPZtokKkgkvxATc?usp=sharing), [ProcTHOR only](https://drive.google.com/drive/folders/1H9zm7vYxVn_zd2HYi49Js9R34AHnGi1d?usp=sharing) | | ```model-ablation``` | Ablations on the use of losses during pre-training | Ablation on Model Design (Tab.9) | [Refer only](https://drive.google.com/drive/folders/1yKF8dVPlcbKb-COcfUZbwcqWxt_uvzuc?usp=sharing), [Refer+Obj-lvl](https://drive.google.com/drive/folders/1C5L20UvTQj2my2t0BnqHZPsb_VaXxVjX?usp=sharing), [w/o Scene-lvl](https://drive.google.com/drive/folders/14jR43ils1-jop6K84hu1AqPqU9DcHucx?usp=sharing) | | ```3d-qa``` | Results for QA fine-tuning on ScanQA and SQA3D | 3D-QA Experiments (Tab.5) | [ScanQA](https://drive.google.com/drive/folders/1_Qluyeu-gvfyQSRoPNcPg7qss5IxFRwO?usp=sharing), [SQA3D](https://drive.google.com/drive/folders/1DGVqsqP12Y2Un10UAC5u9HLij0NJVzJC?usp=sharing) | To properly use the pre-trained checkpoints, you can use the ```pretrain_ckpt_path``` key in the configs: ```shell # Directly testing the checkpoint $ python launch.py --mode submitit --qos $QOS --partition $PARTITION --mem_per_gpu 80 \ --gpu_per_node 4 --config $CONFIG note=$NOTE name=$EXP_NAME mode=test \ pretrain_ckpt_path=$PRETRAIN_CKPT # Fine-tuning with pre-trained checkpoint $ python launch.py --mode submitit --qos $QOS --partition $PARTITION --mem_per_gpu 80 \ --gpu_per_node 4 --config $CONFIG note=$NOTE name=$EXP_NAME \ pretrain_ckpt_path=$PRETRAIN_CKPT ``` For fine-tuning the pre-trained checkpoint on datasets, you can use the fine-tuning config files provided under ```configs/final/finetune```. 
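For a quick sanity check of a downloaded checkpoint before wiring it into a config, a minimal inspection sketch (assumptions, not confirmed by this repo: the released files are plain ```torch.save``` archives, the ```ckpt/best.pth``` naming follows the convention visible in the configs, and the state dict may or may not be wrapped under a ```"model"``` key):

```python
import torch

# Hypothetical local path; adjust to wherever you unpacked the download.
CKPT_PATH = "Checkpoints/pre-trained/ckpt/best.pth"

# Load on CPU so inspection does not require a GPU.
ckpt = torch.load(CKPT_PATH, map_location="cpu")

# The archive may be a raw state_dict or a dict wrapping one under a key
# such as "model" (an assumption; verify against the actual file).
state_dict = ckpt.get("model", ckpt) if isinstance(ckpt, dict) else ckpt

# Print a few parameter names and shapes to confirm the file is intact.
for name, value in list(state_dict.items())[:10]:
    print(name, getattr(value, "shape", type(value)))
```

If the printed parameter names match the encoders built under ```model/``` and ```modules/```, the file should be safe to pass via ```pretrain_ckpt_path```.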
================================================
FILE: common/box_utils.py
================================================
import numpy as np


def box3d_iou(corners1, corners2):
    ''' Compute 3D bounding box IoU.
    Input:
        corners1: numpy array (8,3), assume up direction is Z
        corners2: numpy array (8,3), assume up direction is Z
    Output:
        iou: 3D bounding box IoU
    '''
    x_min_1, x_max_1, y_min_1, y_max_1, z_min_1, z_max_1 = get_box3d_min_max(corners1)
    x_min_2, x_max_2, y_min_2, y_max_2, z_min_2, z_max_2 = get_box3d_min_max(corners2)
    xA = np.maximum(x_min_1, x_min_2)
    yA = np.maximum(y_min_1, y_min_2)
    zA = np.maximum(z_min_1, z_min_2)
    xB = np.minimum(x_max_1, x_max_2)
    yB = np.minimum(y_max_1, y_max_2)
    zB = np.minimum(z_max_1, z_max_2)
    inter_vol = np.maximum((xB - xA), 0) * np.maximum((yB - yA), 0) * np.maximum((zB - zA), 0)
    box_vol_1 = (x_max_1 - x_min_1) * (y_max_1 - y_min_1) * (z_max_1 - z_min_1)
    box_vol_2 = (x_max_2 - x_min_2) * (y_max_2 - y_min_2) * (z_max_2 - z_min_2)
    iou = inter_vol / (box_vol_1 + box_vol_2 - inter_vol + 1e-8)
    return iou


def get_box3d_min_max(corner):
    ''' Compute min and max coordinates of a 3D bounding box.
    Note: only for axis-aligned bounding boxes.
    Input:
        corner: numpy array (8,3), assume up direction is Z
    Output:
        min and max coordinates of the 3D bounding box along each axis
    '''
    min_coord = corner.min(axis=0)
    max_coord = corner.max(axis=0)
    x_min, x_max = min_coord[0], max_coord[0]
    y_min, y_max = min_coord[1], max_coord[1]
    z_min, z_max = min_coord[2], max_coord[2]
    return x_min, x_max, y_min, y_max, z_min, z_max


def get_3d_box(center, box_size):
    ''' box_size is array(l,w,h), center is xyz of the box center,
        output is a (8,3) array of the 3D box corners.
        Similar to utils/compute_orientation_3d
    '''
    l, w, h = box_size
    # Alternative corner ordering (kept for reference):
    # x_corners = [l/2,l/2,-l/2,-l/2,l/2,l/2,-l/2,-l/2]
    # y_corners = [h/2,h/2,h/2,h/2,-h/2,-h/2,-h/2,-h/2]
    # z_corners = [w/2,-w/2,-w/2,w/2,w/2,-w/2,-w/2,w/2]
    x_corners = [l/2,l/2,-l/2,-l/2,l/2,l/2,-l/2,-l/2]
    y_corners = [w/2,-w/2,-w/2,w/2,w/2,-w/2,-w/2,w/2]
    z_corners = [h/2,h/2,h/2,h/2,-h/2,-h/2,-h/2,-h/2]
    corners_3d = np.vstack([x_corners, y_corners, z_corners])
    corners_3d[0,:] = corners_3d[0,:] + center[0]
    corners_3d[1,:] = corners_3d[1,:] + center[1]
    corners_3d[2,:] = corners_3d[2,:] + center[2]
    corners_3d = np.transpose(corners_3d)
    return corners_3d

================================================
FILE: common/dist_utils.py
================================================
import functools
import pickle

import torch
import torch.distributed as dist

import logging
logger = logging.getLogger(__name__)


########################### Basic utility for distributed info ################################
def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_rank():
    """
    Get the rank of the current process.
    """
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def get_world_size():
    """
    Get the size of the world.
    """
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def is_master_proc(num_gpus=8):
    """
    Determines if the current process is the master process on each node.
    """
    if is_dist_avail_and_initialized():
        return dist.get_rank() % num_gpus == 0
    else:
        return True


def is_root_proc():
    """
    Determines if the current process is the root process.
""" if is_dist_avail_and_initialized(): return dist.get_rank() == 0 else: return True ############################## Data gathering across devices ################################## def _serialize_to_tensor(data, group, max_size=1024): """ Serialize the tensor to ByteTensor. Note that only `gloo` and `nccl` backend is supported. Args: data (data): data to be serialized. group (group): pytorch dist group. Returns: tensor (ByteTensor): tensor that serialized. """ backend = dist.get_backend(group) assert backend in ["gloo", "nccl"] device = torch.device("cpu" if backend == "gloo" else "cuda") buffer = pickle.dumps(data) if len(buffer) > max_size ** 3: logger.warning( "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( get_rank(), len(buffer) / (max_size ** 3), device ) ) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to(device=device) return tensor def _pad_to_largest_tensor(tensor, group): """ Padding all the tensors from different GPUs to the largest ones. Args: tensor (tensor): tensor to pad. group (group): pytorch dist group. Returns: list[int]: size of the tensor, on each rank Tensor: padded tensor that has the max size """ world_size = dist.get_world_size(group=group) assert ( world_size >= 1 ), "comm.gather/all_gather must be called from ranks within the given group!" local_size = torch.tensor( [tensor.numel()], dtype=torch.int64, device=tensor.device ) size_list = [ torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size) ] dist.all_gather(size_list, local_size, group=group) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes if local_size != max_size: padding = torch.zeros( (max_size - local_size,), dtype=torch.uint8, device=tensor.device ) tensor = torch.cat((tensor, padding), dim=0) return size_list, tensor def broadcast(object): if isinstance(object, torch.Tensor): dist.broadcast(tensor=object, src=0) else: sync_tensor = torch.Tensor([object]).cuda() dist.broadcast(tensor=sync_tensor, src=0) object = sync_tensor[0].item() return object def all_gather(tensors): """ All gathers the provided tensors from all processes across machines. Args: tensors (list): tensors to perform all gather across all processes in all machines. """ gather_list = [] output_tensor = [] world_size = dist.get_world_size() for tensor in tensors: tensor_placeholder = [ torch.ones_like(tensor) for _ in range(world_size) ] dist.all_gather(tensor_placeholder, tensor, async_op=False) gather_list.append(tensor_placeholder) for gathered_tensor in gather_list: output_tensor.append(torch.cat(gathered_tensor, dim=0)) return output_tensor def all_reduce(tensors, average=True): """ All reduce the provided tensors from all processes across machines. Args: tensors (list): tensors to perform all reduce across all processes in all machines. average (bool): scales the reduced tensor by the number of overall processes across all machines. """ for tensor in tensors: dist.all_reduce(tensor, async_op=False) if average: world_size = dist.get_world_size() for tensor in tensors: tensor.mul_(1.0 / world_size) return tensors @functools.lru_cache() def _get_global_gloo_group(): """ Return a process group based on gloo backend, containing all the ranks The result is cached. Returns: (group): pytorch dist group. 
""" if dist.get_backend() == "nccl": return dist.new_group(backend="gloo") else: return dist.group.WORLD def all_gather_unaligned(data, group=None): """ Run all_gather on arbitrary picklable data (not necessarily tensors). Args: data: any picklable object group: a torch process group. By default, will use a group which contains all ranks on gloo backend. Returns: list[data]: list of data gathered from each rank """ if get_world_size() == 1: return [data] if group is None: group = _get_global_gloo_group() if dist.get_world_size(group) == 1: return [data] tensor = _serialize_to_tensor(data, group) size_list, tensor = _pad_to_largest_tensor(tensor, group) max_size = max(size_list) # receiving Tensor from all ranks tensor_list = [ torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list ] dist.all_gather(tensor_list, tensor, group=group) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list ================================================ FILE: common/io_utils.py ================================================ import csv import pickle import json import cv2 import yaml import numpy as np from pathlib import Path import torch import open3d from plyfile import PlyData def make_dir(dir_path): if not Path(dir_path).exists(): Path(dir_path).mkdir(parents=True, exist_ok=True) def load_imgs(img_paths, option=cv2.IMREAD_COLOR): imgs = [cv2.imread(img_path, option) for img_path in img_paths] return imgs def load_pickle(filename): with Path(filename).open("rb") as f: return pickle.load(f) def save_pickle(data, filename): with Path(filename).open("wb") as f: pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) def load_json(filename): with Path(filename).open("rb") as f: return json.load(f) def save_json(data, filename, save_pretty=True, sort_keys=False): with Path(filename).open("w") as f: if save_pretty: f.write(json.dumps(data, indent=4, sort_keys=sort_keys)) else: json.dump(data, f) def load_jsonl(filename): with Path(filename).open("r") as f: return [json.loads(l.strip("\n")) for l in f.readlines()] def save_jsonl(data, filename): with Path(filename).open("w") as f: f.write("\n".join([json.dumps(e) for e in data])) def load_yaml(filename): with Path(filename).open("r") as f: return yaml.load(f, Loader=yaml.SafeLoader) def save_yaml(data, filename): with Path(filename).open("w") as f: yaml.dump(data, f, default_flow_style=False) def load_csv(filename, delimiter=","): idx2key = None contents = {} with Path(filename).open("r") as f: reader = csv.reader(f, delimiter=delimiter) for l_idx, row in reader: if l_idx == 0: idx2key = row for k_idx, key in enumerate(idx2key): contents[key] = [] else: for c_idx, col in enumerate(row): contents[idx2key[c_idx]].append(col) return contents, idx2key def save_csv(data, filename, cols=None, delimiter=","): with Path(filename).open("w") as f: writer = csv.writer(f, delimiter=delimiter) num_entries = len(data[list(data.keys())[0]]) assert cols is not None, "Must have column names for dumping csv files." 
writer.writerow(cols) for l_idx in range(num_entries): row = [data[key][l_idx] for key in cols] writer.writerow(row) def load_numpy(filename): return np.load(filename, allow_pickle=True) def save_numpy(data, filename): np.save(filename, data, allow_pickle=True) def load_tensor(filename): return torch.load(filename) def save_tensor(data, filename): torch.save(data, filename) def load_ply(filepath): with open(filepath, "rb") as f: plydata = PlyData.read(f) data = plydata.elements[0].data coords = np.array([data["x"], data["y"], data["z"]], dtype=np.float32).T feats = None labels = None if ({"red", "green", "blue"} - set(data.dtype.names)) == set(): feats = np.array([data["red"], data["green"], data["blue"]], dtype=np.uint8).T if "label" in data.dtype.names: labels = np.array(data["label"], dtype=np.uint32) return coords, feats, labels def load_ply_with_normals(filepath): mesh = open3d.io.read_triangle_mesh(str(filepath)) if not mesh.has_vertex_normals(): mesh.compute_vertex_normals() vertices = np.asarray(mesh.vertices) normals = np.asarray(mesh.vertex_normals) coords, feats, labels = load_ply(filepath) assert np.allclose(coords, vertices), "different coordinates" feats = np.hstack((feats, normals)) return coords, feats, labels ================================================ FILE: common/launch_utils.py ================================================ import os from pathlib import Path import subprocess import submitit huggingface_fix = f"TRANSFORMERS_OFFLINE=1 CURL_CA_BUNDLE=''" class SubmititLauncher: def __init__(self, args): self.args = args def __call__(self): host_name = os.popen( "scontrol show hostnames $SLURM_JOB_NODELIST" ).read().split("\n")[0] self._set_gpu_args() # Using Accelerate for launching multi_gpu = "--multi_gpu" if self.args.num_nodes * self.args.gpu_per_node > 1 else "" opts = " ".join(self.args.opts) if len(self.args.opts) > 0 else "" opts += f" num_gpu={self.args.num_nodes * self.args.gpu_per_node} " full_cfg_path = Path(self.args.config) cfg_path, cfg_file = str(full_cfg_path.parent), str(full_cfg_path.name) cmd = f"{huggingface_fix} accelerate launch --num_machines {self.args.num_nodes} \ --mixed_precision {self.args.mixed_precision} {multi_gpu} \ --num_processes {self.args.gpu_per_node * self.args.num_nodes} \ --num_cpu_threads_per_process {self.args.cpu_per_task} \ --main_process_ip {host_name} \ --main_process_port {self.args.port} \ --machine_rank {self.args.node_id} \ --dynamo_backend no \ {self.args.run_file} \ --config-path {cfg_path} \ --config-name {cfg_file} \ num_gpu={self.args.num_nodes * self.args.gpu_per_node} \ hydra.run.dir=. 
\ hydra.output_subdir=null \ hydra/job_logging=disabled \ hydra/hydra_logging=disabled {opts}" subprocess.run(cmd, shell=True) def _set_gpu_args(self): job_env = submitit.JobEnvironment() self.args.job_dir = str(self.args.job_dir).replace("%j", job_env.job_id) self.args.node_id = int(job_env.global_rank / self.args.gpu_per_node) def submitit_launch(args): """ Multi node script launching with Submitit """ additional_parameters = {} if args.nodelist != "": # if specifying node id nodelist = f"{str(args.nodelist)}" additional_parameters["nodelist"] = nodelist executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) executor.update_parameters( name=args.name, mem_gb=args.mem_per_gpu * args.gpu_per_node * args.num_nodes, gpus_per_node=args.gpu_per_node, tasks_per_node=1, cpus_per_task=args.gpu_per_node * args.cpu_per_task, nodes=args.num_nodes, slurm_qos=args.qos, slurm_partition=args.partition, slurm_account=args.account, slurm_time=args.time * 60, slurm_signal_delay_s=120, slurm_additional_parameters=additional_parameters ) launcher = SubmititLauncher(args) job = executor.submit(launcher) print(f"submitted job: {job.job_id}") def accelerate_launch(args): """ Single node script launching with Accelerate """ opts = " ".join(args.opts) if len(args.opts) > 0 else "" opts += f" num_gpu={args.num_nodes * args.gpu_per_node} " multi_gpu = "--multi_gpu" if args.num_nodes * args.gpu_per_node > 1 else "" full_cfg_path = Path(args.config) cfg_path, cfg_file = str(full_cfg_path.parent), str(full_cfg_path.name) cmd = f"{huggingface_fix} accelerate launch --num_machines {args.num_nodes} \ {multi_gpu} \ --mixed_precision {args.mixed_precision} \ --num_processes {args.gpu_per_node * args.num_nodes} \ --num_cpu_threads_per_process {args.cpu_per_task} \ --dynamo_backend no \ {args.run_file} \ --config-path {cfg_path} \ --config-name {cfg_file} \ num_gpu={args.num_nodes * args.gpu_per_node} \ hydra.run.dir=. \ hydra.output_subdir=null \ hydra/job_logging=disabled \ hydra/hydra_logging=disabled {opts}" subprocess.run(cmd, shell=True) def python_launch(args): """ Vanilla python launcher for degbugging purposes """ opts = " ".join(args.opts) if len(args.opts) > 0 else "" full_cfg_path = Path(args.config) cfg_path, cfg_file = str(full_cfg_path.parent), str(full_cfg_path.name) cmd = f"{huggingface_fix} python {args.run_file} " \ f"--config-path {cfg_path} " \ f"--config-name {cfg_file} " \ f"num_gpu=1 " \ f"hydra.run.dir=. 
" \ f"hydra.output_subdir=null " \ f"hydra/job_logging=disabled " \ f"hydra/hydra_logging=disabled {opts}" subprocess.run(cmd, shell=True) ================================================ FILE: common/misc.py ================================================ import os import glob import importlib import functools import torch from typing import Any from accelerate.logging import get_logger from accelerate.state import PartialState from accelerate.utils import recursively_apply from accelerate.utils.constants import TORCH_DISTRIBUTED_OPERATION_TYPES from accelerate.utils.dataclasses import DistributedType logger = get_logger(__name__) def rsetattr(obj, attr, val): pre, _, post = attr.rpartition('.') return setattr(rgetattr(obj, pre) if pre else obj, post, val) # using wonder's beautiful simplification: https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427 def rgetattr(obj, attr, *args): def _getattr(obj, attr): return getattr(obj, attr, *args) return functools.reduce(_getattr, [obj] + attr.split('.')) # def import_all(exclude_list=None): # if exclude_list is None: # exclude_list = ["__init__.py", "build.py"] # print(f"file: {__file__}") # current_directory = os.path.dirname(__file__) # module_names = [ # os.path.splitext(file)[0] for file in os.listdir(current_directory) # if file.endswith(".py") and file not in exclude_list # ] # for module_name in module_names: # module = importlib.import_module(f".{module_name}", package=__name__) # globals().update({name: getattr(module, name) for name in getattr(module, '__all__', [])}) # __all__ = [name for name in globals() if not name.startswith("_")] def _gpu_gather_object(object: Any): # by JY Huang: re-implement the method for gathering non-tensor objects output_objects = [None for _ in range(PartialState().num_processes)] torch.distributed.all_gather_object(output_objects, object) if isinstance(object, (list, tuple)): output_list = [] for item in output_objects: output_list.extend(item) return output_list elif isinstance(object, dict): template = output_objects[0] output_dict = {} for k, v in template.items(): output_dict[k] = [] for item in output_objects: if isinstance(item[k], list): output_dict[k].extend(item[k]) else: output_dict[k].append(item[k]) return output_dict def gather_object(object: Any): """ Recursively gather object in a nested list/tuple/dictionary of objects from all devices. Args: object (nested list/tuple/dictionary of picklable object): The data to gather. Returns: The same data structure as `object` with all the objects sent to every device. """ if PartialState().distributed_type == DistributedType.TPU: raise NotImplementedError("gather objects in TPU is not supported") elif PartialState().distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES: return _gpu_gather_object(object) else: return object def gather_for_metrics(accelerator, input_data): """ by JY Huang: re-implement this method for gathering non-tensor objects Refer source code to https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.gather_for_metrics """ try: recursively_apply(lambda x: x, input_data, error_on_other_type=True) all_tensors = True except TypeError: all_tensors = False if not all_tensors: data = gather_object(input_data) else: data = accelerator.gather(input_data) try: if accelerator.gradient_state.end_of_dataloader: # at the end of a dataloader, `gather_for_metrics` regresses to # `gather` unless the dataset has a remainder so log. 
if accelerator.gradient_state.remainder == -1: logger.info( "The used dataset had no length, returning gathered tensors. You should drop the remainder yourself." ) return data elif accelerator.gradient_state.remainder > 0: # Last batch needs to be truncated on distributed systems as it contains additional samples def _adjust_samples(tensor): return tensor[: accelerator.gradient_state.remainder] if tensor is not None else None if all_tensors: # This only applies to tensors, as defined in `recursively_apply` return recursively_apply(_adjust_samples, data) else: if isinstance(data, (list, tuple)): return _adjust_samples(data) elif isinstance(data, dict): return {k: _adjust_samples(v) for k, v in data.items()} else: raise NotImplementedError(f"Non-tensor gather only supports list, tuple or dict") else: # remainder is 0 # no remainder even though at end of dataloader, so nothing to do. return data else: # Not at the end of the dataloader, no need to adjust the tensors return data except Exception: # Dataset had no length or raised an error return data def gather_dict(accelerator, data_dict): data_dict_non_tensor = {k : v for k, v in data_dict.items() if not isinstance(v, torch.Tensor)} data_dict_non_tensor = gather_for_metrics(accelerator, data_dict_non_tensor) data_dict = {k : v for k, v in data_dict.items() if isinstance(v, torch.Tensor)} data_dict = gather_for_metrics(accelerator, data_dict) data_dict.update(data_dict_non_tensor) return data_dict ================================================ FILE: common/type_utils.py ================================================ import torch from omegaconf import OmegaConf def cfg2dict(cfg): return OmegaConf.to_container(cfg, resolve=True) def _to_device(state, device): """ usually load from cpu checkpoint but need to load to cuda """ if isinstance(state, torch.Tensor): new_state = state.to(device, non_blocking=True) # assume propoerly set py torch.cuda.set_device elif isinstance(state, list): new_state = torch.tensor([_to_device(t, device) for t in state]).to(device) elif isinstance(state, tuple): new_state = torch.tensor(tuple(_to_device(t, device) for t in state)).to(device) elif isinstance(state, dict): new_state = {n: _to_device(t, device) for n, t in state.items()} else: try: if not isinstance(state, str): new_state = torch.tensor(state).to(device) else: new_state = state except: raise ValueError(f"The provided tensor can not be transfered to {device}") return new_state ================================================ FILE: configs/final/all_anno.yaml ================================================ ### # Pretrain with human annotation only ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "all_anno" train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: True max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer', 'referit3d' ] referit3d: anno_type: ['nr3d'] 
sr3d_plus_aug: False sgrefer: anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt', 'template'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno'] val: sources: [ 'anno'] test: sources: [ 'anno'] ARKitSceneSpatialRefer: train: sources: ['anno'] val: sources: [ 'anno' ] test: sources: [ 'anno' ] HMSpatialRefer: train: sources: [ 'anno' ] val: sources: [ 'anno' ] test: sources: [ 'anno' ] use_voxel: False scan_family_base: "/scratch2/generalvision/chenyixin/datasets/SceneVerse/ScanNet" rscan_base: "/scratch2/generalvision/chenyixin/datasets/SceneVerse/3RScan" arkitscene_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/ARKitScenes' multiscan_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/MultiScan' hm_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/HM3D' procthor_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/ProcThor' s3d_base: /scratch2/generalvision/chenyixin/datasets/SceneVerse/Structured3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 
dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/all_nomlm.yaml ================================================ ### # Pretrain on all data without MLM loss ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "all_nomlm" train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: True max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno','rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno','rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno','rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 
'anno','rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/scratch/masaccio/existing_datasets/scannet" rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ # 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', # 'TextSceneBetweenBatch' ] vis_loss_list: [ # 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', # 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/all_noobj.yaml ================================================ ### # Pretrain on all data without object-level alignment ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: 
flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "all" train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: True max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/scratch/masaccio/existing_datasets/scannet" rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 
'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: False path: '' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ # 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', # 'TextSceneBetweenBatch' ] vis_loss_list: [ # 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', # 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/all_noscene.yaml ================================================ ### # Pretrain on all data without scene-level alignment ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "all_noscene" train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: True max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 
'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno','rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno','rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/scratch/masaccio/existing_datasets/scannet" rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: 
name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', # 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', # 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/all_pretrain.yaml ================================================ ### # Pretrain on all data with all losses ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "all" train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: True max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt', 'template'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/scratch2/generalvision/chenyixin/datasets/SceneVerse/ScanNet" rscan_base: 
"/scratch2/generalvision/chenyixin/datasets/SceneVerse/3RScan" arkitscene_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/ARKitScenes' multiscan_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/MultiScan' hm_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/HM3D' procthor_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/ProcThor' s3d_base: /scratch2/generalvision/chenyixin/datasets/SceneVerse/Structured3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/all_pretrain_125.yaml ================================================ ### # Pretrain on 12.5% of all data ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # 
dataset details data: note: "all0.125" train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: True subset_ratio: 0.125 max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/scratch/masaccio/existing_datasets/scannet" rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" 
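The `trainer` key is a registry name: `trainer/build.py` resolves the string to a class (here `OpenVocabTrainer`, defined in `trainer/openvocab_trainer.py`). A minimal sketch of the registry pattern such a key typically drives; the decorator, dictionary, and builder names below are illustrative, not the repository's actual API:

# Hypothetical registry sketch mapping the config's `trainer` string to a class.
TRAINER_REGISTRY = {}

def register_trainer(name):
    def decorator(cls):
        TRAINER_REGISTRY[name] = cls
        return cls
    return decorator

@register_trainer('OpenVocabTrainer')
class OpenVocabTrainer:
    def __init__(self, cfg):
        self.cfg = cfg  # the parsed YAML shown in these files

def build_trainer(cfg):
    # cfg['trainer'] is exactly the string set in the config above.
    return TRAINER_REGISTRY[cfg['trainer']](cfg)
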
ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/all_pretrain_25.yaml ================================================ ### # Pretrain on 25% of all data ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "all0.25" train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True subset_ratio: 0.25 mask_strategy: random use_scene_cap: True max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 
'star_template', 'star_gpt' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/scratch/masaccio/existing_datasets/scannet" rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 
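The `pretrain_head` configured just below combines a masked-language term (`lm_cls_loss`) with contrastive alignment terms at object and scene level. As a rough illustration of what a name like `TextObjWithinBatch` implies, here is a standard symmetric InfoNCE objective over paired text/object features within one batch; this is a generic sketch, not the code in `optim/loss/contra_loss.py`, and the temperature value is an assumption:

import torch
import torch.nn.functional as F

def text_obj_within_batch(text_feat, obj_feat, temperature=0.07):
    # text_feat, obj_feat: (B, D) tensors; row i of each is a matched pair.
    text_feat = F.normalize(text_feat, dim=-1)
    obj_feat = F.normalize(obj_feat, dim=-1)
    logits = text_feat @ obj_feat.t() / temperature  # (B, B) cosine similarities
    target = torch.arange(text_feat.size(0), device=text_feat.device)
    # Symmetric cross-entropy: text -> object and object -> text directions.
    return 0.5 * (F.cross_entropy(logits, target) + F.cross_entropy(logits.t(), target))
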
inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/all_pretrain_50.yaml ================================================ ### # Pretrain on 50% of all data ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "all0.50" train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True subset_ratio: 0.5 mask_strategy: random use_scene_cap: True max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/scratch/masaccio/existing_datasets/scannet" rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: 
'/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/all_pretrain_75.yaml ================================================ ### # Pretrain on 75% of all data ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "all.75" train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1
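The two ratios just set drive the `MaskDatasetWrapper`: with `mask_strategy: random`, roughly 15% of language tokens and 10% of object tokens are hidden per sample, and the masked positions supply the targets for `lm_cls_loss`. A minimal sketch of uniform random masking; the BERT `[MASK]` id (103) and the replace-every-selected-token policy are assumptions, not necessarily what `data/datasets/dataset_wrapper.py` does:

import torch

def random_mask(token_ids, mask_ratio, mask_id=103):
    # token_ids: (L,) or (B, L) integer tensor; 103 is BERT's [MASK] id.
    mask = torch.rand(token_ids.shape, device=token_ids.device) < mask_ratio
    masked = token_ids.clone()
    masked[mask] = mask_id
    return masked, mask  # `mask` marks the positions the loss is computed on
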
rot_aug: True subset_ratio: 0.75 mask_strategy: random use_scene_cap: True max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/scratch/masaccio/existing_datasets/scannet" rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 
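The `warmup_cosine` schedule configured here ramps the learning rate linearly for `warmup_steps`, then decays it along a cosine curve floored by the `minimum_ratio` that follows. A minimal sketch of that multiplier, matching the config's parameter names but not necessarily the exact code in `optim/scheduler.py`:

import math

def warmup_cosine(step, total_steps, warmup_steps=500, minimum_ratio=0.1):
    # Returns the factor the base lr (5e-4 in these configs) is scaled by at `step`.
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return minimum_ratio + (1.0 - minimum_ratio) * 0.5 * (1.0 + math.cos(math.pi * progress))
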
minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/all_pretrain_objcap.yaml ================================================ ### # Pretrain on all data adding all object captions ### # Experiment general info name: "Debug" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "all" train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','RScanSpatialRefer','HMSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: True max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt', 'template'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt','obj_caption_gpt','obj_caption_template'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt','obj_caption_template'] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt','obj_caption_template'] MultiScanSpatialRefer: train: sources: 
['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt','obj_caption_gpt','obj_caption_template'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt','obj_caption_template'] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt','obj_caption_template'] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt','obj_caption_gpt','obj_caption_template'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_gpt','obj_caption_template'] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_gpt','obj_caption_template'] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt', 'obj_caption_template'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_template'] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_template'] use_voxel: False scan_family_base: "/scratch/masaccio/existing_datasets/scannet" rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 
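Each module block above carries its own `lr` (1e-5 for the BERT language encoder, 1e-4 for vision and grounding) on top of the solver-level 5e-4, and `freeze: True` keeps the pretrained point backbone loaded from `path` fixed. A sketch of how such per-module AdamW parameter groups are commonly assembled; the `model.language` / `model.vision` / `model.grounding` attribute names are hypothetical:

import torch

def build_optimizer(model):
    # `freeze: True` above: stop gradients through the pretrained point backbone.
    for p in model.vision.backbone.parameters():
        p.requires_grad = False
    groups = [
        {'params': [p for p in model.language.parameters() if p.requires_grad], 'lr': 1e-5},
        {'params': [p for p in model.vision.parameters() if p.requires_grad], 'lr': 1e-4},
        {'params': [p for p in model.grounding.parameters() if p.requires_grad], 'lr': 1e-4},
    ]
    groups = [g for g in groups if g['params']]  # drop fully frozen modules
    # Solver defaults from the config: lr 5e-4, betas [0.9, 0.98].
    return torch.optim.AdamW(groups, lr=5e-4, betas=(0.9, 0.98))
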
grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/all_pretrain_objcap_notemplate.yaml ================================================ ### # Pretrain on all data without template-based object captions ### # Experiment general info name: "OV_w_Cap" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "all" train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','RScanSpatialRefer','HMSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: True max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt','obj_caption_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt'] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt'] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt','obj_caption_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt'] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt'] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt','obj_caption_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_gpt'] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_gpt'] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt', 'obj_caption_template'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 
'star_template','obj_caption_template'] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_template'] use_voxel: False scan_family_base: "/scratch2/generalvision/chenyixin/datasets/SceneVerse/ScanNet" rscan_base: "/scratch2/generalvision/chenyixin/datasets/SceneVerse/3RScan" arkitscene_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/ARKitScenes' multiscan_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/MultiScan' hm_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/HM3D' procthor_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/ProcThor' s3d_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/Structured3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 200 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/all_pretrain_s3d.yaml ================================================ ### # Pretrain on all data with Structured 3D ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to
feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "all" train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer','S3DSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: True max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] S3DSpatialRefer: train: sources: [ 'rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt' ] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/scratch/masaccio/existing_datasets/scannet" rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' s3d_base: '/scratch/masaccio/existing_datasets/Structured3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] 
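Across these configs, scene-level rotation is the only augmentation actually enabled: with `p: 1.0`, every scene is rotated about the `[0.0, 0.0, 1.0]` (gravity) axis, which keeps an axis-aligned indoor scan physically plausible. A minimal NumPy sketch of that transform, not the repository's `data/datasets/data_augmentor.py`; uniform angle sampling is an assumption:

import numpy as np

def rotate_scene_z(points, p=1.0, rng=np.random):
    # points: (N, 3) scene point cloud; rotate about the z (up) axis with probability p.
    if rng.rand() > p:
        return points
    theta = rng.uniform(0.0, 2.0 * np.pi)
    c, s = np.cos(theta), np.sin(theta)
    rot = np.array([[c, -s, 0.0],
                    [s,  c, 0.0],
                    [0.0, 0.0, 1.0]])
    return points @ rot.T
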
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: False
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: False
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'

# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 250
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 1000
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
    save: False

# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: '/scratch/masaccio/results/ALLObjPretrain_b512_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj+S3DPretrainObj_1113scannetws3d/2023-11-14-09:29:10.796592/ckpt/best.pth'
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522

loss_type: 'ListLoss'
loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
vis_loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
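The naming_keywords list at the top of each of these configs decides which config values get baked into the run's output directory under base_dir; the checkpoint paths further down (e.g. ALLObjPretrain_b512_Pretrain_..._<timestamp>/ckpt/best.pth) show the resulting pattern. Below is a minimal sketch of that expansion, assuming dotted keys index into the loaded YAML dict; lookup and build_exp_dir are illustrative names, not the repo's actual launch utilities.

# Sketch only: how "naming_keywords" could expand into an experiment
# directory name. Helper names are hypothetical; the real logic lives in
# the repo's launch/config code.
import datetime
import pathlib

import yaml


def lookup(cfg: dict, dotted_key: str):
    """Resolve a dotted key like 'dataloader.batchsize' in a nested dict."""
    node = cfg
    for part in dotted_key.split("."):
        node = node[part]
    return node


def build_exp_dir(cfg: dict) -> pathlib.Path:
    parts = []
    for key in cfg["naming_keywords"]:
        if key == "time":  # 'time' is special-cased to a timestamp
            parts.append(datetime.datetime.now().isoformat())
        else:
            parts.append(str(lookup(cfg, key)))
    return pathlib.Path(cfg["base_dir"]) / "_".join(parts)


cfg = yaml.safe_load(open("configs/final/all_pretrain_s3d.yaml"))
print(build_exp_dir(cfg))  # e.g. <base_dir>/64_Pretrain__<timestamp>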
================================================
FILE: configs/final/all_pretrain_unfreeze.yaml
================================================
###
# Pretrain on all data with object encoder unfrozen
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""

# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all"
  train: ['ScanNetSpatialRefer', 'ARKitSceneSpatialRefer', 'MultiScanSpatialRefer', 'HMSpatialRefer', 'RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['scanrefer', 'referit3d', 'sgrefer', 'sgcaption']
        referit3d:
          anno_type: ['sr3d', 'nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        sgcaption:
          anno_type: ['gpt']
      val:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: False
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: False
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'

# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 250
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 100
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
    save: False

# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: False
      path: '/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth'
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522

loss_type: 'ListLoss'
loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
vis_loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
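The only substantive differences between this file and the baseline pretrain config are vision.args.freeze: False and the object-encoder checkpoint it starts from; each module also carries its own lr. A sketch of how per-module lr fields and a freeze flag could be turned into optimizer parameter groups follows; the attribute names on model and the exact key placement are assumptions, not the repo's optim builder.

# Sketch only: per-module learning rates plus a freeze flag as AdamW
# parameter groups. Module/attribute names are illustrative.
import torch


def build_param_groups(model, cfg):
    groups = []
    for name in ("language", "vision", "grounding"):
        module = getattr(model, name)          # assumed attribute layout
        sub_cfg = cfg["model"][name]
        if sub_cfg.get("args", {}).get("freeze", False):
            for p in module.parameters():
                p.requires_grad_(False)        # frozen: no param group at all
            continue
        # PyYAML reads '1e-4' as a string, hence the explicit float()
        groups.append({"params": module.parameters(),
                       "lr": float(sub_cfg["lr"])})
    return groups


# optimizer = torch.optim.AdamW(build_param_groups(model, cfg),
#                               betas=(0.9, 0.98))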
================================================
FILE: configs/final/all_rewrite.yaml
================================================
###
# Pretrain on all LLM-refined data only
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""

# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all_rewrite"
  train: ['ScanNetSpatialRefer', 'ARKitSceneSpatialRefer', 'MultiScanSpatialRefer', 'HMSpatialRefer', 'RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['scanrefer', 'sgrefer', 'sgcaption']
        referit3d:
          anno_type: ['sr3d', 'nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        sgcaption:
          anno_type: ['gpt']
      val:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: False
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: False
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'

# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
    save: False

# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522

loss_type: 'ListLoss'
loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
vis_loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
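This variant keeps only LLM-refined ('*_gpt') plus human-annotated sources in the training splits, while the next file keeps only '*_template' ones. A quick way to inspect what an ablation actually trains on, assuming the nesting shown above (data.args.<Dataset>.train.sources); adjust the keys if the layout differs in your copy.

# Sketch only: print per-dataset training sources for two ablation configs.
import yaml


def train_sources(path: str) -> dict:
    cfg = yaml.safe_load(open(path))
    return {dataset: cfg["data"]["args"][dataset]["train"]["sources"]
            for dataset in cfg["data"]["train"]}


for p in ("configs/final/all_rewrite.yaml", "configs/final/all_template.yaml"):
    print(p, train_sources(p))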
================================================
FILE: configs/final/all_template.yaml
================================================
###
# Pretrain on all template-based generated data only
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""

# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "template_only"
  train: ['ScanNetSpatialRefer', 'ARKitSceneSpatialRefer', 'MultiScanSpatialRefer', 'HMSpatialRefer', 'RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['scanrefer', 'sgrefer']
        referit3d:
          anno_type: ['sr3d', 'nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['rel2_template', 'relm_template', 'star_template']
        sgcaption:
          anno_type: ['gpt']
      val:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template', 'relm_template', 'star_template']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['rel2_template', 'relm_template', 'star_template']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['rel2_template', 'relm_template', 'star_template']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['rel2_template', 'relm_template', 'star_template']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: False
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: False
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
    save: False

# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522

loss_type: 'ListLoss'
loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
vis_loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
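All of these configs share the warmup_cosine schedule: a linear warmup over warmup_steps, then cosine decay toward minimum_ratio of the base lr. Below is a self-contained sketch of that multiplier, matching the shape the sched block describes; it is not the repo's scheduler class in optim/scheduler.py.

# Sketch only: LR multiplier for a warmup + cosine schedule.
import math


def warmup_cosine(step: int, total_steps: int,
                  warmup_steps: int = 500, minimum_ratio: float = 0.1) -> float:
    """Return the multiplier applied to the base lr at `step`."""
    if step < warmup_steps:
        return step / max(1, warmup_steps)          # linear warmup 0 -> 1
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))  # 1 -> 0
    return minimum_ratio + (1.0 - minimum_ratio) * cosine


# Usage, e.g. with PyTorch:
#   lr_lambda = lambda s: warmup_cosine(s, total_steps)
#   torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)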
================================================
FILE: configs/final/all_wo_both.yaml
================================================
###
# Pretrain on all data without ScanNet and MultiScan data
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""

# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "wo_scannet_multiscan"
  train: ['ARKitSceneSpatialRefer', 'HMSpatialRefer', 'RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['scanrefer', 'referit3d', 'sgrefer', 'sgcaption']
        referit3d:
          anno_type: ['sr3d', 'nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        sgcaption:
          anno_type: ['gpt']
      val:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: False
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: False
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'

# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
    save: False

# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522

loss_type: 'ListLoss'
loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
vis_loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
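The data.train list is the ablation knob in this family of configs: each entry names one scene-domain dataset, and dropping ScanNet and MultiScan from the list is all that distinguishes this file from the full-data one. A sketch of how such a list could be materialized into one training set, with `registry` standing in for the repo's dataset build logic in data/build.py:

# Sketch only: concatenate one dataset object per entry in data.train.
from torch.utils.data import ConcatDataset


def build_train_set(cfg: dict, registry: dict):
    """`registry` maps config names (e.g. 'RScanSpatialRefer') to dataset
    classes/factories; assumed here, not the repo's actual builder."""
    datasets = [registry[name](cfg) for name in cfg["data"]["train"]]
    return ConcatDataset(datasets)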
"/scratch/masaccio/existing_datasets/scannet" rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/all_wo_both_25.yaml ================================================ ### # Pretrain on 25% of all data without ScanNet and MultiScan ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "wo_scannet_multiscan.25" train: ['ARKitSceneSpatialRefer','HMSpatialRefer','RScanSpatialRefer'] val: 
================================================
FILE: configs/final/all_wo_both_25.yaml
================================================
###
# Pretrain on 25% of all data without ScanNet and MultiScan
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""

# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "wo_scannet_multiscan.25"
  train: ['ARKitSceneSpatialRefer', 'HMSpatialRefer', 'RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    subset_ratio: 0.25
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['scanrefer', 'referit3d', 'sgrefer', 'sgcaption']
        referit3d:
          anno_type: ['sr3d', 'nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        sgcaption:
          anno_type: ['gpt']
      val:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: False
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: False
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'

# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False
solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
    save: False

# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522

loss_type: 'ListLoss'
loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
vis_loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]

================================================
FILE: configs/final/all_wo_both_50.yaml
================================================
###
# Pretrain on 50% of all data without ScanNet and MultiScan
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""

# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "wo_scannet_multiscan.50"
  train: ['ARKitSceneSpatialRefer', 'HMSpatialRefer', 'RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    subset_ratio: 0.50
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['scanrefer', 'referit3d', 'sgrefer', 'sgcaption']
        referit3d:
          anno_type: ['sr3d', 'nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        sgcaption:
          anno_type: ['gpt']
      val:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: False
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: False
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'

# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
    save: False

# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522

loss_type: 'ListLoss'
loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
vis_loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
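loss_type: 'ListLoss' with loss_list/vis_loss_list suggests a simple aggregator: sum every named term and log the visible ones (note that 'TextObjBetweenBatch' is commented out in every config here). A hypothetical sketch of that shape, not the repo's optim/loss implementation:

# Sketch only: sum named loss terms; log only the ones in vis_loss_list.
import torch


def list_loss(loss_fns: dict, batch: dict,
              loss_list: list, vis_loss_list: list):
    """`loss_fns` maps names like 'lm_cls_loss' to callables returning
    scalar tensors; the mapping itself is assumed, not the repo's API."""
    losses = {name: loss_fns[name](batch) for name in loss_list}
    total = torch.stack(list(losses.values())).sum()
    logs = {name: losses[name].detach()
            for name in vis_loss_list if name in losses}
    return total, logs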
================================================
FILE: configs/final/all_wo_multiscan.yaml
================================================
###
# Pretrain on all data without MultiScan
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""

# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "wo_multiscan"
  train: ['ScanNetSpatialRefer', 'ARKitSceneSpatialRefer', 'HMSpatialRefer', 'RScanSpatialRefer']
  val: ['MultiScanSpatialRefer']
  test: ['MultiScanSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'gt'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['scanrefer', 'referit3d', 'sgrefer', 'sgcaption']
        referit3d:
          anno_type: ['sr3d', 'nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        sgcaption:
          anno_type: ['gpt']
      val:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno']
      test:
        sources: ['anno']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: False
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: False
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'

# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ReferIt3DEval'
    save: False

# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522

loss_type: 'ListLoss'
loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
vis_loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
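The MaskDatasetWrapper used as the train wrapper throughout, together with txt_mask_ratio: 0.15, pc_mask_ratio: 0.1 and mask_strategy: random, implies random masking of caption tokens and object point features. A sketch under those assumptions; the [MASK] id 103 matches bert-base-uncased, everything else (shapes, zeroing masked objects) is illustrative:

# Sketch only: random masking of text tokens and per-object features.
import torch


def random_mask(token_ids: torch.Tensor, obj_feats: torch.Tensor,
                txt_mask_ratio: float = 0.15, pc_mask_ratio: float = 0.1,
                mask_token_id: int = 103):  # 103 = [MASK] in bert-base-uncased
    # Mask a random txt_mask_ratio fraction of tokens.
    txt_mask = torch.rand(token_ids.shape) < txt_mask_ratio
    masked_ids = token_ids.clone()
    masked_ids[txt_mask] = mask_token_id
    # Mask a random pc_mask_ratio fraction of objects (first dim).
    obj_mask = torch.rand(obj_feats.shape[0]) < pc_mask_ratio
    masked_feats = obj_feats.clone()
    masked_feats[obj_mask] = 0.0  # zero out masked object features
    return masked_ids, txt_mask, masked_feats, obj_mask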
================================================
FILE: configs/final/all_wo_scannet.yaml
================================================
###
# Pretrain on all data without ScanNet
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""

# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "wo_scannet"
  train: ['ARKitSceneSpatialRefer', 'MultiScanSpatialRefer', 'HMSpatialRefer', 'RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['scanrefer', 'referit3d', 'sgrefer', 'sgcaption']
        referit3d:
          anno_type: ['sr3d', 'nr3d']
          sr3d_plus_aug: True
        sgrefer:
          # fixed a stray doubled quote before 'rel2_gpt' in the original
          anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        sgcaption:
          anno_type: ['gpt']
      val:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno', 'rel2_template', 'rel2_gpt', 'relm_template', 'relm_gpt', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: False
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: False
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'

# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
    save: False

# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522

loss_type: 'ListLoss'
loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
vis_loss_list: [
  'lm_cls_loss',
  'TextObjWithinBatch',
  # 'TextObjBetweenBatch',
  'TextSceneBetweenBatch'
]
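The debug config below flips a handful of switches relative to the files above (num_gpu: 1, hard_debug: True, debug_size: 3, DebugTrainer, VisualizeDatasetWrapper). A guess at how a debug block like this could be consumed to truncate datasets for a quick smoke test; purely illustrative plumbing, not the repo's trainer code:

# Sketch only: shrink a dataset when the debug block is active.
from torch.utils.data import Subset


def maybe_truncate(dataset, debug_cfg: dict):
    if debug_cfg.get("flag") or debug_cfg.get("hard_debug"):
        n = min(len(dataset), debug_cfg.get("debug_size", 20))
        return Subset(dataset, range(n))  # keep only the first n samples
    return dataset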
================================================
FILE: configs/final/debug.yaml
================================================
###
# Debugging
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 1
mode: "train"
note: ""

# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/mnt/fillipo/baoxiong/results"
exp_dir: ""
save_frequency: 10
resume: False

debug:
  flag: False
  debug_size: 3
  hard_debug: True

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all"
  train: ['RScanSpatialRefer', 'ScanNetSpatialRefer', 'ARKitSceneSpatialRefer', 'MultiScanSpatialRefer', 'HMSpatialRefer']
  # train: ['RScanSpatialRefer','ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','ProcThorSpatialRefer','S3DSpatialRefer']
  # train: ['ScanNetSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: False
    max_scene_cap_len: 300
    load_scene_pcds: True
    max_pcd_num_points: 240000
    subset_ratio: 0
    ScanNetSpatialRefer:
      train:
        sources: ['scanrefer', 'referit3d', 'sgrefer', 'sgcaption']
        referit3d:
          anno_type: ['sr3d', 'nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        sgcaption:
          anno_type: ['gpt']
      val:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    S3DSpatialRefer:
      train:
        # sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
        sources: ['relm_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    ProcThorSpatialRefer:
      train:
        sources: ['star_template']
      val:
        sources: ['rel2_template', 'relm_template', 'star_template']
      test:
        sources: ['rel2_template', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/mnt/fillipo/Datasets/SceneVerse/ScanNet"
  rscan_base: "/mnt/fillipo/Datasets/SceneVerse/3RScan"
  arkitscene_base: '/mnt/fillipo/Datasets/SceneVerse/ARKitScenes'
  multiscan_base: '/mnt/fillipo/Datasets/SceneVerse/MultiScan'
  hm_base: '/mnt/fillipo/Datasets/SceneVerse/HM3D'
  procthor_base: '/mnt/fillipo/Datasets/SceneVerse/ProcThor'
  s3d_base: '/mnt/fillipo/Datasets/SceneVerse/Structured3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: False
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: False
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'

# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'VisualizeDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "DebugTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
    save: False

# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: "/mnt/fillipo/baoxiong/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth"
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/mnt/fillipo/baoxiong/results/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/finetune/multiscan_finetune.yaml ================================================ ### # Finetune on MultiScan ### # Experiment general info name: "FinalOVFinetune" rng_seed: 42 num_gpu: 2 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "multiscan" train: ['MultiScanSpatialRefer'] val: ['MultiScanSpatialRefer'] test: ['MultiScanSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'gt' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: True max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: ['sgrefer','sgcaption'] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: [ 'anno' ] val: sources: [ 'anno' ] test: sources: [ 'anno' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/scratch/masaccio/existing_datasets/scannet" rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: True p: 1.0 axis_align: 
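The `naming_keywords` list above drives how each run's save directory is composed: every dotted key is resolved against the loaded config and its value appended to the run name (the checkpoint paths later in these configs, e.g. `ALLObjPretrain_b64_Pretrain_..._1113real_all/<timestamp>`, show the resulting pattern). A minimal sketch of that resolution, assuming the real logic lives elsewhere in the repo (e.g. run.py / common/launch_utils.py) and may format values differently:

# Hedged sketch only, not the repository's exact naming code.
from datetime import datetime

def build_exp_name(cfg: dict) -> str:
    parts = [cfg["name"]]
    for key in cfg["naming_keywords"]:
        if key == "time":
            parts.append(datetime.now().strftime("%Y-%m-%d-%H:%M:%S.%f"))
        else:
            node = cfg
            for field in key.split("."):  # e.g. "dataloader.batchsize"
                node = node[field]
            parts.append(str(node))
    return "_".join(p for p in parts if p != "")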
================================================
FILE: configs/final/finetune/multiscan_finetune.yaml
================================================
###
# Finetune on MultiScan
###
# Experiment general info
name: "FinalOVFinetune"
rng_seed: 42
num_gpu: 2
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False
debug:
  flag: False
  debug_size: 20
  hard_debug: False
logger:
  name: "wandb"
  entity: "bigai-gvl"
# dataset details
data:
  note: "multiscan"
  train: ['MultiScanSpatialRefer']
  val: ['MultiScanSpatialRefer']
  test: ['MultiScanSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'gt'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['sgrefer','sgcaption']
        referit3d:
          anno_type: ['sr3d', 'nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        # sgcaption: anno_type: ['gpt']
      val:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno']
      val:
        sources: ['anno']
      test:
        sources: ['anno']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: True
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: True
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True
# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'ScanFamilyDatasetWrapperOld'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'
# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""
# dataloader details
dataloader:
  # This is a per-gpu batchsize
  batchsize: 256
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False
solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 50
  epochs_per_eval: 1
  lr: 1e-4
  grad_norm: 5.0
  epochs: 250
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
eval:
  train:
    name: 'ReferIt3DEval'
  val:
    name: 'ReferIt3DEval'
  save: False
# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    # name: 'pointnet_point_encoder'
    # args:
    #   path: None
    #   freeze: False
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: "/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth"
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_loc: 6
    lr: 1e-4
    inter: before
  heads:
    head_list: ['ground_head']
    pretrain_head:
      name: 'PretrainHeadV1'
      args:
        hidden_size: 768
        vocab_size: 30522
    ground_head:
      name: "GroundHeadV1"
      args:
        hidden_size: 384
        input_size: 768
        sem_cls_size: 607
        dropout: 0.3
        detach_all_aux_loss: True
  loss_type: 'ListLoss'
  loss_list: [
    # 'TextObjWithinBatch'
    'og3d_loss'
  ]
  vis_loss_list: [
    # 'TextObjWithinBatch'
    'og3d_loss'
  ]

================================================
FILE: configs/final/finetune/multiscan_woL.yaml
================================================
###
# Finetune on MultiScan (unseen language)
###
# Experiment general info
name: "FinalOVFinetune"
rng_seed: 42
num_gpu: 2
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False
debug:
  flag: False
  debug_size: 20
  hard_debug: False
logger:
  name: "wandb"
  entity: "bigai-gvl"
# dataset details
data:
  note: "multiscan_wo_L"
  train: ['MultiScanSpatialRefer']
  val: ['MultiScanSpatialRefer']
  test: ['MultiScanSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'gt'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['sgrefer','sgcaption']
        referit3d:
          anno_type: ['sr3d', 'nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        # sgcaption: anno_type: ['gpt']
      val:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno']
      test:
        sources: ['anno']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: True
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: True
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True
# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'ScanFamilyDatasetWrapperOld'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'
# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""
# dataloader details
dataloader:
  # This is a per-gpu batchsize
  batchsize: 256
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False
solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 50
  epochs_per_eval: 1
  lr: 1e-4
  grad_norm: 5.0
  epochs: 250
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 5000
eval:
  train:
    name: 'ReferIt3DEval'
  val:
    name: 'ReferIt3DEval'
  save: False
# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    # name: 'pointnet_point_encoder'
    # args:
    #   path: None
    #   freeze: False
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: "/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth"
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_loc: 6
    lr: 1e-4
    inter: before
  heads:
    head_list: ['ground_head']
    pretrain_head:
      name: 'PretrainHeadV1'
      args:
        hidden_size: 768
        vocab_size: 30522
    ground_head:
      name: "GroundHeadV1"
      args:
        hidden_size: 384
        input_size: 768
        sem_cls_size: 607
        dropout: 0.3
        detach_all_aux_loss: True
  loss_type: 'ListLoss'
  loss_list: [
    # 'TextObjWithinBatch'
    'og3d_loss'
  ]
  vis_loss_list: [
    # 'TextObjWithinBatch'
    'og3d_loss'
  ]
================================================
FILE: configs/final/finetune/nr3d_finetune.yaml
================================================
###
# Finetune on Nr3D
###
# Experiment general info
name: "FinalOVFinetune"
rng_seed: 42
num_gpu: 2
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False
debug:
  flag: False
  debug_size: 20
  hard_debug: False
logger:
  name: "wandb"
  entity: "bigai-gvl"
# dataset details
data:
  note: "nr3d_whead"
  train: ['ScanNetSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'gt'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    ScanNetSpatialRefer:
      train:
        sources: ['referit3d']
        referit3d:
          anno_type: ['nr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['chain_gpt', 'chain_template', 'rel2_template', 'relm_template', 'star_template']
        # sgcaption: anno_type: ['gpt']
      val:
        sources: ['referit3d']
        referit3d:
          anno_type: ['nr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['referit3d']
        referit3d:
          anno_type: ['nr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      val:
        sources: ['chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      val:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      val:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: True
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: True
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True
# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'ScanFamilyDatasetWrapperOld'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'
# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""
# dataloader details
dataloader:
  # This is a per-gpu batchsize
  batchsize: 256
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False
solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 50
  epochs_per_eval: 1
  lr: 1e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 5000
eval:
  train:
    name: 'ReferIt3DEval'
  val:
    name: 'ReferIt3DEval'
  save: False
# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    # name: 'pointnet_point_encoder'
    # args:
    #   path: None
    #   freeze: False
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: "/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth"
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_loc: 6
    lr: 1e-4
    inter: before
  heads:
    head_list: ['ground_head']
    pretrain_head:
      name: 'PretrainHeadV1'
      args:
        hidden_size: 768
        vocab_size: 30522
    ground_head:
      name: "GroundHeadV1"
      args:
        hidden_size: 384
        input_size: 768
        sem_cls_size: 607
        dropout: 0.3
        detach_all_aux_loss: True
  loss_type: 'ListLoss'
  loss_list: [
    # 'TextObjWithinBatch'
    'og3d_loss'
  ]
  vis_loss_list: [
    # 'TextObjWithinBatch'
    'og3d_loss'
  ]

================================================
FILE: configs/final/finetune/scannet_woL.yaml
================================================
###
# Finetune on ScanRefer (unseen language)
###
# Experiment general info
name: "FinalOVFinetune"
rng_seed: 42
num_gpu: 2
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False
debug:
  flag: False
  debug_size: 20
  hard_debug: False
logger:
  name: "wandb"
  entity: "bigai-gvl"
# dataset details
data:
  note: "scannet_wo_L"
  train: ['ScanNetSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'gt'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['sgrefer','sgcaption']
        referit3d:
          anno_type: ['sr3d', 'nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['chain_gpt', 'chain_template', 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        # sgcaption: anno_type: ['gpt']
      val:
        sources: ['referit3d']
        referit3d:
          anno_type: ['nr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['referit3d']
        referit3d:
          anno_type: ['nr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['chain_template','chain_gpt','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno','chain_template','chain_gpt','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno','chain_template','chain_gpt','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno','chain_template','chain_gpt','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: True
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: True
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True
# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'ScanFamilyDatasetWrapperOld'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'
# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""
# dataloader details
dataloader:
  # This is a per-gpu batchsize
  batchsize: 256
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False
solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 10
  epochs_per_eval: 1
  lr: 1e-4
  grad_norm: 5.0
  epochs: 100
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 1500
eval:
  train:
    name: 'ReferIt3DEval'
  val:
    name: 'ReferIt3DEval'
  save: False
# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    # name: 'pointnet_point_encoder'
    # args:
    #   path: None
    #   freeze: False
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: "/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth"
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_loc: 6
    lr: 1e-4
    inter: before
  heads:
    head_list: ['ground_head']
    pretrain_head:
      name: 'PretrainHeadV1'
      args:
        hidden_size: 768
        vocab_size: 30522
    ground_head:
      name: "GroundHeadV1"
      args:
        hidden_size: 384
        input_size: 768
        sem_cls_size: 607
        dropout: 0.3
        detach_all_aux_loss: True
  loss_type: 'ListLoss'
  loss_list: [
    # 'TextObjWithinBatch'
    'og3d_loss'
  ]
  vis_loss_list: [
    # 'TextObjWithinBatch'
    'og3d_loss'
  ]
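As the comment in the dataloader block notes, `batchsize` is per GPU, so for the config above the effective optimization batch works out as follows:

# batchsize x num_gpu x gradient_accumulation_steps
effective_batch = 256 * 2 * 1
assert effective_batch == 512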
================================================
FILE: configs/final/finetune/scanqa_finetune.yaml
================================================
# Experiment general info
name: "OV_ScanQA"
rng_seed: 42
num_gpu: 2
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/mnt/fillipo/baoxiong/results"
exp_dir: ""
save_frequency: 10
resume: False
debug:
  flag: False
  hard_debug: False
  debug_size: 20
logger:
  name: "wandb"
  entity: "buzz-beater"
# dataset details
data:
  train: ['ScanNetScanQAOld']
  val: ['ScanNetScanQAOld']
  test: ['ScanNetScanQAOld']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    rot_aug: True
    ScanNetScanQAOld:
      train:
        use_unanswer: True
      val:
        use_unanswer: True
      test:
        use_unanswer: True
        test_file: "test_w_obj" # or "test_wo_obj"
  use_voxel: False
  scan_family_base: "/mnt/fillipo/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/mnt/fillipo/scratch/masaccio/existing_datasets/3RScan-base"
# task details: 'pretrain', 'scanrefer', 'referit3d', 'scanqa', 'default'
task: 'ScanQA'
data_wrapper:
  train: 'ScanFamilyDatasetWrapperOld'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'
# Training details
trainer: "DefaultTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""
# dataloader details
dataloader:
  # This is a per-gpu batchsize
  batchsize: 32
  num_workers: 2
  balance_dataset: False
  filter_empty_annotations: False
solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 10
  epochs_per_eval: 5
  lr: 1e-4
  grad_norm: 5.0
  epochs: 100
  optim:
    name: "AdamW"
    args:
      betas: [0.9, 0.98]
  sched:
    name: "warmup_cosine"
    args:
      warmup_steps: 5000
eval:
  name: "ScanQAEval"
  save: False
# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: "BERTLanguageEncoder"
    args:
      weights: "bert-base-uncased"
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    # name: "pointnet_point_encoder"
    # args:
    #   path: None
    #   freeze: False
    name: "PointOpenVocabEncoder"
    args:
      backbone: "pointnet++"
      hidden_size: 768
      freeze: True
      path: "/mnt/fillipo/baoxiong/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth"
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/mnt/fillipo/baoxiong/results/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
    inter: before
  heads:
    head_list: ["qa_head"]
    qa_head:
      name: "QAHeadV1"
      args:
        hidden_size: 768
        mlp_size: 256
        glimpse: 1
        flat_out_size: 512
        num_answers: 8864
  loss_type: "ListLoss"
  loss_list: [
    "answer_loss",
    'TextObjWithinBatch',
  ]
  vis_loss_list: [
    "answer_loss",
    'TextObjWithinBatch',
  ]
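These configs are plain YAML, so any standard loader can inspect them. A hedged example with PyYAML (the repository may well use its own config utilities instead), reading the ScanQA answer-vocabulary size from the file above:

import yaml

with open("configs/final/finetune/scanqa_finetune.yaml") as f:
    cfg = yaml.safe_load(f)
# 8864 candidate answers for the ScanQA classification head
print(cfg["model"]["heads"]["qa_head"]["args"]["num_answers"])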
================================================
FILE: configs/final/finetune/scanrefer_finetune.yaml
================================================
###
# Finetune on ScanRefer
###
# Experiment general info
name: "FinalOVFinetune"
rng_seed: 42
num_gpu: 2
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False
debug:
  flag: False
  debug_size: 20
  hard_debug: False
logger:
  name: "wandb"
  entity: "bigai-gvl"
# dataset details
data:
  note: "scanrefer_whead"
  train: ['ScanNetSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    ScanNetSpatialRefer:
      train:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['chain_gpt', 'chain_template', 'rel2_template', 'relm_template', 'star_template']
        # sgcaption: anno_type: ['gpt']
      val:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      val:
        sources: ['chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      val:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      val:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: True
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: True
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True
# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'ScanFamilyDatasetWrapperOld'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'
# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""
# dataloader details
dataloader:
  # This is a per-gpu batchsize
  batchsize: 256
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False
solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 50
  epochs_per_eval: 1
  lr: 1e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 5000
eval:
  train:
    name: 'ScanReferEval'
  val:
    name: 'ScanReferEval'
  save: False
# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    # name: 'pointnet_point_encoder'
    # args:
    #   path: None
    #   freeze: False
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: "/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth"
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_loc: 6
    lr: 1e-4
    inter: before
  heads:
    head_list: ['ground_head']
    pretrain_head:
      name: 'PretrainHeadV1'
      args:
        hidden_size: 768
        vocab_size: 30522
    ground_head:
      name: "GroundHeadV1"
      args:
        hidden_size: 384
        input_size: 768
        sem_cls_size: 607
        dropout: 0.3
        detach_all_aux_loss: True
  loss_type: 'ListLoss'
  loss_list: [
    # 'TextObjWithinBatch'
    'og3d_loss'
  ]
  vis_loss_list: [
    # 'TextObjWithinBatch'
    'og3d_loss'
  ]
================================================
FILE: configs/final/finetune/sqa3d_finetune.yaml
================================================
# Experiment general info
name: "OV_SQA3D"
rng_seed: 42
num_gpu: 2
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/mnt/fillipo/baoxiong/results"
exp_dir: ""
save_frequency: 10
resume: False
debug:
  flag: False
  hard_debug: False
  debug_size: 20
logger:
  name: "wandb"
  entity: "bigai-gvl"
# dataset details
data:
  train: ['ScanNetSQA3D']
  val: ['ScanNetSQA3D']
  test: ['ScanNetSQA3D']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    rot_aug: True
    ScanNetSQA3D:
      train:
        use_unanswer: True
      val:
        use_unanswer: True
      test:
        use_unanswer: True
  use_voxel: False
  scan_family_base: "/mnt/fillipo/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/mnt/fillipo/scratch/masaccio/existing_datasets/3RScan-base"
# task details: 'pretrain', 'scanrefer', 'referit3d', 'scanqa', 'default'
task: 'SQA3D'
data_wrapper:
  train: 'ScanFamilyDatasetWrapperOld'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'
# Training details
trainer: "DefaultTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""
# dataloader details
dataloader:
  # This is a per-gpu batchsize
  batchsize: 32
  num_workers: 2
  balance_dataset: False
  filter_empty_annotations: False
solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 10
  epochs_per_eval: 5
  lr: 1e-4
  grad_norm: 5.0
  epochs: 100
  optim:
    name: "AdamW"
    args:
      betas: [0.9, 0.98]
  sched:
    name: "warmup_cosine"
    args:
      warmup_steps: 5000
eval:
  name: "SQA3DEval"
  save: False
# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: "BERTLanguageEncoder"
    args:
      weights: "bert-base-uncased"
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    # name: "pointnet_point_encoder"
    # args:
    #   path: None
    #   freeze: False
    name: "PointOpenVocabEncoder"
    args:
      backbone: "pointnet++"
      hidden_size: 768
      freeze: True
      path: "/mnt/fillipo/baoxiong/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth"
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/mnt/fillipo/baoxiong/results/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
    inter: before
  heads:
    head_list: ["qa_head"]
    qa_head:
      name: "QAHeadV1"
      args:
        hidden_size: 768
        mlp_size: 256
        glimpse: 1
        flat_out_size: 512
        num_answers: 706
  loss_type: "ListLoss"
  loss_list: [ "answer_loss" ]
  vis_loss_list: [ "answer_loss" ]
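The two QA configs above differ mainly in the answer vocabulary (num_answers: 8864 for ScanQA vs. 706 for SQA3D): question answering is cast as classification over a fixed answer set. A toy stand-in head with the same dimensions, purely illustrative; the real QAHeadV1 in modules/heads/qa_head.py is more involved (for instance, it takes a `glimpse` argument for attention over object tokens):

import torch.nn as nn

class ToyAnswerHead(nn.Module):
    # Illustrative only; mirrors hidden_size/flat_out_size/num_answers above.
    def __init__(self, hidden_size=768, flat_out_size=512, num_answers=706):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, flat_out_size),
            nn.GELU(),
            nn.Linear(flat_out_size, num_answers),  # logits over answer vocab
        )

    def forward(self, fused_feat):
        return self.mlp(fused_feat)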
================================================
FILE: configs/final/finetune/sr3d_finetune.yaml
================================================
###
# Finetune on Sr3D
###
# Experiment general info
name: "FinalOVFinetune"
rng_seed: 42
num_gpu: 2
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False
debug:
  flag: False
  debug_size: 20
  hard_debug: False
logger:
  name: "wandb"
  entity: "bigai-gvl"
# dataset details
data:
  note: "sr3d_whead"
  train: ['ScanNetSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'gt'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    ScanNetSpatialRefer:
      train:
        sources: ['referit3d']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['chain_gpt', 'chain_template', 'rel2_template', 'relm_template', 'star_template']
        # sgcaption: anno_type: ['gpt']
      val:
        sources: ['referit3d']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['referit3d']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      val:
        sources: ['chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      val:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      val:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      val:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'chain_template', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: True
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: True
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True
# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'ScanFamilyDatasetWrapperOld'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'
# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""
# dataloader details
dataloader:
  # This is a per-gpu batchsize
  batchsize: 256
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False
solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 50
  epochs_per_eval: 1
  lr: 1e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 5000
eval:
  train:
    name: 'ReferIt3DEval'
  val:
    name: 'ReferIt3DEval'
  save: False
# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    # name: 'pointnet_point_encoder'
    # args:
    #   path: None
    #   freeze: False
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: "/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth"
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_loc: 6
    lr: 1e-4
    inter: before
  heads:
    head_list: ['ground_head']
    pretrain_head:
      name: 'PretrainHeadV1'
      args:
        hidden_size: 768
        vocab_size: 30522
    ground_head:
      name: "GroundHeadV1"
      args:
        hidden_size: 384
        input_size: 768
        sem_cls_size: 607
        dropout: 0.3
        detach_all_aux_loss: True
  loss_type: 'ListLoss'
  loss_list: [
    # 'TextObjWithinBatch'
    'og3d_loss'
  ]
  vis_loss_list: [
    # 'TextObjWithinBatch'
    'og3d_loss'
  ]
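Every finetune config assigns a distinct `lr` to the language (1e-5), vision (1e-4) and grounding (1e-4) modules alongside the global `solver.lr`. A sketch of how such per-module rates could translate into AdamW parameter groups; the attribute names are illustrative, and the repository builds its actual groups in optim/optimizer/optim.py:

import torch

def build_optimizer(model, base_lr=1e-4):
    # One parameter group per submodule, each with its own learning rate.
    groups = [
        {"params": model.language.parameters(), "lr": 1e-5},
        {"params": model.vision.parameters(), "lr": 1e-4},
        {"params": model.grounding.parameters(), "lr": 1e-4},
    ]
    return torch.optim.AdamW(groups, lr=base_lr, betas=(0.9, 0.98))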
================================================
FILE: configs/final/multiscan_only.yaml
================================================
###
# MultiScan pretrain from scratch
###
# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False
debug:
  flag: False
  debug_size: 20
  hard_debug: False
logger:
  name: "wandb"
  entity: "bigai-gvl"
# dataset details
data:
  note: "multiscan"
  train: ['MultiScanSpatialRefer']
  val: ['MultiScanSpatialRefer']
  test: ['MultiScanSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'gt'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: False
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['referit3d']
        referit3d:
          anno_type: ['nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        # sgcaption: anno_type: ['gpt']
      val:
        sources: ['referit3d']
        referit3d:
          anno_type: ['nr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['referit3d']
        referit3d:
          anno_type: ['nr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno']
      val:
        sources: ['anno']
      test:
        sources: ['anno']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    S3DSpatialRefer:
      train:
        sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  s3d_base: '/scratch/masaccio/existing_datasets/Structured3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: False
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: False
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True
# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'
# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""
# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False
solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1
eval:
  train:
    name: 'ReferIt3DEval'
  val:
    name: 'ReferIt3DEval'
  save: False
# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: '/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth'
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
    inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
    'lm_cls_loss',
    'TextObjWithinBatch',
    # 'TextObjBetweenBatch',
    'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
    'lm_cls_loss',
    'TextObjWithinBatch',
    # 'TextObjBetweenBatch',
    'TextSceneBetweenBatch'
  ]
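The pretrain configs mask 15% of text tokens (`txt_mask_ratio`) and 10% of object features (`pc_mask_ratio`), applied by the `MaskDatasetWrapper` used as the train wrapper above. A minimal sketch of the text side under the `random` mask strategy; the actual implementation is the wrapper in data/datasets/dataset_wrapper.py and differs in detail:

import random

def mask_tokens(token_ids, mask_token_id, ratio=0.15):
    # -100 marks positions ignored by the masked-LM cross-entropy loss.
    masked, labels = list(token_ids), [-100] * len(token_ids)
    for i, tok in enumerate(token_ids):
        if random.random() < ratio:
            labels[i] = tok            # target: the original token
            masked[i] = mask_token_id  # input: [MASK]
    return masked, labels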
================================================
FILE: configs/final/nr3d_only.yaml
================================================
###
# Nr3D pretrain from scratch
###
# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False
debug:
  flag: False
  debug_size: 20
  hard_debug: False
logger:
  name: "wandb"
  entity: "bigai-gvl"
# dataset details
data:
  note: "nr3d"
  train: ['ScanNetSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'gt'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: False
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['referit3d']
        referit3d:
          anno_type: ['nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        # sgcaption: anno_type: ['gpt']
      val:
        sources: ['referit3d']
        referit3d:
          anno_type: ['nr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['referit3d']
        referit3d:
          anno_type: ['nr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    S3DSpatialRefer:
      train:
        sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  s3d_base: '/scratch/masaccio/existing_datasets/Structured3D'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: False
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: False
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True
# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'
# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""
# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False
solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1
eval:
  train:
    name: 'ReferIt3DEval'
  val:
    name: 'ReferIt3DEval'
  save: False
# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: '/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth'
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
    inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
    'lm_cls_loss',
    'TextObjWithinBatch',
    # 'TextObjBetweenBatch',
    'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
    'lm_cls_loss',
    'TextObjWithinBatch',
    # 'TextObjBetweenBatch',
    'TextSceneBetweenBatch'
  ]
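`freeze: True` on the vision encoder means the object-level PointNet++ checkpoint named in `path` is loaded and then kept fixed during scene-level pretraining. A generic PyTorch sketch of that load-and-freeze step (illustrative only, not the repo's exact loading code):

import torch

def load_and_freeze(module, ckpt_path):
    state = torch.load(ckpt_path, map_location="cpu")
    module.load_state_dict(state, strict=False)  # tolerate naming mismatches
    for p in module.parameters():
        p.requires_grad = False  # exclude from gradient updates
    module.eval()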
================================================
FILE: configs/final/procthor_only.yaml
================================================
###
# ProcTHOR pretrain from scratch
###
# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10
resume: False
debug:
  flag: False
  debug_size: 20
  hard_debug: False
logger:
  name: "wandb"
  entity: "bigai-gvl"
# dataset details
data:
  note: "procthoronly"
  train: ['ProcThorSpatialRefer']
  val: ['ProcThorSpatialRefer']
  test: ['ProcThorSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'gt'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
    ScanNetSpatialRefer:
      train:
        sources: ['scanrefer', 'referit3d', 'sgrefer', 'sgcaption']
        referit3d:
          anno_type: ['sr3d', 'nr3d']
          sr3d_plus_aug: True
        sgrefer:
          anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template']
        # sgcaption: anno_type: ['gpt']
      val:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
      test:
        sources: ['scanrefer']
        referit3d:
          anno_type: ['sr3d'] # 'nr3d', 'sr3d'
          sr3d_plus_aug: False
        sgrefer:
          anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
        sgcaption:
          anno_type: ['gpt']
    RScanSpatialRefer:
      train:
        sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    MultiScanSpatialRefer:
      train:
        sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt']
    ARKitSceneSpatialRefer:
      train:
        sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    HMSpatialRefer:
      train:
        sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    S3DSpatialRefer:
      train:
        sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
      val:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template']
      test:
        sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template']
    ProcThorSpatialRefer:
      train:
        sources: ['rel2_template','relm_template','star_template']
      val:
        sources: ['rel2_template', 'relm_template', 'star_template']
      test:
        sources: ['rel2_template', 'relm_template', 'star_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  s3d_base: '/scratch/masaccio/existing_datasets/Structured3D'
  procthor_base: '/scratch/masaccio/Procthor'
  data_aug:
    aug_list: ['scene_aug']
    scene_aug:
      translation:
        enabled: False
        value: [1.0, 1.0, 1.0]
        p: 1.0
      scaling:
        enabled: False
        p: 1.0
        value: [0.9, 1.1]
      flip:
        enabled: False
        p: 0.5
      rotation:
        enabled: True
        p: 1.0
        axis_align: True
        value: [0.0, 0.0, 1.0]
      shuffle: True
      color_jitter: False
      order_shuffle: False
    obj_aug:
      translation:
        enabled: False
        value: [0.1, 0.1, 0.1]
        p: 1.0
      rotation:
        enabled: False
        p: 1.0
        axis_align: False
        value: [0.0, 0.0, 0.1]
      shuffle: True
      random_jitter:
        enabled: False
        value: 0.01
        accord_to_size: False
        p: 1.0
      pts_shuffle: True
# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'
# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""
# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False
solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1
eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False
# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
      backbone: 'pointnet++'
      hidden_size: 768
      freeze: True
      path: '/scratch/masaccio/results/ALLObjPretrain_b512_Pretrain_ProcThorPretrainObj_1115procthor/2023-11-15-12:02:33.790890/ckpt/best.pth'
      num_attention_heads: 12
      spatial_dim: 5
      num_layers: 4
      dim_loc: 6
      dim_feedforward: 2048
      attn_type: spatial
      pairwise_rel_type: 'center'
      use_matmul_label: False
      lang_type: 'bert'
      lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
    inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
    'lm_cls_loss',
    'TextObjWithinBatch',
    # 'TextObjBetweenBatch',
    'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
    'lm_cls_loss',
    'TextObjWithinBatch',
    # 'TextObjBetweenBatch',
    'TextSceneBetweenBatch'
  ]
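Note that `warmup_steps` in these schedules counts optimizer steps, so the same value covers very different fractions of training depending on dataset size, GPU count and accumulation. A quick back-of-envelope helper for relating the two (illustrative only):

def total_steps(num_samples, batch_per_gpu, num_gpu, grad_accum, epochs):
    # One optimizer step consumes batch_per_gpu * num_gpu * grad_accum samples.
    steps_per_epoch = num_samples // (batch_per_gpu * num_gpu * grad_accum)
    return steps_per_epoch * epochs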
configs/final/s3d_only.yaml ================================================ ### # Structured3D pretrain from scratch ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "s3donly" train: ['ScanNetSpatialRefer'] val: ['S3DSpatialRefer'] test: ['S3DSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'gt' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: True max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] S3DSpatialRefer: train: sources: [ 'rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt' ] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/scratch/masaccio/existing_datasets/scannet" rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' s3d_base: '/scratch/masaccio/existing_datasets/Structured3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: 
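# Note: each scene_aug/obj_aug transform here is switched by its `enabled` flag and, presumably via data/datasets/data_augmentor.py, applied with probability `p`; `value` carries the transform parameters (e.g. the [0.9, 1.1] scaling range).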
enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/results/ALLObjPretrain_b512_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj+S3DPretrainObj_1113scannetws3d/2023-11-14-09:29:10.796592/ckpt/best.pth' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/scanrefer_only.yaml ================================================ ### # ScanRefer pretrain from scratch ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "scanrefer" train: ['ScanNetSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: False max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True 
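# Note (see ScanBase._load_lang in data/datasets/base.py): referit3d utterances longer than 24 tokens are dropped at load time, and `sr3d_plus_aug: True` additionally pulls in annotations/refer/sr3d+.jsonl.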
sgrefer: anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] S3DSpatialRefer: train: sources: [ 'rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt' ] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/scratch/masaccio/existing_datasets/scannet" rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' s3d_base: '/scratch/masaccio/existing_datasets/Structured3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: 
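# AdamW with a warmup_cosine schedule: linear warmup for `warmup_steps`, then cosine decay; `minimum_ratio` presumably floors the learning rate at that fraction of its peak (see optim/scheduler.py).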
name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/scanrefer_only_gttest.yaml ================================================ ### # ScanRefer pretrain from scratch ### # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/home/baoxiong/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "scanrefer" train: ['ScanNetSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'pred' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: False max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'scanrefer' ] referit3d: anno_type: ['sr3d', 'nr3d'] sr3d_plus_aug: True sgrefer: anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['scanrefer'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 
'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] S3DSpatialRefer: train: sources: [ 'rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt' ] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/mnt/fillipo/scratch/masaccio/existing_datasets/scannet" rscan_base: "/mnt/fillipo/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/mnt/fillipo/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/mnt/fillipo/scratch/masaccio/existing_datasets/multiscan' hm_base: '/mnt/fillipo/scratch/masaccio/existing_datasets/HM3D' s3d_base: '/mnt/fillipo/scratch/masaccio/existing_datasets/Structured3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'PretrainEval' val: name: 'ScanReferEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: "" num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/mnt/fillipo/baoxiong/results/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 
num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: configs/final/sr3d_only.yaml ================================================ # Experiment general info name: "FinalOVPretrain" rng_seed: 42 num_gpu: 8 mode: "train" note: "" # Choose keywords to feature your saving directory naming_keywords: ["dataloader.batchsize", "task", "note", "time"] base_dir: "/scratch/masaccio/results" exp_dir: "" save_frequency: 10 resume: False debug: flag: False debug_size: 20 hard_debug: False logger: name: "wandb" entity: "bigai-gvl" # dataset details data: note: "sr3d" train: ['ScanNetSpatialRefer'] val: ['ScanNetSpatialRefer'] test: ['ScanNetSpatialRefer'] args: max_obj_len: 80 max_seq_len: 50 num_points: 1024 pc_type: 'gt' sem_type: '607' filter_lang: False txt_mask_ratio: 0.15 pc_mask_ratio: 0.1 rot_aug: True mask_strategy: random use_scene_cap: False max_scene_cap_len: 300 ScanNetSpatialRefer: train: sources: [ 'referit3d' ] referit3d: anno_type: ['sr3d'] sr3d_plus_aug: True sgrefer: anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] # sgcaption: anno_type: ['gpt'] val: sources: ['referit3d'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt_chain' sgcaption: anno_type: ['gpt'] test: sources: ['referit3d'] referit3d: anno_type: ['sr3d'] # 'nr3d', 'sr3d' sr3d_plus_aug: False sgrefer: anno_type: ['template'] # 'template', 'gpt', 'gpt_chain' sgcaption: anno_type: ['gpt'] RScanSpatialRefer: train: sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] MultiScanSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ] ARKitSceneSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] HMSpatialRefer: train: sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt'] val: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] S3DSpatialRefer: train: sources: [ 'rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt' ] val: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] test: sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ] use_voxel: False scan_family_base: "/scratch/masaccio/existing_datasets/scannet" rscan_base: 
"/scratch/masaccio/existing_datasets/3RScan-base" arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes' multiscan_base: '/scratch/masaccio/existing_datasets/multiscan' hm_base: '/scratch/masaccio/existing_datasets/HM3D' s3d_base: '/scratch/masaccio/existing_datasets/Structured3D' data_aug: aug_list: ['scene_aug'] scene_aug: translation: enabled: False value: [1.0, 1.0, 1.0] p: 1.0 scaling: enabled: False p: 1.0 value: [0.9, 1.1] flip: enabled: False p: 0.5 rotation: enabled: True p: 1.0 axis_align: True value: [0.0, 0.0, 1.0] shuffle: True color_jitter: False order_shuffle: False obj_aug: translation: enabled: False value: [0.1, 0.1, 0.1] p: 1.0 rotation: enabled: False p: 1.0 axis_align: False value: [0.0, 0.0, 0.1] shuffle: True random_jitter: enabled: False value: 0.01 accord_to_size: False p: 1.0 pts_shuffle: True # task details: 'Pretrain', 'scanqa', 'spatialrefer' task: 'Pretrain' # 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper' data_wrapper: train: 'MaskDatasetWrapper' val: 'ScanFamilyDatasetWrapperOld' test: 'ScanFamilyDatasetWrapperOld' # Training details trainer: "OpenVocabTrainer" ckpt_path: "" pretrain_ckpt_path: "" # dataloader details dataloader: batchsize: 64 num_workers: 4 balance_dataset: False filter_empty_annotations: False solver: gradient_accumulation_steps: 1 epochs_per_save: 20 epochs_per_eval: 1 lr: 5e-4 grad_norm: 5.0 epochs: 150 optim: name: 'AdamW' args: betas: [0.9, 0.98] sched: name: 'warmup_cosine' args: warmup_steps: 500 minimum_ratio: 0.1 eval: train: name: 'ReferIt3DEval' val: name: 'ReferIt3DEval' save: False # Model details model: name: OpenVocab language: # This part could be further optimized to be using # huggingface yaml config files name: 'BERTLanguageEncoder' args: weights: 'bert-base-uncased' hidden_size: 768 num_hidden_layers: 4 num_attention_heads: 12 type_vocab_size: 2 lr: 1e-5 vision: name: 'PointOpenVocabEncoder' args: backbone: 'pointnet++' hidden_size: 768 freeze: True path: '/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth' num_attention_heads: 12 spatial_dim: 5 num_layers: 4 dim_loc: 6 dim_feedforward: 2048 attn_type: spatial pairwise_rel_type: 'center' use_matmul_label: False lang_type: 'bert' lang_path: '/scratch/masaccio/607_text_embeddings' lr: 1e-4 grounding: name: 'UnifiedSpatialCrossEncoderV2' args: hidden_size: 768 num_attention_heads: 12 num_layers: 4 dim_feedforward: 2048 dim_loc: 6 lr: 1e-4 inter: before heads: head_list: ['pretrain_head'] pretrain_head: name: 'OVPretrainHead' args: hidden_size: 768 vocab_size: 30522 loss_type: 'ListLoss' loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] vis_loss_list: [ 'lm_cls_loss', 'TextObjWithinBatch', # 'TextObjBetweenBatch', 'TextSceneBetweenBatch' ] ================================================ FILE: data/__init__.py ================================================ from .datasets import * from .data_utils import * ================================================ FILE: data/build.py ================================================ from omegaconf import OmegaConf from torch.utils.data import DataLoader, default_collate, ConcatDataset from fvcore.common.registry import Registry from .datasets.dataset_wrapper import DATASETWRAPPER_REGISTRY DATASET_REGISTRY = Registry("dataset") DATASET_REGISTRY.__doc__ = """ Registry for datasets, which takes a list 
of dataset names and returns a dataset object. Currently it performs similarly to registering dataset loading functions, but it remains an object class for future extension. """ def get_dataset(cfg, split): assert cfg.data.get(split), f"No valid dataset name in {split}." dataset_list = [] print(split, ': ', ', '.join(cfg.data.get(split))) for dataset_name in cfg.data.get(split): _dataset = DATASET_REGISTRY.get(dataset_name)(cfg, split) assert len(_dataset), f"Dataset '{dataset_name}' is empty!" wrapper = cfg.data_wrapper.get(split, cfg.data_wrapper) if not isinstance(cfg.data_wrapper, str) else cfg.data_wrapper _dataset = DATASETWRAPPER_REGISTRY.get(wrapper)(cfg, _dataset, split=split) # Conduct voxelization # TODO: fix voxel config if cfg.data.get('use_voxel', None): _dataset = DATASETWRAPPER_REGISTRY.get('VoxelDatasetWrapper')(cfg, _dataset) dataset_list.append(_dataset) print('='*50) print('Dataset\t\t\tSize') total = sum([len(dataset) for dataset in dataset_list]) for dataset_name, dataset in zip(cfg.data.get(split), dataset_list): print(f'{dataset_name:<20} {len(dataset):>6} ({len(dataset) / total * 100:.1f}%)') print(f'Total\t\t\t{total}') print('='*50) if split == 'train': dataset_list = ConcatDataset(dataset_list) return dataset_list def build_dataloader(cfg, split='train'): """Build the dataloader(s) for one split. For 'train', all datasets are concatenated into a single shuffled DataLoader; for other splits, one DataLoader is built per dataset. Unittest: dataloader_train = build_dataloader(default_cfg, split='train') for _item in dataloader_train: print(_item.keys()) Args: cfg: experiment config (OmegaConf). split (str, optional): dataset split to load. Defaults to 'train'. Returns: a DataLoader, or a list of DataLoaders for non-train splits with multiple datasets. """ if split == 'train': dataset = get_dataset(cfg, split) return DataLoader(dataset, batch_size=cfg.dataloader.batchsize, num_workers=cfg.dataloader.num_workers, collate_fn=getattr(dataset.datasets[0], 'collate_fn', default_collate), pin_memory=True, # TODO: Test speed # prefetch_factor=2 if not cfg.debug.flag else None, persistent_workers=True if not cfg.debug.flag else None, shuffle=True, drop_last=True) else: loader_list = [] for dataset in get_dataset(cfg, split): loader_list.append( DataLoader(dataset, batch_size=cfg.dataloader.get('batchsize_eval', cfg.dataloader.batchsize), num_workers=cfg.dataloader.num_workers, collate_fn=getattr(dataset, 'collate_fn', default_collate), pin_memory=True, # TODO: Test speed # prefetch_factor=2 if not cfg.debug.flag else None, persistent_workers=True if not cfg.debug.flag else None, shuffle=False)) # TODO: temporary solution for backward compatibility.
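# A single non-train dataset yields a bare DataLoader; multiple datasets yield a list.
# Minimal usage sketch (hypothetical config path; the actual entry points are run.py / launch.py):
#   cfg = OmegaConf.load('configs/final/scanrefer_only.yaml')
#   val_loader = build_dataloader(cfg, split='val')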
if len(loader_list) == 1: return loader_list[0] else: return loader_list if __name__ == '__main__': pass ================================================ FILE: data/data_utils.py ================================================ import random import csv from collections import Counter import re import numpy as np import torch from data.datasets.constant import VALID_CLASS_IDS_200 def per_scene_pad(lang_list, max_len=64, tokenizer=None, max_seq_len=50): """ @param lang_list: lang json for all sentences, must include scan_id in the json element @param max_len: number of sentences per padded group, default is 64 @return: a list of lists, with each element in the list containing max_len number of sentences corresponding to one scene """ scene_list = {} if tokenizer is not None: for key in ["utterance", "question", "description"]: if key in lang_list[0].keys(): encoded_input = tokenizer( [item[key] for item in lang_list], padding="max_length", truncation=True, max_length=max_seq_len ) lang_list = [ { k : (v, encoded_input["input_ids"][idx], encoded_input["attention_mask"][idx]) if k == key else v for k, v in item.items() } for idx, item in enumerate(lang_list) ] for item in lang_list: scan_id = item["scan_id"] if scan_id not in scene_list.keys(): scene_list[scan_id] = [item] else: scene_list[scan_id].append(item) final_list = [] for key, value in scene_list.items(): for index in range(0, len(value), max_len): if index + max_len < len(value): final_list.append(value[index:index + max_len]) else: content = value[index:] sampled = random.choices(content, k=max_len) final_list.append(sampled) return final_list def merge_tokens(token1, mask1, token2, mask2, max_len=300, tokenizer=None): assert len(token1) > len(token2), "not appendable" assert tokenizer is not None, "should pass in a tokenizer" len_token1 = sum(mask1) - 1 # drop the trailing [SEP] len_token2 = sum(mask2) - 1 # skip the leading [CLS] insert_length = min(max_len - len_token1, len_token2) token1[len_token1: len_token1 + insert_length] = token2[1: 1 + insert_length] mask1[len_token1: len_token1 + insert_length] = mask2[1: 1 + insert_length] if token1[sum(mask1) - 1] != tokenizer.sep_token_id: token1[sum(mask1) - 1] = tokenizer.sep_token_id return token1, mask1 def convert_pc_to_box(obj_pc): xmin = np.min(obj_pc[:,0]) ymin = np.min(obj_pc[:,1]) zmin = np.min(obj_pc[:,2]) xmax = np.max(obj_pc[:,0]) ymax = np.max(obj_pc[:,1]) zmax = np.max(obj_pc[:,2]) center = [(xmin+xmax)/2, (ymin+ymax)/2, (zmin+zmax)/2] box_size = [xmax-xmin, ymax-ymin, zmax-zmin] return center, box_size # input txt_ids, txt_masks def random_word(tokens, tokens_mask, tokenizer, mask_ratio): output_label = [] output_tokens = tokens.clone() for i, token in enumerate(tokens): if tokens_mask[i] == 0: output_label.append(-1) else: prob = random.random() # mask token with probability mask_ratio if prob < mask_ratio: prob /= mask_ratio # 80% randomly change token to mask token if prob < 0.8: output_tokens[i] = tokenizer.mask_token_id # 10% randomly change token to random token elif prob < 0.9: output_tokens[i] = random.choice(list(tokenizer.vocab.items()))[1] # -> rest 10% randomly keep current token # append current token to output (we will predict these later) output_label.append(token.item()) else: # no masking token (will be ignored by loss function later) output_label.append(-1) output_label = torch.Tensor(output_label).long() return output_tokens, output_label def random_point_cloud(pcd, pcd_mask, mask_ratio): assert len(pcd) == len(pcd_mask) output_mask = [] for i in range(len(pcd)): if
pcd_mask[i] == 0: output_mask.append(0) else: prob = random.random() if prob < mask_ratio: output_mask.append(0) else: output_mask.append(1) output_mask = torch.tensor(output_mask, dtype=torch.bool) return output_mask class LabelConverter(object): def __init__(self, file_path): self.raw_name_to_id = {} self.nyu40id_to_id = {} self.nyu40_name_to_id = {} self.scannet_name_to_scannet_id = {'cabinet':0, 'bed':1, 'chair':2, 'sofa':3, 'table':4, 'door':5, 'window':6,'bookshelf':7,'picture':8, 'counter':9, 'desk':10, 'curtain':11, 'refrigerator':12, 'shower curtain':13, 'toilet':14, 'sink':15, 'bathtub':16, 'others':17} self.id_to_scannetid = {} self.scannet_raw_id_to_raw_name = {} with open(file_path, encoding='utf-8') as fd: rd = list(csv.reader(fd, delimiter="\t", quotechar='"')) for i in range(1, len(rd)): raw_id = i - 1 scannet_raw_id = int(rd[i][0]) raw_name = rd[i][1] nyu40_id = int(rd[i][4]) nyu40_name = rd[i][7] self.raw_name_to_id[raw_name] = raw_id self.scannet_raw_id_to_raw_name[scannet_raw_id] = raw_name self.nyu40id_to_id[nyu40_id] = raw_id self.nyu40_name_to_id[nyu40_name] = raw_id if nyu40_name not in self.scannet_name_to_scannet_id: self.id_to_scannetid[raw_id] = self.scannet_name_to_scannet_id['others'] else: self.id_to_scannetid[raw_id] = self.scannet_name_to_scannet_id[nyu40_name] ### add instance id from org image to pth file self.orgInstID_to_id = {id : id - 1 for id in range(1, 257)} self.orgInstID_to_id[0] = -100 # add map for scannet 200 self.scannet_raw_id_to_scannet200_id = {} self.scannet200_id_to_scannet_raw_id = {} for v, k in enumerate(VALID_CLASS_IDS_200): self.scannet_raw_id_to_scannet200_id[k] = v self.scannet200_id_to_scannet_raw_id[v] = k def build_rotate_mat(split, rot_aug=True, rand_angle='axis'): if rand_angle == 'random': theta = np.random.rand() * np.pi * 2 else: ROTATE_ANGLES = [0, np.pi/2, np.pi, np.pi*3/2] theta_idx = np.random.randint(len(ROTATE_ANGLES)) theta = ROTATE_ANGLES[theta_idx] if (theta is not None) and (theta != 0) and (split == 'train') and rot_aug: rot_matrix = np.array([ [np.cos(theta), -np.sin(theta), 0], [np.sin(theta), np.cos(theta), 0], [0, 0, 1] ], dtype=np.float32) else: rot_matrix = None return rot_matrix def eval_ref_one_sample(pred_bbox, gt_bbox): """ Evaluate one reference prediction Args: pred_bbox: 8 corners of prediction bounding box, (8, 3) gt_bbox: 8 corners of ground truth bounding box, (8, 3) Returns: iou: intersection over union score """ iou = box3d_iou(pred_bbox, gt_bbox) return iou def get_box3d_min_max(corner): ''' Compute min and max coordinates for 3D bounding box Note: only for axis-aligned bounding boxes Input: corner: numpy array (8,3), assume up direction is Z Output: the min and max coordinates of the 3D bounding box along each axis ''' min_coord = corner.min(axis=0) max_coord = corner.max(axis=0) x_min, x_max = min_coord[0], max_coord[0] y_min, y_max = min_coord[1], max_coord[1] z_min, z_max = min_coord[2], max_coord[2] return x_min, x_max, y_min, y_max, z_min, z_max def box3d_iou(corners1, corners2): ''' Compute 3D bounding box IoU.
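Both boxes are treated as axis-aligned; the IoU is the intersection volume divided by the union volume of the two boxes.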
Input: corners1: numpy array (8,3), assume up direction is Z corners2: numpy array (8,3), assume up direction is Z Output: iou: 3D bounding box IoU ''' x_min_1, x_max_1, y_min_1, y_max_1, z_min_1, z_max_1 = get_box3d_min_max(corners1) x_min_2, x_max_2, y_min_2, y_max_2, z_min_2, z_max_2 = get_box3d_min_max(corners2) xA = np.maximum(x_min_1, x_min_2) yA = np.maximum(y_min_1, y_min_2) zA = np.maximum(z_min_1, z_min_2) xB = np.minimum(x_max_1, x_max_2) yB = np.minimum(y_max_1, y_max_2) zB = np.minimum(z_max_1, z_max_2) inter_vol = np.maximum((xB - xA), 0) * np.maximum((yB - yA), 0) * np.maximum((zB - zA), 0) box_vol_1 = (x_max_1 - x_min_1) * (y_max_1 - y_min_1) * (z_max_1 - z_min_1) box_vol_2 = (x_max_2 - x_min_2) * (y_max_2 - y_min_2) * (z_max_2 - z_min_2) iou = inter_vol / (box_vol_1 + box_vol_2 - inter_vol + 1e-8) return iou def transform_points(points, transform, translate=True, mode="numpy"): """ Apply linear transform to a np array of points. Args: points (np array [..., 3]): Points to transform. transform (np array [3, 4] or [4, 4]): Linear map. translate (bool): If false, do not apply translation component of transform. Returns: transformed points (np array [..., 3]) """ # Append ones or zeros to get homogeneous coordinates if translate: if mode == "numpy": constant_term = np.ones_like(points[..., :1]) else: constant_term = torch.ones_like(points[..., :1]) else: if mode == "numpy": constant_term = np.zeros_like(points[..., :1]) else: constant_term = torch.zeros_like(points[..., :1]) if mode == "numpy": points = np.concatenate((points, constant_term), axis=-1) points = np.einsum('nm,...m->...n', transform, points) else: points = torch.cat((points, constant_term), dim=-1) points = torch.einsum('...nm,...m->...n', transform, points) return points[..., :3] def construct_bbox_corners(center, box_size): sx, sy, sz = box_size x_corners = [sx/2, sx/2, -sx/2, -sx/2, sx/2, sx/2, -sx/2, -sx/2] y_corners = [sy/2, -sy/2, -sy/2, sy/2, sy/2, -sy/2, -sy/2, sy/2] z_corners = [sz/2, sz/2, sz/2, sz/2, -sz/2, -sz/2, -sz/2, -sz/2] corners_3d = np.vstack([x_corners, y_corners, z_corners]) corners_3d[0,:] = corners_3d[0,:] + center[0] corners_3d[1,:] = corners_3d[1,:] + center[1] corners_3d[2,:] = corners_3d[2,:] + center[2] corners_3d = np.transpose(corners_3d) return corners_3d def is_explicitly_view_dependent(tokens): """ :return: True if any token is an explicitly view-dependent word """ target_words = {'front', 'behind', 'back', 'right', 'left', 'facing', 'leftmost', 'rightmost', 'looking', 'across'} for token in tokens: if token in target_words: return True return False class ScanQAAnswer(object): def __init__(self, answers=None, unk_token='', ignore_idx=-100): if answers is None: answers = [] self.unk_token = unk_token self.ignore_idx = ignore_idx self.vocab = {x: i for i, x in enumerate(answers)} self.rev_vocab = dict((v, k) for k, v in self.vocab.items()) def itos(self, i): if i == self.ignore_idx: return self.unk_token return self.rev_vocab[i] def stoi(self, v): if v not in self.vocab: return self.ignore_idx return self.vocab[v] def __len__(self): return len(self.vocab) class SQA3DAnswer(object): def __init__(self, answers=None, unk_token='u'): if answers is None: answers = [] self.vocab = {x: i for i, x in enumerate(answers)} self.rev_vocab = dict((v, k) for k, v in self.vocab.items()) self.unk_token = unk_token self.ignore_idx = self.vocab['u'] def itos(self, i): if i == self.ignore_idx: return self.unk_token return self.rev_vocab[i] def stoi(self, v): if v not in self.vocab: return self.ignore_idx return self.vocab[v] def
__len__(self): return len(self.vocab) def load_matrix_from_txt(path, shape=(4, 4)): with open(path) as f: txt = f.readlines() txt = ''.join(txt).replace('\n', ' ') matrix = [float(v) for v in txt.split()] return np.array(matrix).reshape(shape) def pad_tensors(tensors, lens=None, pad=0): assert tensors.shape[0] <= lens if tensors.shape[0] == lens: return tensors shape = list(tensors.shape) shape[0] = lens - shape[0] res = torch.ones(shape, dtype=tensors.dtype) * pad res = torch.cat((tensors, res), dim=0) return res def get_sqa_question_type(question): question = question.lstrip() if question[:4].lower() == 'what': return 0 elif question[:2].lower() == 'is': return 1 elif question[:3].lower() == 'how': return 2 elif question[:3].lower() == 'can': return 3 elif question[:5].lower() == 'which': return 4 else: return 5 # others class Vocabulary(object): def __init__(self, path=None): self.vocab = {} self.id_to_vocab = {} self.id_to_bert = {} if path is not None: load_dict = torch.load(path) self.vocab = load_dict['vocab'] self.id_to_vocab = load_dict['id_to_vocab'] self.id_to_bert = load_dict['id_to_bert'] def add_token(self, token, bert_id): if token in self.vocab.keys(): return id = len(self.vocab) self.vocab[token] = id self.id_to_vocab[id] = token self.id_to_bert[id] = bert_id def token_to_id(self, token): return self.vocab[token] def id_to_token(self, id): return self.id_to_vocab[id] def id_to_bert_id(self, id): return self.id_to_bert[id] def save_vocab(self, path): save_dict = {'vocab': self.vocab, "id_to_vocab": self.id_to_vocab, "id_to_bert": self.id_to_bert} torch.save(save_dict, path) def random_caption_word(tokens, tokens_mask, tokenizer, vocab, mask_ratio): output_label = [] output_tokens = tokens.clone() for i, token in enumerate(tokens): # 101 cls 102 sep use them as SOS and EOS token if tokens_mask[i] == 0 or token == 101: output_label.append(-1) elif token == 102: output_tokens[i] = tokenizer.mask_token_id output_label.append(vocab.token_to_id('[EOS]')) else: prob = random.random() # mask token with 15% probability if prob < mask_ratio: output_tokens[i] = tokenizer.mask_token_id output_label.append(vocab.token_to_id(tokenizer.decode([tokens[i]]))) else: # no masking token (will be ignored by loss function later) output_label.append(-1) output_label = torch.Tensor(output_label).long() return output_tokens, output_label def clean_answer(data): data = data.lower() data = re.sub('[ ]+$' ,'', data) data = re.sub('^[ ]+' ,'', data) data = re.sub(' {2,}', ' ', data) data = re.sub('\.[ ]{2,}', '. 
', data) data = re.sub('[^a-zA-Z0-9,\'\s\-:]+', '', data) data = re.sub('ç' ,'c', data) data = re.sub('’' ,'\'', data) data = re.sub(r'\bletf\b' ,'left', data) data = re.sub(r'\blet\b' ,'left', data) data = re.sub(r'\btehre\b' ,'there', data) data = re.sub(r'\brigth\b' ,'right', data) data = re.sub(r'\brght\b' ,'right', data) data = re.sub(r'\bbehine\b', 'behind', data) data = re.sub(r'\btv\b' ,'TV', data) data = re.sub(r'\bchai\b' ,'chair', data) data = re.sub(r'\bwasing\b' ,'washing', data) data = re.sub(r'\bwaslked\b' ,'walked', data) data = re.sub(r'\boclock\b' ,'o\'clock', data) data = re.sub(r'\bo\'[ ]+clock\b' ,'o\'clock', data) # digit to word, only for answer data = re.sub(r'\b0\b', 'zero', data) data = re.sub(r'\bnone\b', 'zero', data) data = re.sub(r'\b1\b', 'one', data) data = re.sub(r'\b2\b', 'two', data) data = re.sub(r'\b3\b', 'three', data) data = re.sub(r'\b4\b', 'four', data) data = re.sub(r'\b5\b', 'five', data) data = re.sub(r'\b6\b', 'six', data) data = re.sub(r'\b7\b', 'seven', data) data = re.sub(r'\b8\b', 'eight', data) data = re.sub(r'\b9\b', 'nine', data) data = re.sub(r'\b10\b', 'ten', data) data = re.sub(r'\b11\b', 'eleven', data) data = re.sub(r'\b12\b', 'twelve', data) data = re.sub(r'\b13\b', 'thirteen', data) data = re.sub(r'\b14\b', 'fourteen', data) data = re.sub(r'\b15\b', 'fifteen', data) data = re.sub(r'\b16\b', 'sixteen', data) data = re.sub(r'\b17\b', 'seventeen', data) data = re.sub(r'\b18\b', 'eighteen', data) data = re.sub(r'\b19\b', 'nineteen', data) data = re.sub(r'\b20\b', 'twenty', data) data = re.sub(r'\b23\b', 'twenty-three', data) # misc # no1, mat2, etc data = re.sub(r'\b([a-zA-Z]+)([0-9])\b' ,r'\g<1>', data) data = re.sub(r'\ba\b ([a-zA-Z]+)' ,r'\g<1>', data) data = re.sub(r'\ban\b ([a-zA-Z]+)' ,r'\g<1>', data) data = re.sub(r'\bthe\b ([a-zA-Z]+)' ,r'\g<1>', data) data = re.sub(r'\bbackwards\b', 'backward', data) return data if __name__ == "__main__": path = "/home/baoxiong/Desktop/gpt_gen_language.json" import json with open(path, "r") as f: data = json.load(f) padded = per_scene_pad(data) print(padded) ================================================ FILE: data/datasets/__init__.py ================================================ from .scannet import * from .rscan import * from .arkitscene import * from .hm import * from .multiscan import * from .procthor import * from .structure3d import * ================================================ FILE: data/datasets/arkitscene.py ================================================ import collections from ..build import DATASET_REGISTRY from .base import ScanBase @DATASET_REGISTRY.register() class ARKitScenePretrainObj(ScanBase): def __init__(self, cfg, split): super(ARKitScenePretrainObj, self).__init__(cfg, split) self.base_dir = cfg.data.arkitscene_base self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False) if self.load_scene_pcds: self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None) assert self.max_pcd_num_points is not None self.bg_points_num = cfg.data.args.get('bg_points_num', 1000) self.scan_ids = sorted(list(self._load_split(self.split))) if cfg.debug.flag and cfg.debug.debug_size != -1: self.scan_ids = self.scan_ids[:cfg.debug.debug_size] print(f"Loading ARKitScene {split}-set scans") self.scan_data = self._load_scan(self.scan_ids) self.scan_ids = sorted(list(self.scan_data.keys())) print(f"Finish loading ARKitScene {split}-set scans of length {len(self.scan_ids)}") def __len__(self): return len(self.scan_ids) def __getitem__(self, index): """Data dict 
post-processing, for example, filtering, crop, normalization, rotation, etc. Args: index (int): index of the scan. """ data_dict = self._getitem_obj_pretrain(index) dataset = 'arkitscene' data_dict['source'] = dataset return data_dict @DATASET_REGISTRY.register() class ARKitSceneSpatialRefer(ScanBase): def __init__(self, cfg, split): super(ARKitSceneSpatialRefer, self).__init__(cfg, split) self.base_dir = cfg.data.arkitscene_base self.max_obj_len = cfg.data.args.max_obj_len - 1 self.filter_lang = cfg.data.args.filter_lang self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False) if self.load_scene_pcds: self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None) assert self.max_pcd_num_points is not None self.bg_points_num = cfg.data.args.get('bg_points_num', 1000) split_cfg = cfg.data.get(self.__class__.__name__).get(split) all_scan_ids = self._load_split(self.split) print(f"Loading ARKitScene {split}-set language") self.lang_data, self.scan_ids = self._load_lang(split_cfg, all_scan_ids) print(f"Finish loading ARKitScene {split}-set language of size {self.__len__()}") print(f"Loading ARKitScene {split}-set scans") self.scan_data = self._load_scan(self.scan_ids) print(f"Finish loading ARKitScene {split}-set scans") # build unique multiple look up for scan_id in self.scan_ids: inst_labels = self.scan_data[scan_id]['inst_labels'] self.scan_data[scan_id]['label_count'] = collections.Counter( [l for l in inst_labels]) self.scan_data[scan_id]['label_count_multi'] = collections.Counter( [self.label_converter.id_to_scannetid[l] for l in inst_labels]) def __len__(self): return len(self.lang_data) def __getitem__(self, index): """Data dict post-processing, for example, filtering, crop, normalization, rotation, etc. Args: index (int): index of the language item. """ data_dict = self._getitem_refer(index) return data_dict ================================================ FILE: data/datasets/base.py ================================================ import os import copy import json import jsonlines import random from tqdm import tqdm import numpy as np from scipy import sparse import torch from torch.utils.data import Dataset from ..data_utils import LabelConverter, build_rotate_mat from ..data_utils import convert_pc_to_box, construct_bbox_corners, \ merge_tokens, eval_ref_one_sample, is_explicitly_view_dependent from .data_augmentor import DataAugmentor from .constant import CLASS_LABELS_200 class ScanBase(Dataset): def __init__(self, cfg, split): self.cfg = cfg self.split = split self.pc_type = cfg.data.args.pc_type self.max_obj_len = cfg.data.args.max_obj_len self.num_points = cfg.data.args.num_points self.rot_aug = cfg.data.args.rot_aug self.aug_cfg = getattr(cfg, 'data_aug', None) self.debug = cfg.debug.flag self.debug_size = cfg.debug.debug_size self.subset_ratio = getattr(cfg.data.args, 'subset_ratio', 0) if self.aug_cfg: self.augmentor = DataAugmentor(self.aug_cfg, self.split) self.scannet_dir = cfg.data.scan_family_base assert self.split in ['train', 'val', 'test'] if self.split == 'train': self.pc_type = 'gt' # TODO: hack test split to be the same as val if self.split == 'test': self.split = 'val' self.int2cat = json.load(open(os.path.join(self.scannet_dir, "annotations/meta_data/scannetv2_raw_categories.json"), 'r', encoding="utf-8")) self.cat2int = {w: i for i, w in enumerate(self.int2cat)} self.label_converter = LabelConverter(os.path.join(self.scannet_dir, "annotations/meta_data/scannetv2-labels.combined.tsv")) self.use_scene_cap = getattr(cfg.data.args, 'use_scene_cap', False) def
_load_split(self, split): # TODO: temporarily reproducing # split_file = os.path.join(self.base_dir, 'annotations/splits/'+ split + "_split_non_overlap.txt") if 'scannet' in self.__class__.__name__.lower(): split_file = os.path.join(self.base_dir, 'annotations/splits/scannetv2_'+ split + ".txt") else: split_file = os.path.join(self.base_dir, 'annotations/splits/'+ split + "_split.txt") scan_ids = {x.strip() for x in open(split_file, 'r', encoding="utf-8")} scan_ids = sorted(scan_ids) return scan_ids def _load_scan(self, scan_ids, filter_bkg=False): scans = {} for scan_id in tqdm(scan_ids): pcd_path = os.path.join(self.base_dir, 'scan_data', 'pcd_with_global_alignment', f'{scan_id}.pth') inst2label_path = os.path.join(self.base_dir, 'scan_data', 'instance_id_to_label', f'{scan_id}.pth') if not os.path.exists(pcd_path): continue pcd_data = torch.load(pcd_path) points, colors, instance_labels = pcd_data[0], pcd_data[1], pcd_data[-1] colors = colors / 127.5 - 1 pcds = np.concatenate([points, colors], 1) # build obj_pcds inst_to_label = torch.load(inst2label_path) obj_pcds = [] inst_ids = [] inst_labels = [] bg_indices = np.full((points.shape[0], ), 1, dtype=np.bool_) for inst_id in inst_to_label.keys(): if inst_to_label[inst_id] in self.cat2int.keys(): mask = instance_labels == inst_id if np.sum(mask) == 0: continue obj_pcds.append(pcds[mask]) inst_ids.append(inst_id) inst_labels.append(self.cat2int[inst_to_label[inst_id]]) if inst_to_label[inst_id] not in ['wall', 'floor', 'ceiling']: bg_indices[mask] = False if filter_bkg: selected_obj_idxs = [i for i, obj_label in enumerate(inst_labels) if (self.int2cat[obj_label] not in ['wall', 'floor', 'ceiling'])] if len(selected_obj_idxs) == 0: continue scans[scan_id] = {} # scans[scan_id]['scene_pcds'] = pcds scans[scan_id]['obj_pcds'] = obj_pcds scans[scan_id]['inst_labels'] = inst_labels scans[scan_id]['inst_ids'] = inst_ids scans[scan_id]['bg_pcds'] = pcds[bg_indices] # calculate box for matching obj_center = [] obj_box_size = [] for obj_pcd in obj_pcds: _c, _b = convert_pc_to_box(obj_pcd) obj_center.append(_c) obj_box_size.append(_b) scans[scan_id]['obj_center'] = obj_center scans[scan_id]['obj_box_size'] = obj_box_size # load pred pcds obj_mask_path = os.path.join(self.base_dir, "mask", str(scan_id) + ".mask" + ".npz") if os.path.exists(obj_mask_path): obj_label_path = os.path.join(self.base_dir, "mask", str(scan_id) + ".label" + ".npy") obj_pcds = [] obj_mask = np.array(sparse.load_npz(obj_mask_path).todense())[:50, :] obj_labels = np.load(obj_label_path)[:50] obj_l = [] bg_indices = np.full((pcds.shape[0], ), 1, dtype=np.bool_) for i in range(obj_mask.shape[0]): mask = obj_mask[i] if pcds[mask == 1, :].shape[0] > 0: obj_pcds.append(pcds[mask == 1, :]) obj_l.append(obj_labels[i]) # if not self.int2cat[obj_labels[i]] in ['wall', 'floor', 'ceiling']: bg_indices[mask == 1] = False scans[scan_id]['obj_pcds_pred'] = obj_pcds scans[scan_id]['inst_labels_pred'] = obj_l scans[scan_id]['bg_pcds_pred'] = pcds[bg_indices] # calculate box for pred obj_center_pred = [] obj_box_size_pred = [] for obj_pcd in obj_pcds: _c, _b = convert_pc_to_box(obj_pcd) obj_center_pred.append(_c) obj_box_size_pred.append(_b) scans[scan_id]['obj_center_pred'] = obj_center_pred scans[scan_id]['obj_box_size_pred'] = obj_box_size_pred return scans def _load_lang(self, cfg, scan_ids): caption_source = cfg.sources json_data = [] lang_data = [] valid_scan_ids = [] if self.use_scene_cap: scene_cap_file = os.path.join(self.base_dir, 'annotations/scene_cap.json') if not 
os.path.exists(scene_cap_file): self.scene_caps = {} else: with open(scene_cap_file, 'r') as f: self.scene_caps = json.load(f) else: self.scene_caps = None for anno_type in caption_source: if anno_type == 'anno': anno_file = os.path.join(self.base_dir, 'annotations/anno.json') json_data.extend(json.load(open(anno_file, 'r', encoding='utf-8'))) elif anno_type == 'referit3d': for anno_type in cfg.referit3d.anno_type: anno_file = os.path.join(self.base_dir, f'annotations/refer/{anno_type}.jsonl') with jsonlines.open(anno_file, 'r') as _f: for item in _f: if len(item['tokens']) <= 24: json_data.append(item) if cfg.referit3d.sr3d_plus_aug: anno_file = os.path.join(self.base_dir, 'annotations/refer/sr3d+.jsonl') with jsonlines.open(anno_file, 'r') as _f: for item in _f: if len(item['tokens']) <= 24: json_data.append(item) elif anno_type == 'scanrefer': anno_file = os.path.join(self.base_dir, 'annotations/refer/scanrefer.jsonl') with jsonlines.open(anno_file, 'r') as _f: for item in _f: json_data.append(item) elif anno_type == 'sgrefer': for anno_type in cfg.sgrefer.anno_type: anno_file = os.path.join(self.base_dir, f'annotations/refer/ssg_ref_{anno_type}.json') json_data.extend(json.load(open(anno_file, 'r', encoding='utf-8'))) elif anno_type == 'sgcaption': for anno_type in cfg.sgcaption.anno_type: anno_file = os.path.join(self.base_dir, f'annotations/refer/ssg_obj_caption_{anno_type}.json') json_data.extend(json.load(open(anno_file, 'r', encoding='utf-8'))) else: if 'obj_caption' in anno_type: anno_file = os.path.join(self.base_dir, f'annotations/ssg_{anno_type}.json') else: anno_file = os.path.join(self.base_dir, f'annotations/ssg_ref_{anno_type}.json') json_data.extend(json.load(open(anno_file, 'r', encoding='utf-8'))) for item in json_data: if item['scan_id'] in scan_ids and item['instance_type'] not in ['wall', 'floor', 'ceiling']: lang_data.append(item) if item['scan_id'] not in valid_scan_ids: valid_scan_ids.append(item['scan_id']) valid_scan_ids = sorted(valid_scan_ids) if self.subset_ratio > 0: valid_scan_ids = valid_scan_ids[:int(self.subset_ratio*len(valid_scan_ids))] lang_data = [item for item in lang_data if item['scan_id'] in valid_scan_ids] if self.debug and self.debug_size != -1: valid_scan_ids = valid_scan_ids[:self.debug_size] lang_data = [item for item in lang_data if item['scan_id'] in valid_scan_ids] return lang_data, valid_scan_ids def _getitem_pretrain(self, index, is_rscan=False): item = self.lang_data[index] scan_id = item['scan_id'] if is_rscan and hasattr(item, 'sentence'): sentence = item['sentence'] else: sentence = item['utterance'] # load pcds and labels if self.pc_type == 'gt': obj_pcds = self.scan_data[scan_id]['obj_pcds'] # N, 6 obj_labels = self.scan_data[scan_id]['inst_labels'] # N elif self.pc_type == 'pred': obj_pcds = self.scan_data[scan_id]['obj_pcds_pred'] obj_labels = self.scan_data[scan_id]['inst_labels_pred'] # filter out background selected_obj_idxs = [i for i, obj_label in enumerate(obj_labels) if (self.int2cat[obj_label] not in ['wall', 'floor', 'ceiling'])] obj_pcds = [obj_pcds[id] for id in selected_obj_idxs] obj_labels = [obj_labels[id] for id in selected_obj_idxs] # crop objects if self.max_obj_len < len(obj_pcds): remained_obj_idx = [i for i in range(len(obj_pcds))] random.shuffle(remained_obj_idx) selected_obj_idxs = remained_obj_idx[:self.max_obj_len] # reorganize ids obj_pcds = [obj_pcds[i] for i in selected_obj_idxs] obj_labels = [obj_labels[i] for i in selected_obj_idxs] assert len(obj_pcds) == self.max_obj_len if not self.aug_cfg: 
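# Without a data_aug config, _obj_processing_post presumably subsamples each object point cloud to `num_points` and normalizes it, applying a random rotation when `rot_aug` is set; with a data_aug config, _obj_processing_aug routes the objects through DataAugmentor instead.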
obj_fts, obj_locs, _, obj_labels = self._obj_processing_post(obj_pcds, obj_labels, is_need_bbox=True, rot_aug=self.rot_aug) else: obj_fts, obj_locs, _, obj_labels = self._obj_processing_aug(obj_pcds, obj_labels, is_need_bbox=True) data_dict = {'scan_id': scan_id, 'sentence': sentence, 'obj_fts': obj_fts, 'obj_locs': obj_locs, 'obj_labels': obj_labels} return data_dict def _getitem_obj_pretrain(self, index): scan_id = self.scan_ids[index] sentence = 'placeholder' # load pcds and labels if self.pc_type == 'gt': obj_pcds = self.scan_data[scan_id]['obj_pcds'] # N, 6 obj_labels = self.scan_data[scan_id]['inst_labels'] # N elif self.pc_type == 'pred': obj_pcds = self.scan_data[scan_id]['obj_pcds_pred'] obj_labels = self.scan_data[scan_id]['inst_labels_pred'] # filter out background selected_obj_idxs = [i for i, obj_label in enumerate(obj_labels) if (self.int2cat[obj_label] in CLASS_LABELS_200) and (self.int2cat[obj_label] not in ['wall', 'floor', 'ceiling'])] obj_pcds = [obj_pcds[id] for id in selected_obj_idxs] obj_labels = [obj_labels[id] for id in selected_obj_idxs] # crop objects if self.max_obj_len < len(obj_pcds): remained_obj_idx = [i for i in range(len(obj_pcds))] random.shuffle(remained_obj_idx) selected_obj_idxs = remained_obj_idx[:self.max_obj_len] # reorganize ids obj_pcds = [obj_pcds[i] for i in selected_obj_idxs] obj_labels = [obj_labels[i] for i in selected_obj_idxs] assert len(obj_pcds) == self.max_obj_len if not self.load_scene_pcds: if not self.aug_cfg: obj_fts, obj_locs, _, obj_labels = self._obj_processing_post(obj_pcds, obj_labels, is_need_bbox=True, rot_aug=self.rot_aug) else: obj_fts, obj_locs, _, obj_labels = self._obj_processing_aug(obj_pcds, obj_labels, is_need_bbox=True) else: assert self.aug_cfg bg_pcds = self.scan_data[scan_id]['bg_pcds'] obj_locs, _, obj_labels, obj_pcds_masks, scene_pcds = self._scene_processing_aug(obj_pcds, bg_pcds, obj_labels, is_need_bbox=True) if not self.load_scene_pcds: data_dict = {'scan_id': scan_id, 'sentence': sentence, 'obj_fts': obj_fts, 'obj_locs': obj_locs, 'obj_labels': obj_labels} else: data_dict = {'scan_id': scan_id, 'sentence': sentence, 'obj_locs': obj_locs, 'obj_labels': obj_labels, 'obj_pcds_masks': obj_pcds_masks, 'scene_pcds': scene_pcds} return data_dict def _getitem_refer(self, index): item = self.lang_data[index] item_id = item['item_id'] scan_id = item['scan_id'] tgt_object_instance = int(item['target_id']) tgt_object_name = item['instance_type'] sentence = item['utterance'] is_view_dependent = is_explicitly_view_dependent(item['utterance'].split(' ')) if self.use_scene_cap: scene_caps = self.scene_caps.get(scan_id) if scene_caps is not None: scene_caps = scene_caps['captions'] scene_cap = scene_caps[np.random.choice(len(scene_caps))] else: scene_cap = "This is a scene." 
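# For pc_type == 'pred', predicted segments carry no ground-truth instance labels; the matching below transfers a label from the first GT box whose axis-aligned 3D IoU with the predicted box reaches 0.25.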
# load pcds and labels if self.pc_type == 'gt': obj_pcds = self.scan_data[scan_id]['obj_pcds'] # N, 6 obj_labels = self.scan_data[scan_id]['inst_labels'] # N obj_ids = self.scan_data[scan_id]['inst_ids'] # N assert tgt_object_instance in obj_ids, str(tgt_object_instance) + ' not in ' + str(obj_ids) + '-' + scan_id tgt_object_id = obj_ids.index(tgt_object_instance) elif self.pc_type == 'pred': obj_pcds = self.scan_data[scan_id]['obj_pcds_pred'] # obj_labels = self.scan_data[scan_id]['inst_labels_pred'] # obj_ids = self.scan_data[scan_id]['inst_ids_pred'] # N obj_labels = self.scan_data[scan_id]['inst_labels_pred'] # get obj labels by matching gt_obj_labels = self.scan_data[scan_id]['inst_labels'] # N obj_center = self.scan_data[scan_id]['obj_center'] obj_box_size = self.scan_data[scan_id]['obj_box_size'] obj_center_pred = self.scan_data[scan_id]['obj_center_pred'] obj_box_size_pred = self.scan_data[scan_id]['obj_box_size_pred'] for i, _ in enumerate(obj_center_pred): for j, _ in enumerate(obj_center): if eval_ref_one_sample(construct_bbox_corners(obj_center[j], obj_box_size[j]), construct_bbox_corners(obj_center_pred[i], obj_box_size_pred[i])) >= 0.25: obj_labels[i] = gt_obj_labels[j] break # filter out background or language if self.filter_lang: if self.pc_type == 'gt': selected_obj_idxs = [i for i, obj_label in enumerate(obj_labels) if (self.int2cat[obj_label] not in ['wall', 'floor', 'ceiling']) and (self.int2cat[obj_label] in sentence)] if tgt_object_id not in selected_obj_idxs: selected_obj_idxs.append(tgt_object_id) else: selected_obj_idxs = [i for i in range(len(obj_pcds))] else: if self.pc_type == 'gt': selected_obj_idxs = [i for i, obj_label in enumerate(obj_labels) if (self.int2cat[obj_label] not in ['wall', 'floor', 'ceiling'])] if tgt_object_id not in selected_obj_idxs: selected_obj_idxs.append(tgt_object_id) else: selected_obj_idxs = [i for i in range(len(obj_pcds))] obj_pcds = [obj_pcds[id] for id in selected_obj_idxs] obj_labels = [obj_labels[id] for id in selected_obj_idxs] # build tgt object id and box if self.pc_type == 'gt': tgt_object_id = selected_obj_idxs.index(tgt_object_id) tgt_object_label = obj_labels[tgt_object_id] tgt_object_id_iou25_list = [tgt_object_id] tgt_object_id_iou50_list = [tgt_object_id] elif self.pc_type == 'pred': obj_ids = self.scan_data[scan_id]['inst_ids'] # N tgt_object_id = obj_ids.index(tgt_object_instance) gt_pcd = self.scan_data[scan_id]["obj_pcds"][tgt_object_id] gt_center, gt_box_size = convert_pc_to_box(gt_pcd) tgt_object_id = -1 tgt_object_id_iou25_list = [] tgt_object_id_iou50_list = [] tgt_object_label = self.cat2int[tgt_object_name] # find tgt iou 25 for i, _ in enumerate(obj_pcds): obj_center, obj_box_size = convert_pc_to_box(obj_pcds[i]) if eval_ref_one_sample(construct_bbox_corners(obj_center, obj_box_size), construct_bbox_corners(gt_center, gt_box_size)) >= 0.25: tgt_object_id = i tgt_object_id_iou25_list.append(i) # find tgt iou 50 for i, _ in enumerate(obj_pcds): obj_center, obj_box_size = convert_pc_to_box(obj_pcds[i]) if eval_ref_one_sample(construct_bbox_corners(obj_center, obj_box_size), construct_bbox_corners(gt_center, gt_box_size)) >= 0.5: tgt_object_id_iou50_list.append(i) assert len(obj_pcds) == len(obj_labels) # crop objects if self.max_obj_len < len(obj_pcds): # select target first if tgt_object_id != -1: selected_obj_idxs = [tgt_object_id] selected_obj_idxs.extend(tgt_object_id_iou25_list) selected_obj_idxs.extend(tgt_object_id_iou50_list) selected_obj_idxs = list(set(selected_obj_idxs)) # select object with same 
semantic class with tgt_object remained_obj_idx = [] for kobj, klabel in enumerate(obj_labels): if kobj not in selected_obj_idxs: if klabel == tgt_object_label: selected_obj_idxs.append(kobj) else: remained_obj_idx.append(kobj) if len(selected_obj_idxs) == self.max_obj_len: break if len(selected_obj_idxs) < self.max_obj_len: random.shuffle(remained_obj_idx) selected_obj_idxs += remained_obj_idx[:(self.max_obj_len - len(selected_obj_idxs))] # reorganize ids obj_pcds = [obj_pcds[i] for i in selected_obj_idxs] obj_labels = [obj_labels[i] for i in selected_obj_idxs] if tgt_object_id != -1: tgt_object_id = selected_obj_idxs.index(tgt_object_id) tgt_object_id_iou25_list = [selected_obj_idxs.index(id) for id in tgt_object_id_iou25_list] tgt_object_id_iou50_list = [selected_obj_idxs.index(id) for id in tgt_object_id_iou50_list] assert len(obj_pcds) == self.max_obj_len # rebuild tgt_object_id if tgt_object_id == -1: tgt_object_id = len(obj_pcds) if not self.load_scene_pcds: if not self.aug_cfg: obj_fts, obj_locs, obj_boxes, obj_labels = self._obj_processing_post(obj_pcds, obj_labels, is_need_bbox=True, rot_aug=self.rot_aug) else: obj_fts, obj_locs, obj_boxes, obj_labels = self._obj_processing_aug(obj_pcds, obj_labels, is_need_bbox=True) else: assert self.aug_cfg if self.pc_type == 'pred': bg_pcds = self.scan_data[scan_id]['bg_pcds_pred'] else: bg_pcds = self.scan_data[scan_id]['bg_pcds'] obj_locs, obj_boxes, obj_labels, obj_pcds_masks, scene_pcds = self._scene_processing_aug(obj_pcds, bg_pcds, obj_labels, is_need_bbox=True) # build iou25 and iou50 tgt_object_id_iou25 = torch.zeros(len(obj_pcds) + 1).long() tgt_object_id_iou50 = torch.zeros(len(obj_pcds) + 1).long() for _id in tgt_object_id_iou25_list: tgt_object_id_iou25[_id] = 1 for _id in tgt_object_id_iou50_list: tgt_object_id_iou50[_id] = 1 # build unique multiple is_multiple = self.scan_data[scan_id]['label_count_multi'][self.label_converter.id_to_scannetid [tgt_object_label]] > 1 is_hard = self.scan_data[scan_id]['label_count'][tgt_object_label] > 2 data_dict = { "sentence": sentence, "tgt_object_id": torch.LongTensor([tgt_object_id]), # 1 "tgt_object_label": torch.LongTensor([tgt_object_label]), # 1 "obj_locs": obj_locs, # N, 3 "obj_labels": obj_labels, # N, "obj_boxes": obj_boxes, # N, 6 "data_idx": item_id, "tgt_object_id_iou25": tgt_object_id_iou25, "tgt_object_id_iou50": tgt_object_id_iou50, 'is_multiple': is_multiple, 'is_view_dependent': is_view_dependent, 'is_hard': is_hard } if self.load_scene_pcds: data_dict["scene_pcds"] = scene_pcds data_dict["obj_pcds_masks"] = obj_pcds_masks else: data_dict["obj_fts"] = obj_fts # N, 6 if self.use_scene_cap: data_dict["scene_cap"] = scene_cap return data_dict def _getitem_perscene(self, index): item = self.lang_data[index] scan_id = item[0]['scan_id'] # load lang list list_item_id = [_i['item_id'] for _i in item] list_tgt_object_instance = [int(_i['target_id']) for _i in item] list_tgt_object_name = [_i['instance_type'] for _i in item] # (sentence, token_seq, token_mask) list_sentence = [_i['utterance'][0] for _i in item] list_token = [_i['utterance'][1] for _i in item] list_mask = [_i['utterance'][2] for _i in item] list_is_view_dependent = [is_explicitly_view_dependent(sentence.split(' ')) for sentence in list_sentence] # load pcds and labels if self.pc_type == 'gt': obj_pcds = self.scan_data[scan_id]['obj_pcds'] # N, 6 obj_labels = self.scan_data[scan_id]['inst_labels'] # N obj_ids = self.scan_data[scan_id]['inst_ids'] # N elif self.pc_type == 'pred': obj_pcds = 
self.scan_data[scan_id]['obj_pcds_pred'] obj_labels = self.scan_data[scan_id]['inst_labels_pred'] # get obj labels by matching gt_obj_labels = self.scan_data[scan_id]['inst_labels'] # N obj_center = self.scan_data[scan_id]['obj_center'] obj_box_size = self.scan_data[scan_id]['obj_box_size'] obj_center_pred = self.scan_data[scan_id]['obj_center_pred'] obj_box_size_pred = self.scan_data[scan_id]['obj_box_size_pred'] for i, _ in enumerate(obj_center_pred): for j, _ in enumerate(obj_center): if eval_ref_one_sample(construct_bbox_corners(obj_center[j], obj_box_size[j]), construct_bbox_corners(obj_center_pred[i], obj_box_size_pred[i])) >= 0.25: obj_labels[i] = gt_obj_labels[j] break list_tgt_object_id = [obj_ids.index(_ins) for _ins in list_tgt_object_instance] if self.pc_type == 'gt': selected_obj_idxs = [i for i, obj_label in enumerate(obj_labels) if (self.int2cat[obj_label] not in ['wall', 'floor', 'ceiling'])] else: selected_obj_idxs = [i for i in range(len(obj_pcds))] obj_pcds = [obj_pcds[id] for id in selected_obj_idxs] obj_labels = [obj_labels[id] for id in selected_obj_idxs] # build tgt object id and box list_tgt_object_label = [] list_tgt_object_id_iou25_list = [] list_tgt_object_id_iou50_list = [] list_is_multiple = [] list_is_hard = [] for idx, _ in enumerate(list_item_id): item_id = list_item_id[idx] tgt_object_id = list_tgt_object_id[idx] tgt_object_name = list_tgt_object_name[idx] if self.pc_type == 'gt': tgt_object_id = selected_obj_idxs.index(tgt_object_id) tgt_object_label = obj_labels[tgt_object_id] tgt_object_id_iou25_list = [tgt_object_id] tgt_object_id_iou50_list = [tgt_object_id] assert self.int2cat[tgt_object_label] == tgt_object_name, str(self.int2cat[tgt_object_label]) + '-' + tgt_object_name elif self.pc_type == 'pred': gt_pcd = self.scan_data[scan_id]["obj_pcds"][tgt_object_id] gt_center, gt_box_size = convert_pc_to_box(gt_pcd) tgt_object_id = -1 tgt_object_id_iou25_list = [] tgt_object_id_iou50_list = [] tgt_object_label = self.cat2int[tgt_object_name] # find tgt iou 25 for i, _ in enumerate(obj_pcds): obj_center, obj_box_size = convert_pc_to_box(obj_pcds[i]) if eval_ref_one_sample(construct_bbox_corners(obj_center, obj_box_size), construct_bbox_corners(gt_center, gt_box_size)) >= 0.25: tgt_object_id = i tgt_object_id_iou25_list.append(i) # find tgt iou 50 for i, _ in enumerate(obj_pcds): obj_center, obj_box_size = convert_pc_to_box(obj_pcds[i]) if eval_ref_one_sample(construct_bbox_corners(obj_center, obj_box_size), construct_bbox_corners(gt_center, gt_box_size)) >= 0.5: tgt_object_id_iou50_list.append(i) # build unique multiple is_multiple = self.scan_data[scan_id]['label_count'][self.label_converter.id_to_scannetid [tgt_object_label]] > 1 is_hard = self.scan_data[scan_id]['label_count'][tgt_object_label] > 2 list_tgt_object_id[idx] = tgt_object_id list_tgt_object_label.append(tgt_object_label) list_tgt_object_id_iou25_list.append(tgt_object_id_iou25_list) list_tgt_object_id_iou50_list.append(tgt_object_id_iou50_list) list_is_multiple.append(is_multiple) list_is_hard.append(is_hard) # crop objects if self.max_obj_len < len(obj_pcds): # select target first selected_obj_idxs = [x for x in list_tgt_object_id if x != -1] for idx, _ in enumerate(list_tgt_object_id): selected_obj_idxs.extend(list_tgt_object_id_iou25_list[idx]) selected_obj_idxs.extend(list_tgt_object_id_iou50_list[idx]) selected_obj_idxs = list(set(selected_obj_idxs)) # select object with same semantic class with tgt_object remained_obj_idx = [] for kobj, klabel in enumerate(obj_labels): if kobj not in 
selected_obj_idxs: if klabel == tgt_object_label: selected_obj_idxs.append(kobj) else: remained_obj_idx.append(kobj) if len(selected_obj_idxs) == self.max_obj_len: break if len(selected_obj_idxs) < self.max_obj_len: random.shuffle(remained_obj_idx) selected_obj_idxs += remained_obj_idx[:(self.max_obj_len - len(selected_obj_idxs))] # reorganize ids obj_pcds = [obj_pcds[i] for i in selected_obj_idxs] obj_labels = [obj_labels[i] for i in selected_obj_idxs] list_tgt_object_id_tmp = [] for tgt_object_id in list_tgt_object_id: list_tgt_object_id_tmp.append(selected_obj_idxs.index(tgt_object_id) if tgt_object_id != -1 else -1) list_tgt_object_id = list_tgt_object_id_tmp list_tgt_object_id_iou25_list_tmp = [] for tgt_object_id_iou25_list in list_tgt_object_id_iou25_list: list_tgt_object_id_iou25_list_tmp.append([selected_obj_idxs.index(id) for id in tgt_object_id_iou25_list]) list_tgt_object_id_iou25_list = list_tgt_object_id_iou25_list_tmp list_tgt_object_id_iou50_list_tmp = [] for tgt_object_id_iou50_list in list_tgt_object_id_iou50_list: list_tgt_object_id_iou50_list_tmp.append([selected_obj_idxs.index(id) for id in tgt_object_id_iou50_list]) list_tgt_object_id_iou50_list = list_tgt_object_id_iou50_list_tmp assert len(obj_pcds) == self.max_obj_len # rebuild tgt_object_id for idx, _id in enumerate(list_tgt_object_id): if _id == -1: list_tgt_object_id[idx] = len(obj_pcds) # build scene assert self.aug_cfg if self.pc_type == 'pred': bg_pcds = self.scan_data[scan_id]['bg_pcds_pred'] else: bg_pcds = self.scan_data[scan_id]['bg_pcds'] obj_locs, obj_boxes, obj_labels, obj_pcds_masks, scene_pcds = self._scene_processing_aug(obj_pcds, bg_pcds, obj_labels, is_need_bbox=True) # build iou25 and iou50 list_tgt_object_id_iou25 = torch.zeros((len(list_sentence), len(obj_pcds) + 1)).long() list_tgt_object_id_iou50 = torch.zeros((len(list_sentence), len(obj_pcds) + 1)).long() for _rid, tgt_id in enumerate(list_tgt_object_id_iou25_list): for _cid in tgt_id: list_tgt_object_id_iou25[_rid, _cid] = 1 for _rid, tgt_id in enumerate(list_tgt_object_id_iou50_list): for _cid in tgt_id: list_tgt_object_id_iou50[_rid, _cid] = 1 data_dict = { "sentence": list_sentence, # list, len L "txt_ids": list_token, # Tensor, L "txt_masks": torch.LongTensor(list_mask), # Tensor, L "tgt_object_id": torch.LongTensor(list_tgt_object_id), # Tensor, L "tgt_object_label": torch.LongTensor(list_tgt_object_label), # Tensor, L "scene_pcds": scene_pcds, # N_pts, 6 "obj_locs": obj_locs, # Tensor N, 6 "obj_labels": obj_labels, # Tensor, N "obj_boxes": obj_boxes, # Tensor, N, 6 "data_idx": item_id, # str, 1 "tgt_object_id_iou25": list_tgt_object_id_iou25, # Tensor, (L, N+1) "tgt_object_id_iou50": list_tgt_object_id_iou50, # Tensor, (L, N+1) "is_multiple": torch.LongTensor(list_is_multiple), # list, L "is_view_dependent": torch.LongTensor(list_is_view_dependent), # List, L "is_hard": torch.LongTensor(list_is_hard), # List, L "obj_pcds_masks": obj_pcds_masks # Tensor, (N, 1024) } return data_dict def _obj_processing_post(self, obj_pcds, obj_labels, is_need_bbox=False, rot_aug=False): # rotate obj rot_matrix = build_rotate_mat(self.split, rot_aug) # normalize pc and calculate location obj_fts = [] obj_locs = [] obj_boxes = [] for obj_pcd in obj_pcds: # build locs if rot_matrix is not None: obj_pcd[:, :3] = np.matmul(obj_pcd[:, :3], rot_matrix.transpose()) obj_center = obj_pcd[:, :3].mean(0) obj_size = obj_pcd[:, :3].max(0) - obj_pcd[:, :3].min(0) obj_locs.append(np.concatenate([obj_center, obj_size], 0)) # build box if is_need_bbox: obj_box_center = 
(obj_pcd[:, :3].max(0) + obj_pcd[:, :3].min(0)) / 2 obj_box_size = obj_pcd[:, :3].max(0) - obj_pcd[:, :3].min(0) obj_boxes.append(np.concatenate([obj_box_center, obj_box_size], 0)) # subsample pcd_idxs = np.random.choice(len(obj_pcd), size=self.num_points, replace=len(obj_pcd) < self.num_points) obj_pcd = obj_pcd[pcd_idxs] # normalize obj_pcd[:, :3] = obj_pcd[:, :3] - obj_pcd[:, :3].mean(0) max_dist = np.max(np.sqrt(np.sum(obj_pcd[:, :3]**2, 1))) if max_dist < 1e-6: # take care of tiny point-clouds, i.e., padding max_dist = 1 obj_pcd[:, :3] = obj_pcd[:, :3] / max_dist obj_fts.append(obj_pcd) # convert to torch obj_fts = torch.from_numpy(np.stack(obj_fts, 0)) obj_locs = torch.from_numpy(np.array(obj_locs)) obj_boxes = torch.from_numpy(np.array(obj_boxes)) obj_labels = torch.LongTensor(obj_labels) assert obj_labels.shape[0] == obj_locs.shape[0] assert obj_fts.shape[0] == obj_locs.shape[0] return obj_fts, obj_locs, obj_boxes, obj_labels def _obj_processing_aug(self, obj_pcds, obj_labels, is_need_bbox=False): # augment objects if self.augmentor: data_dict = self.augmentor.forward({'obj_pcds': obj_pcds, 'num_points': self.num_points}) obj_pcds = data_dict['obj_pcds'] if isinstance(obj_pcds, list): obj_pcds = torch.Tensor(np.array(obj_pcds)) obj_sizes = torch.Tensor(np.array(data_dict['obj_sizes'])) xyz = obj_pcds[:, :, :3] center = xyz.mean(1) xyz_min = xyz.min(1).values xyz_max = xyz.max(1).values box_center = (xyz_min + xyz_max) / 2 size = torch.Tensor(obj_sizes) # size = xyz_max - xyz_min obj_locs = torch.cat([center, size], dim=1) obj_boxes = torch.cat([box_center, size], dim=1) # centering obj_pcds[:, :, :3].sub_(obj_pcds[:, :, :3].mean(1, keepdim=True)) # normalization max_dist = (obj_pcds[:, :, :3]**2).sum(2).sqrt().max(1).values max_dist.clamp_(min=1e-6) obj_pcds[:, :, :3].div_(max_dist[:, None, None]) # convert to torch obj_labels = torch.LongTensor(obj_labels) assert obj_labels.shape[0] == obj_locs.shape[0] return obj_pcds, obj_locs, obj_boxes, obj_labels def _scene_processing_aug(self, obj_pcds, bg_pcds, obj_labels, is_need_bbox=False): obj_len = len(obj_pcds) # sample background points fg_points_num = len(obj_pcds) * self.num_points assert fg_points_num < self.max_pcd_num_points bg_points_num = min(self.max_pcd_num_points - fg_points_num, self.bg_points_num) assert len(bg_pcds) > 0 assert bg_points_num > 0 bg_points_indices = np.random.choice(len(bg_pcds), size=bg_points_num, replace=len(bg_pcds) < bg_points_num) bg_pcds = bg_pcds[bg_points_indices] # augment objects if self.augmentor: data_dict = self.augmentor.forward({'obj_pcds': obj_pcds, 'bg_pcds': torch.Tensor(bg_pcds), 'num_points': self.num_points}) obj_pcds = data_dict['obj_pcds'] if isinstance(obj_pcds, list): obj_pcds = torch.Tensor(np.array(obj_pcds)) obj_sizes = torch.Tensor(np.array(data_dict['obj_sizes'])) bg_pcds = data_dict['bg_pcds'] assert len(obj_pcds) * obj_pcds[0].shape[0] == fg_points_num scene_pcds = np.vstack([np.array(obj_pcds.reshape(-1, 6)), np.array(bg_pcds)]) xyz = obj_pcds[:, :, :3] center = xyz.mean(1) xyz_min = xyz.min(1).values xyz_max = xyz.max(1).values box_center = (xyz_min + xyz_max) / 2 size = torch.Tensor(obj_sizes) # size = xyz_max - xyz_min obj_locs = torch.cat([center, size], dim=1) obj_boxes = torch.cat([box_center, size], dim=1) # centering obj_pcds[:, :, :3].sub_(obj_pcds[:, :, :3].mean(1, keepdim=True)) # normalization max_dist = (obj_pcds[:, :, :3]**2).sum(2).sqrt().max(1).values max_dist.clamp_(min=1e-6) obj_pcds[:, :, :3].div_(max_dist[:, None, None]) # generate obj point indices 
masks obj_pcds_masks = [] offset = 0 for _j in range(obj_len): mask = np.arange(self.num_points) + offset assert len(mask) == len(obj_pcds[_j]) obj_pcds_masks.append(mask) offset += self.num_points # convert to torch obj_labels = torch.LongTensor(obj_labels) obj_pcds_masks = torch.from_numpy(np.array(obj_pcds_masks)) assert obj_labels.shape[0] == obj_locs.shape[0] assert obj_pcds_masks.shape[0] == obj_locs.shape[0] return obj_locs, obj_boxes, obj_labels, obj_pcds_masks, scene_pcds def _getitem_finalrefer(self, index): item = self.lang_data[index] item_id = item['item_id'] scan_id = item['scan_id'] tgt_object_instance = int(item['target_id']) tgt_object_name = item['instance_type'] sentence = item['utterance'] is_view_dependent = is_explicitly_view_dependent(item['utterance'].split(' ')) txt_ids = item['txt_ids'] txt_masks = item['txt_masks'] if self.use_scene_cap: scene_caps = self.scene_caps.get(scan_id) if scene_caps is not None: scene_cap = copy.deepcopy(scene_caps[np.random.choice(len(scene_caps))]) else: scene_cap = copy.deepcopy(self.default_scene_cap) if self.use_scene_cap: scene_txt_ids = scene_cap['scene_txt_ids'] scene_txt_masks = scene_cap["scene_txt_masks"] scene_txt_ids, scene_txt_masks = merge_tokens( scene_txt_ids, scene_txt_masks, txt_ids, txt_masks, max_len=self.max_scene_cap_len, tokenizer=self.tokenizer ) # load pcds and labels if self.pc_type == 'gt': obj_pcds = self.scan_data[scan_id]['obj_pcds'] # N, 6 obj_labels = self.scan_data[scan_id]['inst_labels'] # N obj_ids = self.scan_data[scan_id]['inst_ids'] # N elif self.pc_type == 'pred': obj_pcds = self.scan_data[scan_id]['obj_pcds_pred'] obj_labels = self.scan_data[scan_id]['inst_labels_pred'] obj_ids = self.scan_data[scan_id]['inst_ids_pred'] # N assert tgt_object_instance in obj_ids, str(tgt_object_instance) + ' not in ' + str(obj_ids) + '-' + scan_id tgt_object_id = obj_ids.index(tgt_object_instance) # filter out background or language if self.filter_lang: if self.pc_type == 'gt': selected_obj_idxs = [i for i, obj_label in enumerate(obj_labels) if (self.int2cat[obj_label] not in ['wall', 'floor', 'ceiling']) and (self.int2cat[obj_label] in sentence)] if tgt_object_id not in selected_obj_idxs: selected_obj_idxs.append(tgt_object_id) else: selected_obj_idxs = [i for i in range(len(obj_pcds))] else: if self.pc_type == 'gt': selected_obj_idxs = [i for i, obj_label in enumerate(obj_labels) if (self.int2cat[obj_label] not in ['wall', 'floor', 'ceiling'])] if tgt_object_id not in selected_obj_idxs: selected_obj_idxs.append(tgt_object_id) else: selected_obj_idxs = [i for i in range(len(obj_pcds))] obj_pcds = [obj_pcds[id] for id in selected_obj_idxs] obj_labels = [obj_labels[id] for id in selected_obj_idxs] # build tgt object id and box if self.pc_type == 'gt': tgt_object_id = selected_obj_idxs.index(tgt_object_id) tgt_object_label = obj_labels[tgt_object_id] tgt_object_id_iou25_list = [tgt_object_id] tgt_object_id_iou50_list = [tgt_object_id] elif self.pc_type == 'pred': gt_pcd = self.scan_data[scan_id]["obj_pcds"][tgt_object_id] gt_center, gt_box_size = convert_pc_to_box(gt_pcd) tgt_object_id = -1 tgt_object_id_iou25_list = [] tgt_object_id_iou50_list = [] tgt_object_label = self.cat2int[tgt_object_name] # find tgt iou 25 for i, _ in enumerate(obj_pcds): obj_center, obj_box_size = convert_pc_to_box(obj_pcds[i]) if eval_ref_one_sample(construct_bbox_corners(obj_center, obj_box_size), construct_bbox_corners(gt_center, gt_box_size)) >= 0.25: tgt_object_id = i tgt_object_id_iou25_list.append(i) # find tgt iou 50 for i, _ in 
enumerate(obj_pcds): obj_center, obj_box_size = convert_pc_to_box(obj_pcds[i]) if eval_ref_one_sample(construct_bbox_corners(obj_center, obj_box_size), construct_bbox_corners(gt_center, gt_box_size)) >= 0.5: tgt_object_id_iou50_list.append(i) assert len(obj_pcds) == len(obj_labels) # crop objects if self.max_obj_len < len(obj_pcds): # select target first if tgt_object_id != -1: selected_obj_idxs = [tgt_object_id] selected_obj_idxs.extend(tgt_object_id_iou25_list) selected_obj_idxs.extend(tgt_object_id_iou50_list) selected_obj_idxs = list(set(selected_obj_idxs)) # select object with same semantic class with tgt_object remained_obj_idx = [] for kobj, klabel in enumerate(obj_labels): if kobj not in selected_obj_idxs: if klabel == tgt_object_label: selected_obj_idxs.append(kobj) else: remained_obj_idx.append(kobj) if len(selected_obj_idxs) == self.max_obj_len: break if len(selected_obj_idxs) < self.max_obj_len: random.shuffle(remained_obj_idx) selected_obj_idxs += remained_obj_idx[:(self.max_obj_len - len(selected_obj_idxs))] # reorganize ids obj_pcds = [obj_pcds[i] for i in selected_obj_idxs] obj_labels = [obj_labels[i] for i in selected_obj_idxs] if tgt_object_id != -1: tgt_object_id = selected_obj_idxs.index(tgt_object_id) tgt_object_id_iou25_list = [selected_obj_idxs.index(id) for id in tgt_object_id_iou25_list] tgt_object_id_iou50_list = [selected_obj_idxs.index(id) for id in tgt_object_id_iou50_list] assert len(obj_pcds) == self.max_obj_len # rebuild tgt_object_id if tgt_object_id == -1: tgt_object_id = len(obj_pcds) if not self.load_scene_pcds: if not self.aug_cfg: obj_fts, obj_locs, obj_boxes, obj_labels = self._obj_processing_post(obj_pcds, obj_labels, is_need_bbox=True, rot_aug=self.rot_aug) else: obj_fts, obj_locs, obj_boxes, obj_labels = self._obj_processing_aug(obj_pcds, obj_labels, is_need_bbox=True) else: assert self.aug_cfg if self.pc_type == 'pred': bg_pcds = self.scan_data[scan_id]['bg_pcds_pred'] else: bg_pcds = self.scan_data[scan_id]['bg_pcds'] obj_locs, obj_boxes, obj_labels, obj_pcds_masks, scene_pcds = self._scene_processing_aug(obj_pcds, bg_pcds, obj_labels, is_need_bbox=True) # build iou25 and iou50 tgt_object_id_iou25 = torch.zeros(len(obj_pcds) + 1).long() tgt_object_id_iou50 = torch.zeros(len(obj_pcds) + 1).long() for _id in tgt_object_id_iou25_list: tgt_object_id_iou25[_id] = 1 for _id in tgt_object_id_iou50_list: tgt_object_id_iou50[_id] = 1 # build unique multiple is_multiple = self.scan_data[scan_id]['label_count'][tgt_object_label] > 1 is_hard = self.scan_data[scan_id]['label_count'][tgt_object_label] > 2 data_dict = { "sentence": sentence, "txt_ids": torch.LongTensor(txt_ids), "txt_masks": torch.LongTensor(txt_masks), "tgt_object_id": torch.LongTensor([tgt_object_id]), # 1 "tgt_object_label": torch.LongTensor([tgt_object_label]), # 1 "obj_locs": obj_locs, # N, 3 "obj_labels": obj_labels, # N, "obj_boxes": obj_boxes, # N, 6 "data_idx": item_id, "tgt_object_id_iou25": tgt_object_id_iou25, "tgt_object_id_iou50": tgt_object_id_iou50, 'is_multiple': is_multiple, 'is_view_dependent': is_view_dependent, 'is_hard': is_hard } if self.load_scene_pcds: data_dict["scene_pcds"] = scene_pcds data_dict["obj_pcds_masks"] = obj_pcds_masks else: data_dict["obj_fts"] = obj_fts # N, 6 if self.use_scene_cap: data_dict["scene_cap"] = scene_cap["scene_cap"] data_dict["scene_txt_ids"] = torch.LongTensor(scene_txt_ids) data_dict["scene_txt_masks"] = torch.LongTensor(scene_txt_masks) return data_dict ================================================ FILE: data/datasets/constant.py 
================================================ ### ScanNet200 Benchmark constants ### VALID_CLASS_IDS_200 = ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 84, 86, 87, 88, 89, 90, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 110, 112, 115, 116, 118, 120, 121, 122, 125, 128, 130, 131, 132, 134, 136, 138, 139, 140, 141, 145, 148, 154, 155, 156, 157, 159, 161, 163, 165, 166, 168, 169, 170, 177, 180, 185, 188, 191, 193, 195, 202, 208, 213, 214, 221, 229, 230, 232, 233, 242, 250, 261, 264, 276, 283, 286, 300, 304, 312, 323, 325, 331, 342, 356, 370, 392, 395, 399, 408, 417, 488, 540, 562, 570, 572, 581, 609, 748, 776, 1156, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, ) CLASS_LABELS_200 = ( "wall", "chair", "floor", "table", "door", "couch", "cabinet", "shelf", "desk", "office chair", "bed", "pillow", "sink", "picture", "window", "toilet", "bookshelf", "monitor", "curtain", "book", "armchair", "coffee table", "box", "refrigerator", "lamp", "kitchen cabinet", "towel", "clothes", "tv", "nightstand", "counter", "dresser", "stool", "cushion", "plant", "ceiling", "bathtub", "end table", "dining table", "keyboard", "bag", "backpack", "toilet paper", "printer", "tv stand", "whiteboard", "blanket", "shower curtain", "trash can", "closet", "stairs", "microwave", "stove", "shoe", "computer tower", "bottle", "bin", "ottoman", "bench", "board", "washing machine", "mirror", "copier", "basket", "sofa chair", "file cabinet", "fan", "laptop", "shower", "paper", "person", "paper towel dispenser", "oven", "blinds", "rack", "plate", "blackboard", "piano", "suitcase", "rail", "radiator", "recycling bin", "container", "wardrobe", "soap dispenser", "telephone", "bucket", "clock", "stand", "light", "laundry basket", "pipe", "clothes dryer", "guitar", "toilet paper holder", "seat", "speaker", "column", "bicycle", "ladder", "bathroom stall", "shower wall", "cup", "jacket", "storage bin", "coffee maker", "dishwasher", "paper towel roll", "machine", "mat", "windowsill", "bar", "toaster", "bulletin board", "ironing board", "fireplace", "soap dish", "kitchen counter", "doorframe", "toilet paper dispenser", "mini fridge", "fire extinguisher", "ball", "hat", "shower curtain rod", "water cooler", "paper cutter", "tray", "shower door", "pillar", "ledge", "toaster oven", "mouse", "toilet seat cover dispenser", "furniture", "cart", "storage container", "scale", "tissue box", "light switch", "crate", "power outlet", "decoration", "sign", "projector", "closet door", "vacuum cleaner", "candle", "plunger", "stuffed animal", "headphones", "dish rack", "broom", "guitar case", "range hood", "dustpan", "hair dryer", "water bottle", "handicap bar", "purse", "vent", "shower floor", "water pitcher", "mailbox", "bowl", "paper bag", "alarm clock", "music stand", "projector screen", "divider", "laundry detergent", "bathroom counter", "object", "bathroom vanity", "closet wall", "laundry hamper", "bathroom stall door", "ceiling light", "trash bin", "dumbbell", "stair rail", "tube", "bathroom cabinet", "cd case", "closet rod", "coffee kettle", "structure", "shower head", "keyboard piano", "case of water bottles", "coat rack", "storage organizer", "folded chair", "fire alarm", 
"power strip", "calendar", "poster", "potted plant", "luggage", "mattress", ) SCANNET_COLOR_MAP_200 = { 0: (0.0, 0.0, 0.0), 1: (174.0, 199.0, 232.0), 2: (188.0, 189.0, 34.0), 3: (152.0, 223.0, 138.0), 4: (255.0, 152.0, 150.0), 5: (214.0, 39.0, 40.0), 6: (91.0, 135.0, 229.0), 7: (31.0, 119.0, 180.0), 8: (229.0, 91.0, 104.0), 9: (247.0, 182.0, 210.0), 10: (91.0, 229.0, 110.0), 11: (255.0, 187.0, 120.0), 13: (141.0, 91.0, 229.0), 14: (112.0, 128.0, 144.0), 15: (196.0, 156.0, 148.0), 16: (197.0, 176.0, 213.0), 17: (44.0, 160.0, 44.0), 18: (148.0, 103.0, 189.0), 19: (229.0, 91.0, 223.0), 21: (219.0, 219.0, 141.0), 22: (192.0, 229.0, 91.0), 23: (88.0, 218.0, 137.0), 24: (58.0, 98.0, 137.0), 26: (177.0, 82.0, 239.0), 27: (255.0, 127.0, 14.0), 28: (237.0, 204.0, 37.0), 29: (41.0, 206.0, 32.0), 31: (62.0, 143.0, 148.0), 32: (34.0, 14.0, 130.0), 33: (143.0, 45.0, 115.0), 34: (137.0, 63.0, 14.0), 35: (23.0, 190.0, 207.0), 36: (16.0, 212.0, 139.0), 38: (90.0, 119.0, 201.0), 39: (125.0, 30.0, 141.0), 40: (150.0, 53.0, 56.0), 41: (186.0, 197.0, 62.0), 42: (227.0, 119.0, 194.0), 44: (38.0, 100.0, 128.0), 45: (120.0, 31.0, 243.0), 46: (154.0, 59.0, 103.0), 47: (169.0, 137.0, 78.0), 48: (143.0, 245.0, 111.0), 49: (37.0, 230.0, 205.0), 50: (14.0, 16.0, 155.0), 51: (196.0, 51.0, 182.0), 52: (237.0, 80.0, 38.0), 54: (138.0, 175.0, 62.0), 55: (158.0, 218.0, 229.0), 56: (38.0, 96.0, 167.0), 57: (190.0, 77.0, 246.0), 58: (208.0, 49.0, 84.0), 59: (208.0, 193.0, 72.0), 62: (55.0, 220.0, 57.0), 63: (10.0, 125.0, 140.0), 64: (76.0, 38.0, 202.0), 65: (191.0, 28.0, 135.0), 66: (211.0, 120.0, 42.0), 67: (118.0, 174.0, 76.0), 68: (17.0, 242.0, 171.0), 69: (20.0, 65.0, 247.0), 70: (208.0, 61.0, 222.0), 71: (162.0, 62.0, 60.0), 72: (210.0, 235.0, 62.0), 73: (45.0, 152.0, 72.0), 74: (35.0, 107.0, 149.0), 75: (160.0, 89.0, 237.0), 76: (227.0, 56.0, 125.0), 77: (169.0, 143.0, 81.0), 78: (42.0, 143.0, 20.0), 79: (25.0, 160.0, 151.0), 80: (82.0, 75.0, 227.0), 82: (253.0, 59.0, 222.0), 84: (240.0, 130.0, 89.0), 86: (123.0, 172.0, 47.0), 87: (71.0, 194.0, 133.0), 88: (24.0, 94.0, 205.0), 89: (134.0, 16.0, 179.0), 90: (159.0, 32.0, 52.0), 93: (213.0, 208.0, 88.0), 95: (64.0, 158.0, 70.0), 96: (18.0, 163.0, 194.0), 97: (65.0, 29.0, 153.0), 98: (177.0, 10.0, 109.0), 99: (152.0, 83.0, 7.0), 100: (83.0, 175.0, 30.0), 101: (18.0, 199.0, 153.0), 102: (61.0, 81.0, 208.0), 103: (213.0, 85.0, 216.0), 104: (170.0, 53.0, 42.0), 105: (161.0, 192.0, 38.0), 106: (23.0, 241.0, 91.0), 107: (12.0, 103.0, 170.0), 110: (151.0, 41.0, 245.0), 112: (133.0, 51.0, 80.0), 115: (184.0, 162.0, 91.0), 116: (50.0, 138.0, 38.0), 118: (31.0, 237.0, 236.0), 120: (39.0, 19.0, 208.0), 121: (223.0, 27.0, 180.0), 122: (254.0, 141.0, 85.0), 125: (97.0, 144.0, 39.0), 128: (106.0, 231.0, 176.0), 130: (12.0, 61.0, 162.0), 131: (124.0, 66.0, 140.0), 132: (137.0, 66.0, 73.0), 134: (250.0, 253.0, 26.0), 136: (55.0, 191.0, 73.0), 138: (60.0, 126.0, 146.0), 139: (153.0, 108.0, 234.0), 140: (184.0, 58.0, 125.0), 141: (135.0, 84.0, 14.0), 145: (139.0, 248.0, 91.0), 148: (53.0, 200.0, 172.0), 154: (63.0, 69.0, 134.0), 155: (190.0, 75.0, 186.0), 156: (127.0, 63.0, 52.0), 157: (141.0, 182.0, 25.0), 159: (56.0, 144.0, 89.0), 161: (64.0, 160.0, 250.0), 163: (182.0, 86.0, 245.0), 165: (139.0, 18.0, 53.0), 166: (134.0, 120.0, 54.0), 168: (49.0, 165.0, 42.0), 169: (51.0, 128.0, 133.0), 170: (44.0, 21.0, 163.0), 177: (232.0, 93.0, 193.0), 180: (176.0, 102.0, 54.0), 185: (116.0, 217.0, 17.0), 188: (54.0, 209.0, 150.0), 191: (60.0, 99.0, 204.0), 193: (129.0, 43.0, 144.0), 195: (252.0, 
100.0, 106.0), 202: (187.0, 196.0, 73.0), 208: (13.0, 158.0, 40.0), 213: (52.0, 122.0, 152.0), 214: (128.0, 76.0, 202.0), 221: (187.0, 50.0, 115.0), 229: (180.0, 141.0, 71.0), 230: (77.0, 208.0, 35.0), 232: (72.0, 183.0, 168.0), 233: (97.0, 99.0, 203.0), 242: (172.0, 22.0, 158.0), 250: (155.0, 64.0, 40.0), 261: (118.0, 159.0, 30.0), 264: (69.0, 252.0, 148.0), 276: (45.0, 103.0, 173.0), 283: (111.0, 38.0, 149.0), 286: (184.0, 9.0, 49.0), 300: (188.0, 174.0, 67.0), 304: (53.0, 206.0, 53.0), 312: (97.0, 235.0, 252.0), 323: (66.0, 32.0, 182.0), 325: (236.0, 114.0, 195.0), 331: (241.0, 154.0, 83.0), 342: (133.0, 240.0, 52.0), 356: (16.0, 205.0, 144.0), 370: (75.0, 101.0, 198.0), 392: (237.0, 95.0, 251.0), 395: (191.0, 52.0, 49.0), 399: (227.0, 254.0, 54.0), 408: (49.0, 206.0, 87.0), 417: (48.0, 113.0, 150.0), 488: (125.0, 73.0, 182.0), 540: (229.0, 32.0, 114.0), 562: (158.0, 119.0, 28.0), 570: (60.0, 205.0, 27.0), 572: (18.0, 215.0, 201.0), 581: (79.0, 76.0, 153.0), 609: (134.0, 13.0, 116.0), 748: (192.0, 97.0, 63.0), 776: (108.0, 163.0, 18.0), 1156: (95.0, 220.0, 156.0), 1163: (98.0, 141.0, 208.0), 1164: (144.0, 19.0, 193.0), 1165: (166.0, 36.0, 57.0), 1166: (212.0, 202.0, 34.0), 1167: (23.0, 206.0, 34.0), 1168: (91.0, 211.0, 236.0), 1169: (79.0, 55.0, 137.0), 1170: (182.0, 19.0, 117.0), 1171: (134.0, 76.0, 14.0), 1172: (87.0, 185.0, 28.0), 1173: (82.0, 224.0, 187.0), 1174: (92.0, 110.0, 214.0), 1175: (168.0, 80.0, 171.0), 1176: (197.0, 63.0, 51.0), 1178: (175.0, 199.0, 77.0), 1179: (62.0, 180.0, 98.0), 1180: (8.0, 91.0, 150.0), 1181: (77.0, 15.0, 130.0), 1182: (154.0, 65.0, 96.0), 1183: (197.0, 152.0, 11.0), 1184: (59.0, 155.0, 45.0), 1185: (12.0, 147.0, 145.0), 1186: (54.0, 35.0, 219.0), 1187: (210.0, 73.0, 181.0), 1188: (221.0, 124.0, 77.0), 1189: (149.0, 214.0, 66.0), 1190: (72.0, 185.0, 134.0), 1191: (42.0, 94.0, 198.0), } HEAD_CATS_SCANNET_200 = ['tv stand', 'curtain', 'blinds', 'shower curtain', 'bookshelf', 'tv', 'kitchen cabinet', 'pillow', 'lamp', 'dresser', 'monitor', 'object', 'ceiling', 'board', 'stove', 'closet wall', 'couch', 'office chair', 'kitchen counter', 'shower', 'closet', 'doorframe', 'sofa chair', 'mailbox', 'nightstand', 'washing machine', 'picture', 'book', 'sink', 'recycling bin', 'table', 'backpack', 'shower wall', 'toilet', 'copier', 'counter', 'stool', 'refrigerator', 'window', 'file cabinet', 'chair', 'wall', 'plant', 'coffee table', 'stairs', 'armchair', 'cabinet', 'bathroom vanity', 'bathroom stall', 'mirror', 'blackboard', 'trash can', 'stair rail', 'box', 'towel', 'door', 'clothes', 'whiteboard', 'bed', 'floor', 'bathtub', 'desk', 'wardrobe', 'clothes dryer', 'radiator', 'shelf'] COMMON_CATS_SCANNET_200 = ["cushion", "end table", "dining table", "keyboard", "bag", "toilet paper", "printer", "blanket", "microwave", "shoe", "computer tower", "bottle", "bin", "ottoman", "bench", "basket", "fan", "laptop", "person", "paper towel dispenser", "oven", "rack", "piano", "suitcase", "rail", "container", "telephone", "stand", "light", "laundry basket", "pipe", "seat", "column", "bicycle", "ladder", "jacket", "storage bin", "coffee maker", "dishwasher", "machine", "mat", "windowsill", "bulletin board", "fireplace", "mini fridge", "water cooler", "shower door", "pillar", "ledge", "furniture", "cart", "decoration", "closet door", "vacuum cleaner", "dish rack", "range hood", "projector screen", "divider", "bathroom counter", "laundry hamper", "bathroom stall door", "ceiling light", "trash bin", "bathroom cabinet", "structure", "storage organizer", "potted plant", 
"mattress"] TAIL_CATS_SCANNET_200 = ["paper", "plate", "soap dispenser", "bucket", "clock", "guitar", "toilet paper holder", "speaker", "cup", "paper towel roll", "bar", "toaster", "ironing board", "soap dish", "toilet paper dispenser", "fire extinguisher", "ball", "hat", "shower curtain rod", "paper cutter", "tray", "toaster oven", "mouse", "toilet seat cover dispenser", "storage container", "scale", "tissue box", "light switch", "crate", "power outlet", "sign", "projector", "candle", "plunger", "stuffed animal", "headphones", "broom", "guitar case", "dustpan", "hair dryer", "water bottle", "handicap bar", "purse", "vent", "shower floor", "water pitcher", "bowl", "paper bag", "alarm clock", "music stand", "laundry detergent", "dumbbell", "tube", "cd case", "closet rod", "coffee kettle", "shower head", "keyboard piano", "case of water bottles", "coat rack", "folded chair", "fire alarm", "power strip", "calendar", "poster", "luggage"] VALID_CLASS_IDS_200_VALIDATION = ('wall', 'chair', 'floor', 'table', 'door', 'couch', 'cabinet', 'shelf', 'desk', 'office chair', 'bed', 'pillow', 'sink', 'picture', 'window', 'toilet', 'bookshelf', 'monitor', 'curtain', 'book', 'armchair', 'coffee table', 'box', 'refrigerator', 'lamp', 'kitchen cabinet', 'towel', 'clothes', 'tv', 'nightstand', 'counter', 'dresser', 'stool', 'cushion', 'plant', 'ceiling', 'bathtub', 'end table', 'dining table', 'keyboard', 'bag', 'backpack', 'toilet paper', 'printer', 'tv stand', 'whiteboard', 'blanket', 'shower curtain', 'trash can', 'closet', 'stairs', 'microwave', 'stove', 'shoe', 'computer tower', 'bottle', 'bin', 'ottoman', 'bench', 'board', 'washing machine', 'mirror', 'copier', 'basket', 'sofa chair', 'file cabinet', 'fan', 'laptop', 'shower', 'paper', 'person', 'paper towel dispenser', 'oven', 'blinds', 'rack', 'plate', 'blackboard', 'piano', 'suitcase', 'rail', 'radiator', 'recycling bin', 'container', 'wardrobe', 'soap dispenser', 'telephone', 'bucket', 'clock', 'stand', 'light', 'laundry basket', 'pipe', 'clothes dryer', 'guitar', 'toilet paper holder', 'seat', 'speaker', 'column', 'ladder', 'bathroom stall', 'shower wall', 'cup', 'jacket', 'storage bin', 'coffee maker', 'dishwasher', 'paper towel roll', 'machine', 'mat', 'windowsill', 'bar', 'toaster', 'bulletin board', 'ironing board', 'fireplace', 'soap dish', 'kitchen counter', 'doorframe', 'toilet paper dispenser', 'mini fridge', 'fire extinguisher', 'ball', 'hat', 'shower curtain rod', 'water cooler', 'paper cutter', 'tray', 'shower door', 'pillar', 'ledge', 'toaster oven', 'mouse', 'toilet seat cover dispenser', 'furniture', 'cart', 'scale', 'tissue box', 'light switch', 'crate', 'power outlet', 'decoration', 'sign', 'projector', 'closet door', 'vacuum cleaner', 'plunger', 'stuffed animal', 'headphones', 'dish rack', 'broom', 'range hood', 'dustpan', 'hair dryer', 'water bottle', 'handicap bar', 'vent', 'shower floor', 'water pitcher', 'mailbox', 'bowl', 'paper bag', 'projector screen', 'divider', 'laundry detergent', 'bathroom counter', 'object', 'bathroom vanity', 'closet wall', 'laundry hamper', 'bathroom stall door', 'ceiling light', 'trash bin', 'dumbbell', 'stair rail', 'tube', 'bathroom cabinet', 'closet rod', 'coffee kettle', 'shower head', 'keyboard piano', 'case of water bottles', 'coat rack', 'folded chair', 'fire alarm', 'power strip', 'calendar', 'poster', 'potted plant', 'mattress') ================================================ FILE: data/datasets/data_augmentor.py ================================================ from functools import partial 
import math import numpy as np import torch class DataAugmentor(object): def __init__(self, cfg, split, **kwargs): self.data_augmentor_queue = [] self.aug_cfg = cfg self.kwargs = kwargs aug_config_list = self.aug_cfg.aug_list self.data_augmentor_queue = [] if split == 'train': for aug in aug_config_list: if aug not in self.aug_cfg: continue cur_augmentor = partial(getattr(self, aug), config=self.aug_cfg[aug]) self.data_augmentor_queue.append(cur_augmentor) def forward(self, data_dict): """ Args: data_dict: obj_pcds: (N, 3 + C_in) num_points: (1,) ... Returns: """ aug_dict = self.init_aug(len(data_dict['obj_pcds'])) for cur_augmentor in self.data_augmentor_queue: aug_dict = cur_augmentor(aug_dict=aug_dict) data_dict = self.update_data_dict(data_dict, aug_dict) return data_dict def scene_aug(self, aug_dict, config): # scene translation if self.check_key(config.translation) and self.check_p(config.translation): n = np.zeros((3)) for i in range(3): n[i] = np.random.randn() * config.translation.value[i] aug_dict['scene_trans'] = n # scene scaling if self.check_key(config.scaling) and self.check_p(config.scaling): scaling_fac = np.random.rand() \ * (config.scaling.value[1] - config.scaling.value[0]) \ + config.scaling.value[0] aug_dict['scene_scale'] = scaling_fac # scene flip if self.check_key(config.flip) and self.check_p(config.flip): m = np.eye(3) flip_type = np.random.choice(4, 1) if flip_type == 0: # flip x only m[0][0] *= -1 elif flip_type == 1: # flip y only m[1][1] *= -1 elif flip_type == 2: # flip x+y m[0][0] *= -1 m[1][1] *= -1 aug_dict['scene_flip'] = m # scene rotation if self.check_key(config.rotation) and self.check_p(config.rotation): if config.rotation.axis_align: _r_angles = [0, math.pi/2, math.pi, math.pi*3/2] theta_x = np.random.choice(_r_angles) * config.rotation.value[0] theta_y = np.random.choice(_r_angles) * config.rotation.value[1] theta_z = np.random.choice(_r_angles) * config.rotation.value[2] else: theta_x = (np.random.rand() * 2 * math.pi - math.pi) * config.rotation.value[0] theta_y = (np.random.rand() * 2 * math.pi - math.pi) * config.rotation.value[1] theta_z = (np.random.rand() * 2 * math.pi - math.pi) * config.rotation.value[2] rx = np.array \ ([[1, 0, 0], [0, math.cos(theta_x), -math.sin(theta_x)], [0, math.sin(theta_x), math.cos(theta_x)]]) ry = np.array \ ([[math.cos(theta_y), 0, math.sin(theta_y)], [0, 1, 0], [-math.sin(theta_y), 0, math.cos(theta_y)]]) rz = np.array \ ([[math.cos(theta_z), math.sin(theta_z), 0], [-math.sin(theta_z), math.cos(theta_z), 0], [0, 0, 1]]) rot_mats = [rx, ry, rz] if config.rotation.get('shuffle', False): np.random.shuffle(rot_mats) r = rot_mats[0].dot(rot_mats[1]).dot(rot_mats[2]) aug_dict['scene_rot'] = r # scene color jitter if self.check_key(config.color_jitter): rgb_delta = np.random.randn(3) * 0.1 aug_dict['rgb_delta'] = rgb_delta # scene order suffle if self.check_key(config.order_shuffle): aug_dict['obj_order'] = np.random.permutation(len(aug_dict['obj_order'])) return aug_dict def obj_aug(self, aug_dict, config): obj_len = len(aug_dict['obj_order']) obj_trans = [] obj_rot = [] for _ in range(obj_len): n = None r = None # object translation if self.check_key(config.translation) and self.check_p(config.translation): n = np.zeros((3)) for i in range(3): n[i] = np.random.randn() * config.translation.value[i] obj_trans.append(n) # object rotation if self.check_key(config.rotation) and self.check_p(config.rotation): if config.rotation.axis_align: _r_angles = [0, math.pi/2, math.pi, math.pi*3/2] theta_x = 
np.random.choice(_r_angles) * config.rotation.value[0] theta_y = np.random.choice(_r_angles) * config.rotation.value[1] theta_z = np.random.choice(_r_angles) * config.rotation.value[2] else: theta_x = (np.random.rand() * 2 * math.pi - math.pi) * config.rotation.value[0] theta_y = (np.random.rand() * 2 * math.pi - math.pi) * config.rotation.value[1] theta_z = (np.random.rand() * 2 * math.pi - math.pi) * config.rotation.value[2] rx = np.array \ ([[1, 0, 0], [0, math.cos(theta_x), -math.sin(theta_x)], [0, math.sin(theta_x), math.cos(theta_x)]]) ry = np.array \ ([[math.cos(theta_y), 0, math.sin(theta_y)], [0, 1, 0], [-math.sin(theta_y), 0, math.cos(theta_y)]]) rz = np.array \ ([[math.cos(theta_z), math.sin(theta_z), 0], [-math.sin(theta_z), math.cos(theta_z), 0], [0, 0, 1]]) rot_mats = [rx, ry, rz] if config.rotation.get('shuffle', False): np.random.shuffle(rot_mats) r = rot_mats[0].dot(rot_mats[1]).dot(rot_mats[2]) obj_rot.append(r) aug_dict['obj_trans'] = obj_trans aug_dict['obj_rot'] = obj_rot # object jitter if self.check_key(config.random_jitter): aug_dict['obj_jitter'] = config.random_jitter.value # object pts shuffle if self.check_key(config.pts_shuffle): aug_dict['pts_shuffle'] = True return aug_dict def update_data_dict(self, data_dict, aug_dict): data_dict['obj_sizes'] = [] for i, _ in enumerate(data_dict['obj_pcds']): # scene flip if aug_dict['scene_flip'] is not None: data_dict['obj_pcds'][i][:, :3] = self.rot_fn(data_dict['obj_pcds'][i][:, :3], aug_dict['scene_flip']) # scene scaling if aug_dict['scene_scale'] is not None: data_dict['obj_pcds'][i][:, :3] = self.scaling_fn(data_dict['obj_pcds'][i][:, :3], aug_dict['scene_scale']) # subsample data_dict['obj_pcds'][i] = self.subsample_fn(data_dict['obj_pcds'][i], data_dict['num_points']) # jitter if aug_dict['obj_jitter'] is not None: data_dict['obj_pcds'][i][:, :3] = self.jitter_fn(data_dict['obj_pcds'][i][:, :3], aug_dict['obj_jitter']) # calc obj size data_dict['obj_sizes'].append(data_dict['obj_pcds'][i][:, :3].max(0) - data_dict['obj_pcds'][i][:, :3].min(0)) # scene translation if aug_dict['scene_trans'] is not None: data_dict['obj_pcds'][i][:, :3] += aug_dict['scene_trans'] # obj translation if aug_dict['obj_trans'] and aug_dict['obj_trans'][i] is not None: data_dict['obj_pcds'][i][:, :3] += aug_dict['obj_trans'][i] # # scene rotation # if aug_dict['scene_rot'] is not None: # data_dict['obj_pcds'][i][:, :3] = self.rot_fn(data_dict['obj_pcds'][i][:, :3], # aug_dict['scene_rot']) # if 'bg_pcds' in data_dict.keys(): # data_dict['bg_pcds'][:, :3] = self.rot_fn(data_dict['bg_pcds'][:, :3], # aug_dict['scene_rot']) # scene rotation if aug_dict['scene_rot'] is not None: data_dict['obj_pcds'] = torch.Tensor(np.array(data_dict['obj_pcds'])) data_dict['obj_pcds'][:, :, :3] @= aug_dict['scene_rot'] # data_dict['obj_pcds'][:, :3] = self.rot_fn(data_dict['obj_pcds'][i][:, :3], # aug_dict['scene_rot']) if 'bg_pcds' in data_dict.keys(): data_dict['bg_pcds'][:, :3] @= aug_dict['scene_rot'] # ['scene_rot']= self.rot_fn(data_dict['bg_pcds'][:, :3], # aug_dict['scene_rot']) for i, _ in enumerate(data_dict['obj_pcds']): # obj rotation if aug_dict['obj_rot'] and aug_dict['obj_rot'][i] is not None: data_dict['obj_pcds'][i][:, :3] = self.obj_rot_fn(data_dict['obj_pcds'][i][:, :3], aug_dict['obj_rot'][i]) # scene color jitter if aug_dict['rgb_delta'] is not None: data_dict['obj_pcds'][i][:, 3:] += aug_dict['rgb_delta'] # pts shuffle if aug_dict['pts_shuffle']: data_dict['obj_pcds'][i] = self.pts_shuffle_fn(data_dict['obj_pcds'][i]) # object order 
data_dict['obj_order'] = aug_dict['obj_order'] return data_dict @staticmethod def init_aug(obj_len): keys = ['scene_trans', 'scene_flip', 'scene_rot', 'scene_scale', 'rgb_delta', 'obj_trans', 'obj_rot', 'obj_jitter', 'pts_shuffle'] aug_dict = {key: None for key in keys} aug_dict['obj_order'] = list(np.arange(obj_len)) return aug_dict @staticmethod def check_key(key): exist = key is not None if not exist: return False if isinstance(key, bool): enabled = key elif isinstance(key, dict): enabled = key.get('enabled', True) elif hasattr(key, 'enabled'): enabled = key.get('enabled') else: enabled = True return enabled @staticmethod def check_p(key): return (not isinstance(key, dict)) or ('p' not in key) or (np.random.rand() < key['p']) @staticmethod def rot_fn(x, mat): return np.matmul(x, mat) @staticmethod def obj_rot_fn(x, mat): center = x.mean(0) return np.matmul(x - center, mat) + center @staticmethod def scaling_fn(x, scale): center = x.mean(0) return (x - center) * scale + center @staticmethod def jitter_fn(x, scale): return x + (np.random.randn(len(x), 3) - 0.5) * scale @staticmethod def subsample_fn(x, num_points): pcd_idxs = np.random.choice(len(x), size=num_points, replace=len(x) < num_points) return x[pcd_idxs] @staticmethod def pts_shuffle_fn(x): return x[np.random.permutation(len(x))] ================================================ FILE: data/datasets/dataset_wrapper.py ================================================ import random import torch from fvcore.common.registry import Registry from transformers import BertTokenizer, T5Tokenizer, AutoTokenizer from torch.utils.data import Dataset, default_collate from ..data_utils import random_word, random_point_cloud, pad_tensors, Vocabulary, random_caption_word DATASETWRAPPER_REGISTRY = Registry("dataset_wrapper") DATASETWRAPPER_REGISTRY.__doc__ = """ """ @DATASETWRAPPER_REGISTRY.register() class MaskDatasetWrapper(Dataset): def __init__(self, cfg, dataset, split="train"): # tokenizer, max_seq_length=80, max_obj_len=80, # mask_strategy='random', txt_mask_ratio=0.15, pc_mask_ratio=0.1 assert cfg.data.args.mask_strategy in ['random'] self.dataset = dataset self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True) self.max_seq_length = cfg.data.args.max_seq_len self.max_obj_len = cfg.data.args.max_obj_len self.txt_mask_ratio = cfg.data.args.txt_mask_ratio self.pc_mask_ratio = cfg.data.args.pc_mask_ratio self.use_voxel = cfg.data.args.get('use_voxel', None) if self.use_voxel: self.voxel_size = cfg.data.args.get('voxel_size', 0.02) self.use_scene_cap = cfg.data.args.get("use_scene_cap", False) self.max_scene_cap_len = cfg.data.args.get("max_scene_cap_len", self.max_seq_length) def __len__(self): return len(self.dataset) def __getitem__(self, idx): data_dict = self.dataset[idx] sentence = data_dict['sentence'] encoded_input = self.tokenizer(sentence, max_length=self.max_seq_length, add_special_tokens=True, truncation=True, padding='max_length', return_tensors="pt") # build txt data_dict['txt_ids'] = encoded_input['input_ids'].squeeze(0) # L data_dict['txt_masks'] = encoded_input['attention_mask'].squeeze(0) # L if self.use_scene_cap: scene_cap = data_dict["scene_cap"] + " " + sentence encoded_scene_cap = self.tokenizer(scene_cap, max_length=self.max_scene_cap_len, add_special_tokens=True, truncation=True, padding='max_length', return_tensors="pt") data_dict['scene_txt_ids'] = encoded_scene_cap['input_ids'].squeeze(0) # L data_dict['scene_txt_masks'] = encoded_scene_cap['attention_mask'].squeeze(0) # L # mask txt 
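# ---------------------------------------------------------------------------
# Editorial sketch (simplified stand-in, NOT the repo's random_word imported
# from ..data_utils): masked-LM corruption considers only real tokens (where
# the attention mask is 1), replaces each with [MASK] at probability
# txt_mask_ratio, and records the original id as the label; every other
# position gets -100 so the LM loss ignores it, matching the -100 padding
# used for obj_labels below. The real implementation may additionally follow
# BERT's 80/10/10 replace/random/keep recipe. Minimal variant:
def _sketch_random_word(txt_ids, txt_masks, tokenizer, ratio=0.15):
    txt_ids = txt_ids.clone()
    labels = torch.full_like(txt_ids, -100)
    for i in range(txt_ids.shape[0]):
        if txt_masks[i] == 1 and random.random() < ratio:
            labels[i] = txt_ids[i]                # predict the original token
            txt_ids[i] = tokenizer.mask_token_id  # corrupt the input
    return txt_ids, labels
# The wrapper's actual masking call follows:
# ---------------------------------------------------------------------------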
masked_txt_ids, masked_lm_labels = random_word(data_dict['txt_ids'], data_dict['txt_masks'], self.tokenizer, self.txt_mask_ratio) data_dict['txt_ids'] = masked_txt_ids data_dict['masked_lm_labels'] = masked_lm_labels # build object data_dict['obj_masks'] = (torch.arange(self.max_obj_len) < len(data_dict['obj_locs'])) # O if 'obj_fts' in data_dict.keys(): data_dict['obj_fts'] = pad_tensors(data_dict['obj_fts'], lens=self.max_obj_len, pad=1.0).float() # O, 1024, 6 if 'obj_pcds_masks' in data_dict.keys(): data_dict['obj_pcds_masks'] = pad_tensors(data_dict['obj_pcds_masks'], lens=self.max_obj_len, pad=1.0).float() data_dict['obj_locs']= pad_tensors(data_dict['obj_locs'], lens=self.max_obj_len, pad=0.0).float() # O, 3 data_dict['obj_labels'] = pad_tensors(data_dict['obj_labels'], lens=self.max_obj_len, pad=-100).long() # O # mask object, 0 means mask object, 1 means keep object if 'obj_fts' in data_dict.keys(): obj_sem_masks = random_point_cloud(data_dict['obj_fts'], data_dict['obj_masks'], self.pc_mask_ratio) data_dict['obj_sem_masks'] = obj_sem_masks else: obj_sem_masks = [] for i in range(self.max_obj_len): if i >= len(data_dict['obj_locs']): obj_sem_masks.append(0) else: prob = random.random() if prob < self.pc_mask_ratio: obj_sem_masks.append(0) else: obj_sem_masks.append(1) data_dict['obj_sem_masks'] = torch.tensor(obj_sem_masks).long() if 'tgt_object_id' in data_dict.keys(): data_dict['tgt_object_id'] = data_dict['tgt_object_id'].long() # 1 or O # # Scene pcds # data_dict["scene_pcds"] = torch.from_numpy(data_dict["scene_pcds"]).float() key_list = [ 'txt_ids', 'txt_masks', 'masked_lm_labels', 'obj_masks', 'obj_fts', 'obj_locs', 'obj_labels', 'obj_sem_masks', 'tgt_object_id' ] if 'obj_fts' not in data_dict.keys(): key_list.remove('obj_fts') # key_list.remove('obj_sem_masks') if 'obj_pcds_masks' in data_dict.keys(): key_list.append('obj_pcds_masks') if 'scene_pcds' in data_dict.keys(): key_list.append('scene_pcds') if 'scene_txt_ids' in data_dict.keys(): key_list.append('scene_txt_ids') if 'scene_txt_masks' in data_dict.keys(): key_list.append('scene_txt_masks') data_dict = {k : v for k, v in data_dict.items() if k in key_list} return data_dict def collate_fn(self, batch_list): ret = default_collate(batch_list) return ret @DATASETWRAPPER_REGISTRY.register() class ScanFamilyDatasetWrapperOld(Dataset): def __init__(self, cfg, dataset, split="train"): # stokenizer, max_seq_length=80, max_obj_len=80 self.dataset = dataset self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True) self.max_seq_length = cfg.data.args.max_seq_len self.max_obj_len = cfg.data.args.max_obj_len self.use_voxel = cfg.data.args.get('use_voxel', None) if self.use_voxel: self.voxel_size = cfg.data.args.get('voxel_size', 0.02) self.use_scene_cap = cfg.data.args.get("use_scene_cap", False) self.max_scene_cap_len = cfg.data.args.get("max_scene_cap_len", self.max_seq_length) def __len__(self): return len(self.dataset) def pad_tensors(self, tensors, lens=None, pad=0): assert tensors.shape[0] <= lens if tensors.shape[0] == lens: return tensors shape = list(tensors.shape) shape[0] = lens - shape[0] res = torch.ones(shape, dtype=tensors.dtype) * pad res = torch.cat((tensors, res), dim=0) return res def __getitem__(self, idx): data_dict = self.dataset[idx] sentence = data_dict['sentence'] encoded_input = self.tokenizer(sentence, max_length=self.max_seq_length, add_special_tokens=True, truncation=True, padding='max_length', return_tensors="pt") # build txt data_dict['txt_ids'] = 
encoded_input['input_ids'].squeeze(0) # L data_dict['txt_masks'] = encoded_input['attention_mask'].squeeze(0) # L if self.use_scene_cap: scene_cap = data_dict["scene_cap"] + " " + sentence encoded_scene_cap = self.tokenizer(scene_cap, max_length=self.max_scene_cap_len, add_special_tokens=True, truncation=True, padding='max_length', return_tensors="pt") data_dict['scene_txt_ids'] = encoded_scene_cap['input_ids'].squeeze(0) # L data_dict['scene_txt_masks'] = encoded_scene_cap['attention_mask'].squeeze(0) # L # build object data_dict['obj_masks'] = (torch.arange(self.max_obj_len) < len(data_dict['obj_locs'])) # O if 'obj_fts' in data_dict.keys(): data_dict['obj_fts'] = self.pad_tensors(data_dict['obj_fts'], lens=self.max_obj_len, pad=1.0).float() # O, 1024, 6 if 'obj_pcds_masks' in data_dict.keys(): data_dict['obj_pcds_masks'] = pad_tensors(data_dict['obj_pcds_masks'], lens=self.max_obj_len, pad=1.0).float() data_dict['obj_locs']= self.pad_tensors(data_dict['obj_locs'], lens=self.max_obj_len, pad=0.0).float() # O, 3 data_dict['obj_boxes']= self.pad_tensors(data_dict['obj_boxes'], lens=self.max_obj_len, pad=0.0).float() # O, 3 data_dict['obj_labels'] = self.pad_tensors(data_dict['obj_labels'], lens=self.max_obj_len, pad=-100).long() # O # build sem mask, no mask data_dict['obj_sem_masks'] = (torch.arange(self.max_obj_len) < len(data_dict['obj_locs'])) # build label for refer data_dict['tgt_object_label'] = data_dict['tgt_object_label'].long() # 1 or C data_dict['tgt_object_id'] = data_dict['tgt_object_id'].long() # 1 or O if len(data_dict['tgt_object_id']) > 1: # O, pad to max objet length data_dict['tgt_object_id'] = self.pad_tensors(data_dict['tgt_object_id'].long(), lens=self.max_obj_len, pad=0).long() # O # build target if data_dict.get('tgt_object_id_iou25') is not None: data_dict['tgt_object_id_iou25'] = self.pad_tensors(data_dict['tgt_object_id_iou25'], lens=self.max_obj_len, pad=0).long() if data_dict.get('tgt_object_id_iou50') is not None: data_dict['tgt_object_id_iou50'] = self.pad_tensors(data_dict['tgt_object_id_iou50'], lens=self.max_obj_len, pad=0).long() # build label for qa if "answer_label" in data_dict: data_dict['answer_label'] = data_dict['answer_label'].long() # N, C return data_dict def collate_fn(self, batch_list): ret = default_collate(batch_list) return ret @DATASETWRAPPER_REGISTRY.register() class VisualizeDatasetWrapper(Dataset): def __init__(self, cfg, dataset, split="train"): self.dataset = dataset def __len__(self): return len(self.dataset) def __getitem__(self, idx): data_dict = self.dataset[idx] ret = {} ret['scene_pcds'] = data_dict['scene_pcds'] ret['item_id'] = data_dict['data_idx'] return ret def collate_fn(self, batch_list): ret = {} ret['scene_pcds'] = [b['scene_pcds'] for b in batch_list] ret['item_id'] = [b['item_id'] for b in batch_list] return ret ================================================ FILE: data/datasets/hm.py ================================================ import collections from ..build import DATASET_REGISTRY from .base import ScanBase @DATASET_REGISTRY.register() class HMPretrainObj(ScanBase): def __init__(self, cfg, split): super(HMPretrainObj, self).__init__(cfg, split) self.base_dir = cfg.data.hm_base self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False) if self.load_scene_pcds: self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None) assert self.max_pcd_num_points is not None self.bg_points_num = cfg.data.args.get('bg_points_num', 1000) self.scan_ids = sorted(list(self._load_split(self.split))) if 
cfg.debug.flag and cfg.debug.debug_size != -1:
            self.scan_ids = self.scan_ids[:cfg.debug.debug_size]
        print(f"Loading HM3D {split}-set scans")
        self.scan_data = self._load_scan(self.scan_ids)
        self.scan_ids = sorted(list(self.scan_data.keys()))
        print(f"Finish loading HM3D {split}-set scans of length {len(self.scan_ids)}")

    def __len__(self):
        return len(self.scan_ids)

    def __getitem__(self, index):
        """Data dict post-processing, for example, filtering, cropping,
        normalization, rotation, etc.

        Args:
            index (int): sample index
        """
        data_dict = self._getitem_obj_pretrain(index)
        dataset = 'hm3d'
        data_dict['source'] = dataset
        return data_dict


@DATASET_REGISTRY.register()
class HMSpatialRefer(ScanBase):
    def __init__(self, cfg, split):
        super(HMSpatialRefer, self).__init__(cfg, split)
        self.base_dir = cfg.data.hm_base
        self.max_obj_len = cfg.data.args.max_obj_len - 1
        self.filter_lang = cfg.data.args.filter_lang
        self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False)
        if self.load_scene_pcds:
            self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None)
            assert self.max_pcd_num_points is not None
            self.bg_points_num = cfg.data.args.get('bg_points_num', 1000)
        split_cfg = cfg.data.get(self.__class__.__name__).get(split)
        all_scan_ids = self._load_split(self.split)
        print(f"Loading HM3D {split}-set language")
        self.lang_data, self.scan_ids = self._load_lang(split_cfg, all_scan_ids)
        print(f"Finish loading HM3D {split}-set language of size {self.__len__()}")
        print(f"Loading HM3D {split}-set scans")
        self.scan_data = self._load_scan(self.scan_ids)
        print(f"Finish loading HM3D {split}-set scans")
        # build unique / multiple lookup
        for scan_id in self.scan_ids:
            inst_labels = self.scan_data[scan_id]['inst_labels']
            self.scan_data[scan_id]['label_count'] = collections.Counter(
                [l for l in inst_labels])
            self.scan_data[scan_id]['label_count_multi'] = collections.Counter(
                [self.label_converter.id_to_scannetid[l] for l in inst_labels])

    def __len__(self):
        return len(self.lang_data)

    def __getitem__(self, index):
        """Data dict post-processing, for example, filtering, cropping,
        normalization, rotation, etc.

        Args:
            index (int): sample index
        """
        data_dict = self._getitem_refer(index)
        return data_dict


================================================
FILE: data/datasets/multiscan.py
================================================
import collections

from ..build import DATASET_REGISTRY
from .base import ScanBase


@DATASET_REGISTRY.register()
class MultiScanPretrainObj(ScanBase):
    def __init__(self, cfg, split):
        super(MultiScanPretrainObj, self).__init__(cfg, split)
        self.base_dir = cfg.data.multiscan_base
        self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False)
        if self.load_scene_pcds:
            self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None)
            assert self.max_pcd_num_points is not None
            self.bg_points_num = cfg.data.args.get('bg_points_num', 1000)
        self.scan_ids = sorted(list(self._load_split(self.split)))
        if cfg.debug.flag and cfg.debug.debug_size != -1:
            self.scan_ids = self.scan_ids[:cfg.debug.debug_size]
        print(f"Loading MultiScan {split}-set scans")
        self.scan_data = self._load_scan(self.scan_ids)
        self.scan_ids = sorted(list(self.scan_data.keys()))
        print(f"Finish loading MultiScan {split}-set scans of length {len(self.scan_ids)}")

    def __len__(self):
        return len(self.scan_ids)

    def __getitem__(self, index):
        """Data dict post-processing, for example, filtering, cropping,
        normalization, rotation, etc.

        Args:
            index (int): sample index
        """
        data_dict = self._getitem_obj_pretrain(index)
        dataset = 'multiscan'
        data_dict['source'] = dataset
        return data_dict
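# ---------------------------------------------------------------------------
# Editorial sketch (not part of the original file): every *SpatialRefer
# dataset's __init__ (see MultiScanSpatialRefer below) precomputes two
# per-scan Counters that _getitem_refer uses to tag samples: is_multiple
# asks whether the target's ScanNet-mapped class occurs more than once in
# the scan, and is_hard whether the raw class has more than two instances,
# i.e. at least two same-class distractors. Equivalent standalone logic:
def _sketch_build_label_counters(inst_labels, id_to_scannetid):
    """Per-scan label statistics behind is_multiple / is_hard (sketch)."""
    label_count = collections.Counter(inst_labels)
    label_count_multi = collections.Counter(
        id_to_scannetid[l] for l in inst_labels)
    return label_count, label_count_multi
# ---------------------------------------------------------------------------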

================================================
FILE: data/datasets/multiscan.py
================================================
import collections

from ..build import DATASET_REGISTRY
from .base import ScanBase


@DATASET_REGISTRY.register()
class MultiScanPretrainObj(ScanBase):
    def __init__(self, cfg, split):
        super(MultiScanPretrainObj, self).__init__(cfg, split)
        self.base_dir = cfg.data.multiscan_base
        self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False)
        if self.load_scene_pcds:
            self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None)
            assert self.max_pcd_num_points is not None
            self.bg_points_num = cfg.data.args.get('bg_points_num', 1000)
        self.scan_ids = sorted(list(self._load_split(self.split)))
        if cfg.debug.flag and cfg.debug.debug_size != -1:
            self.scan_ids = self.scan_ids[:cfg.debug.debug_size]
        print(f"Loading MultiScan {split}-set scans")
        self.scan_data = self._load_scan(self.scan_ids)
        self.scan_ids = sorted(list(self.scan_data.keys()))
        print(f"Finish loading MultiScan {split}-set scans of length {len(self.scan_ids)}")

    def __len__(self):
        return len(self.scan_ids)

    def __getitem__(self, index):
        """Data dict post-processing, for example, filtering, crop, normalization, rotation, etc.

        Args:
            index (int): index of the scan to fetch.
        """
        data_dict = self._getitem_obj_pretrain(index)
        dataset = 'multiscan'
        data_dict['source'] = dataset
        return data_dict


@DATASET_REGISTRY.register()
class MultiScanSpatialRefer(ScanBase):
    def __init__(self, cfg, split):
        super(MultiScanSpatialRefer, self).__init__(cfg, split)
        self.base_dir = cfg.data.multiscan_base
        self.max_obj_len = cfg.data.args.max_obj_len - 1
        self.filter_lang = cfg.data.args.filter_lang
        self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False)
        if self.load_scene_pcds:
            self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None)
            assert self.max_pcd_num_points is not None
            self.bg_points_num = cfg.data.args.get('bg_points_num', 1000)
        split_cfg = cfg.data.get(self.__class__.__name__).get(split)
        all_scan_ids = self._load_split(self.split)
        print(f"Loading MultiScan {split}-set language")
        self.lang_data, self.scan_ids = self._load_lang(split_cfg, all_scan_ids)
        print(f"Finish loading MultiScan {split}-set language of size {self.__len__()}")
        print(f"Loading MultiScan {split}-set scans")
        self.scan_data = self._load_scan(self.scan_ids)
        print(f"Finish loading MultiScan {split}-set scans")
        # build unique multiple look up
        for scan_id in self.scan_ids:
            inst_labels = self.scan_data[scan_id]['inst_labels']
            self.scan_data[scan_id]['label_count'] = collections.Counter([l for l in inst_labels])
            self.scan_data[scan_id]['label_count_multi'] = collections.Counter(
                [self.label_converter.id_to_scannetid[l] for l in inst_labels])

    def __len__(self):
        return len(self.lang_data)

    def __getitem__(self, index):
        """Data dict post-processing, for example, filtering, crop, normalization, rotation, etc.

        Args:
            index (int): index of the referral item to fetch.
        """
        data_dict = self._getitem_refer(index)
        return data_dict
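# Minimal sketch of the "unique vs. multiple" lookup built in the SpatialRefer
# constructors (values are hypothetical, for illustration only): a Counter over
# a scan's instance labels tells whether a referred class occurs more than once.
from collections import Counter

inst_labels = [3, 7, 7, 12]         # hypothetical instance labels of one scan
label_count = Counter(inst_labels)  # Counter({7: 2, 3: 1, 12: 1})
is_multiple = label_count[7] > 1    # True: class 7 appears twice in the scan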

================================================
FILE: data/datasets/procthor.py
================================================
import collections

from ..build import DATASET_REGISTRY
from .base import ScanBase


@DATASET_REGISTRY.register()
class ProcThorPretrainObj(ScanBase):
    def __init__(self, cfg, split):
        super(ProcThorPretrainObj, self).__init__(cfg, split)
        self.base_dir = cfg.data.procthor_base
        self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False)
        if self.load_scene_pcds:
            self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None)
            assert self.max_pcd_num_points is not None
            self.bg_points_num = cfg.data.args.get('bg_points_num', 1000)
        self.scan_ids = sorted(list(self._load_split(self.split)))
        if cfg.debug.flag and cfg.debug.debug_size != -1:
            self.scan_ids = self.scan_ids[:cfg.debug.debug_size]
        print(f"Loading ProcThor {split}-set scans")
        self.scan_data = self._load_scan(self.scan_ids)
        self.scan_ids = sorted(list(self.scan_data.keys()))
        print(f"Finish loading ProcThor {split}-set scans of length {len(self.scan_ids)}")

    def __len__(self):
        return len(self.scan_ids)

    def __getitem__(self, index):
        """Data dict post-processing, for example, filtering, crop, normalization, rotation, etc.

        Args:
            index (int): index of the scan to fetch.
        """
        data_dict = self._getitem_obj_pretrain(index)
        dataset = 'procthor'
        data_dict['source'] = dataset
        return data_dict


@DATASET_REGISTRY.register()
class ProcThorSpatialRefer(ScanBase):
    def __init__(self, cfg, split):
        super(ProcThorSpatialRefer, self).__init__(cfg, split)
        self.base_dir = cfg.data.procthor_base
        self.max_obj_len = cfg.data.args.max_obj_len - 1
        self.filter_lang = cfg.data.args.filter_lang
        self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False)
        if self.load_scene_pcds:
            self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None)
            assert self.max_pcd_num_points is not None
            self.bg_points_num = cfg.data.args.get('bg_points_num', 1000)
        split_cfg = cfg.data.get(self.__class__.__name__).get(split)
        all_scan_ids = self._load_split(self.split)
        print(f"Loading ProcThor {split}-set language")
        self.lang_data, self.scan_ids = self._load_lang(split_cfg, all_scan_ids)
        print(f"Finish loading ProcThor {split}-set language of size {self.__len__()}")
        print(f"Loading ProcThor {split}-set scans")
        self.scan_data = self._load_scan(self.scan_ids)
        print(f"Finish loading ProcThor {split}-set scans")
        # build unique multiple look up
        for scan_id in self.scan_ids:
            inst_labels = self.scan_data[scan_id]['inst_labels']
            self.scan_data[scan_id]['label_count_multi'] = collections.Counter(
                [self.label_converter.id_to_scannetid[l] for l in inst_labels])
            self.scan_data[scan_id]['label_count'] = collections.Counter([l for l in inst_labels])

    def __len__(self):
        return len(self.lang_data)

    def __getitem__(self, index):
        """Data dict post-processing, for example, filtering, crop, normalization, rotation, etc.

        Args:
            index (int): index of the referral item to fetch.
        """
        data_dict = self._getitem_refer(index)
        return data_dict

================================================
FILE: data/datasets/rscan.py
================================================
import collections

from ..build import DATASET_REGISTRY
from .base import ScanBase


@DATASET_REGISTRY.register()
class RScanPretrainObj(ScanBase):
    def __init__(self, cfg, split):
        super(RScanPretrainObj, self).__init__(cfg, split)
        self.base_dir = cfg.data.rscan_base
        self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False)
        if self.load_scene_pcds:
            self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None)
            assert self.max_pcd_num_points is not None
            self.bg_points_num = cfg.data.args.get('bg_points_num', 1000)
        self.scan_ids = sorted(list(self._load_split(self.split)))
        if cfg.debug.flag and cfg.debug.debug_size != -1:
            self.scan_ids = self.scan_ids[:cfg.debug.debug_size]
        print(f"Loading 3RScan {split}-set scans")
        self.scan_data = self._load_scan(self.scan_ids)
        self.scan_ids = sorted(list(self.scan_data.keys()))
        print(f"Finish loading 3RScan {split}-set scans of length {len(self.scan_ids)}")

    def __len__(self):
        return len(self.scan_ids)

    def __getitem__(self, index):
        """Data dict post-processing, for example, filtering, crop, normalization, rotation, etc.

        Args:
            index (int): index of the scan to fetch.
        """
        data_dict = self._getitem_obj_pretrain(index)
        dataset = 'rscan'
        data_dict['source'] = dataset
        return data_dict


@DATASET_REGISTRY.register()
class RScanSpatialRefer(ScanBase):
    def __init__(self, cfg, split):
        super(RScanSpatialRefer, self).__init__(cfg, split)
        self.base_dir = cfg.data.rscan_base
        self.max_obj_len = cfg.data.args.max_obj_len - 1
        self.filter_lang = cfg.data.args.filter_lang
        self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False)
        if self.load_scene_pcds:
            self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None)
            assert self.max_pcd_num_points is not None
            self.bg_points_num = cfg.data.args.get('bg_points_num', 1000)
        split_cfg = cfg.data.get(self.__class__.__name__).get(split)
        all_scan_ids = self._load_split(self.split)
        print(f"Loading 3RScanSpatialRefer {split}-set language")
        self.lang_data, self.scan_ids = self._load_lang(split_cfg, all_scan_ids)
        print(f"Finish loading 3RScanSpatialRefer {split}-set language of size {self.__len__()}")
        print(f"Loading 3RScan {split}-set scans")
        self.scan_data = self._load_scan(self.scan_ids)
        print(f"Finish loading 3RScan {split}-set scans")
        # build unique multiple look up
        for scan_id in self.scan_ids:
            inst_labels = self.scan_data[scan_id]['inst_labels']
            self.scan_data[scan_id]['label_count'] = collections.Counter([l for l in inst_labels])
            self.scan_data[scan_id]['label_count_multi'] = collections.Counter(
                [self.label_converter.id_to_scannetid[l] for l in inst_labels])

    def __len__(self):
        return len(self.lang_data)

    def __getitem__(self, index):
        """Data dict post-processing, for example, filtering, crop, normalization, rotation, etc.

        Args:
            index (int): index of the referral item to fetch.
        """
        data_dict = self._getitem_refer(index)
        return data_dict

================================================
FILE: data/datasets/scannet.py
================================================
import os
import collections

from ..build import DATASET_REGISTRY
from .base import ScanBase


@DATASET_REGISTRY.register()
class ScanNetPretrainObj(ScanBase):
    def __init__(self, cfg, split):
        super(ScanNetPretrainObj, self).__init__(cfg, split)
        self.base_dir = cfg.data.scan_family_base
        self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False)
        if self.load_scene_pcds:
            self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None)
            assert self.max_pcd_num_points is not None
            self.bg_points_num = cfg.data.args.get('bg_points_num', 1000)
        self.scan_ids = sorted(list(self._load_split(self.split)))
        if cfg.debug.flag and cfg.debug.debug_size != -1:
            self.scan_ids = self.scan_ids[:cfg.debug.debug_size]
        print(f"Loading ScanNet {split}-set scans")
        self.scan_data = self._load_scan(self.scan_ids)
        self.scan_ids = sorted(list(self.scan_data.keys()))
        print(f"Finish loading ScanNet {split}-set scans of length {len(self.scan_ids)}")

    def __len__(self):
        return len(self.scan_ids)

    def __getitem__(self, index):
        """Data dict post-processing, for example, filtering, crop, normalization, rotation, etc.

        Args:
            index (int): index of the scan to fetch.
        """
        data_dict = self._getitem_obj_pretrain(index)
        dataset = 'scannet'
        data_dict['source'] = dataset
        return data_dict


@DATASET_REGISTRY.register()
class ScanNetSpatialRefer(ScanBase):
    def __init__(self, cfg, split):
        super(ScanNetSpatialRefer, self).__init__(cfg, split)
        self.base_dir = cfg.data.scan_family_base
        self.max_obj_len = cfg.data.args.max_obj_len - 1
        self.filter_lang = cfg.data.args.filter_lang
        self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False)
        if self.load_scene_pcds:
            self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None)
            assert self.max_pcd_num_points is not None
            self.bg_points_num = cfg.data.args.get('bg_points_num', 1000)
        split_cfg = cfg.data.get(self.__class__.__name__).get(split)
        all_scan_ids = self._load_split(self.split)
        print(f"Loading ScanNetSpatialRefer {split}-set language")
        self.lang_data, self.scan_ids = self._load_lang(split_cfg, all_scan_ids)
        print(f"Finish loading ScanNetSpatialRefer {split}-set language of size {self.__len__()}")
        print(f"Loading ScanNet {split}-set scans")
        self.scan_data = self._load_scan(self.scan_ids)
        print(f"Finish loading ScanNet {split}-set scans")
        # build unique multiple look up
        for scan_id in self.scan_ids:
            inst_labels = self.scan_data[scan_id]['inst_labels']
            self.scan_data[scan_id]['label_count'] = collections.Counter([l for l in inst_labels])
            self.scan_data[scan_id]['label_count_multi'] = collections.Counter(
                [self.label_converter.id_to_scannetid[l] for l in inst_labels])

    def __len__(self):
        return len(self.lang_data)

    def __getitem__(self, index):
        """Data dict post-processing, for example, filtering, crop, normalization, rotation, etc.

        Args:
            index (int): index of the referral item to fetch.
        """
        data_dict = self._getitem_refer(index)
        return data_dict


================================================
FILE: data/datasets/scannet_base.py
================================================
import os
import collections
import json
import pickle
import random

import jsonlines
from tqdm import tqdm
from scipy import sparse
import numpy as np
import torch
from torch.utils.data import Dataset

from common.misc import rgetattr
from ..data_utils import convert_pc_to_box, LabelConverter, build_rotate_mat, load_matrix_from_txt, \
    construct_bbox_corners, eval_ref_one_sample
import einops

SCAN_DATA = {}


class ScanNetBase(Dataset):
    def __init__(self, cfg, split):
        self.cfg = cfg
        self.split = split
        self.base_dir = cfg.data.scan_family_base
        assert self.split in ['train', 'val', 'test']
        self.int2cat = json.load(open(os.path.join(self.base_dir,
                                                   "annotations/meta_data/scannetv2_raw_categories.json"),
                                      'r', encoding="utf-8"))
        self.cat2int = {w: i for i, w in enumerate(self.int2cat)}
        self.label_converter = LabelConverter(os.path.join(self.base_dir,
                                                           "annotations/meta_data/scannetv2-labels.combined.tsv"))
        self.rot_matrix = build_rotate_mat(self.split)
        self.use_cache = rgetattr(self.cfg.data, 'mvdatasettings.use_cache', False)
        self.cache = {}

    def __len__(self):
        return len(self.lang_data)

    def __getitem__(self, index):
        raise NotImplementedError

    def _load_one_scan(self, scan_id, pc_type='gt', load_inst_info=False, load_multiview_info=False,
                       load_mask3d_voxel=False, load_pc_info=True, load_segment_info=False,
                       load_offline_segment_voxel=False, load_offline_segment_image=False,
                       load_offline_segment_point=False, load_nocls=False):
        one_scan = {}
        if load_inst_info:
            inst_labels, inst_locs, inst_colors = self._load_inst_info(scan_id)
            one_scan['inst_labels'] = inst_labels  # (n_obj, )
            one_scan['inst_locs'] = inst_locs  # (n_obj, 6)
center xyz, whl one_scan['inst_colors'] = inst_colors # (n_obj, 3x4) cluster * (weight, mean rgb) if load_pc_info: # load pcd data pcd_data = torch.load(os.path.join(self.base_dir, "scan_data", "pcd_with_global_alignment", f'{scan_id}.pth')) points, colors, instance_labels = pcd_data[0], pcd_data[1], pcd_data[-1] colors = colors / 127.5 - 1 pcds = np.concatenate([points, colors], 1) # convert to gt object if load_inst_info: obj_pcds = [] bg_indices = np.full((points.shape[0], ), 1, dtype=np.bool_) for i in range(instance_labels.max() + 1): mask = instance_labels == i # time consuming obj_pcds.append(pcds[mask]) if not self.int2cat[inst_labels[i]] in ['wall', 'floor', 'ceiling']: bg_indices[mask] = False one_scan['obj_pcds'] = obj_pcds # assert sum([len(obj_pcd) for obj_pcd in obj_pcds]) + bg_indices.sum() == points.shape[0] one_scan['bg_pcds'] = pcds[bg_indices] # calculate box for matching obj_center = [] obj_box_size = [] for obj_pcd in obj_pcds: _c, _b = convert_pc_to_box(obj_pcd) obj_center.append(_c) obj_box_size.append(_b) one_scan['obj_center'] = obj_center one_scan['obj_box_size'] = obj_box_size obj_mask_path = os.path.join(self.base_dir, "mask", str(scan_id) + ".mask" + ".npz") if os.path.exists(obj_mask_path): obj_label_path = os.path.join(self.base_dir, "mask", str(scan_id) + ".label" + ".npy") obj_pcds = [] obj_mask = np.array(sparse.load_npz(obj_mask_path).todense())[:50, :] obj_labels = np.load(obj_label_path)[:50] obj_l = [] bg_indices = np.full((pcds.shape[0], ), 1, dtype=np.bool_) for i in range(obj_mask.shape[0]): mask = obj_mask[i] if pcds[mask == 1, :].shape[0] > 0: obj_pcds.append(pcds[mask == 1, :]) obj_l.append(obj_labels[i]) # if not self.int2cat[obj_labels[i]] in ['wall', 'floor', 'ceiling']: bg_indices[mask == 1] = False one_scan['obj_pcds_pred'] = obj_pcds one_scan['inst_labels_pred'] = obj_l one_scan['bg_pcds_pred'] = pcds[bg_indices] # calculate box for pred obj_center_pred = [] obj_box_size_pred = [] for obj_pcd in obj_pcds: _c, _b = convert_pc_to_box(obj_pcd) obj_center_pred.append(_c) obj_box_size_pred.append(_b) one_scan['obj_center_pred'] = obj_center_pred one_scan['obj_box_size_pred'] = obj_box_size_pred if load_multiview_info: one_scan['multiview_info'] = self._load_multiview_info(scan_id) if load_mask3d_voxel: one_scan['mask3d_voxel'] = self._load_mask3d_voxel(scan_id) # load segment for mask3d if load_segment_info: one_scan["scene_pcds"] = np.load(os.path.join(self.base_dir, "scan_data", "pcd_mask3d", f'{scan_id[-7:]}.npy')) # load offline feature if load_offline_segment_voxel: if load_nocls: one_scan['offline_segment_voxel'] = torch.load(os.path.join(self.base_dir, "scan_data", "mask3d_voxel_feature_nocls", f'{scan_id}.pth')) else: one_scan['offline_segment_voxel'] = torch.load(os.path.join(self.base_dir, "scan_data", "mask3d_voxel_feature", f'{scan_id}.pth')) if load_offline_segment_image: one_scan['offline_segment_image'] = torch.load(os.path.join(self.base_dir, "scan_data", "mask3d_image_feature", f'{scan_id}.pth')) return (scan_id, one_scan) def _load_scannet(self, scan_ids, pc_type = 'gt', load_inst_info = False, load_multiview_info = False, load_mask3d_voxel = False, load_pc_info = True, load_segment_info = False, load_offline_segment_voxel=False, load_offline_segment_image=False, load_offline_segment_point=False, load_nocls=False, process_num = 1): unloaded_scan_ids = [scan_id for scan_id in scan_ids if scan_id not in SCAN_DATA] print(f"Loading scans: {len(unloaded_scan_ids)} / {len(scan_ids)}") scans = {} if process_num > 1: from joblib import 
Parallel, delayed res_all = Parallel(n_jobs=process_num)( delayed(self._load_one_scan)(scan_id, pc_type = pc_type, load_inst_info = load_inst_info, load_multiview_info = load_multiview_info, load_mask3d_voxel = load_mask3d_voxel, load_offline_segment_voxel=load_offline_segment_voxel, load_offline_segment_image=load_offline_segment_image, load_offline_segment_point=load_offline_segment_point, load_nocls=load_nocls, load_pc_info = load_pc_info, load_segment_info=load_segment_info) for scan_id in tqdm(unloaded_scan_ids)) for scan_id, one_scan in tqdm(res_all): scans[scan_id] = one_scan else: for scan_id in tqdm(unloaded_scan_ids): _, one_scan = self._load_one_scan(scan_id, pc_type = pc_type, load_inst_info = load_inst_info, load_multiview_info = load_multiview_info, load_mask3d_voxel = load_mask3d_voxel, load_pc_info = load_pc_info, load_segment_info=load_segment_info, load_offline_segment_voxel=load_offline_segment_voxel, load_offline_segment_image=load_offline_segment_image, load_offline_segment_point=load_offline_segment_point, load_nocls=load_nocls) scans[scan_id] = one_scan SCAN_DATA.update(scans) scans = {scan_id: SCAN_DATA[scan_id] for scan_id in scan_ids} return scans def _load_lang(self, cfg): caption_source = cfg.sources lang_data = [] if caption_source: if 'scanrefer' in caption_source: anno_file = os.path.join(self.base_dir, 'annotations/refer/scanrefer.jsonl') with jsonlines.open(anno_file, 'r') as _f: for item in _f: if item['scan_id'] in self.scannet_scan_ids: lang_data.append(('scannet', item['scan_id'], item['utterance'])) if 'referit3d' in caption_source: for anno_type in cfg.referit3d.anno_type: anno_file = os.path.join(self.base_dir, f'annotations/refer/{anno_type}.jsonl') with jsonlines.open(anno_file, 'r') as _f: for item in _f: if item['scan_id'] in self.scannet_scan_ids: lang_data.append(('scannet', item['scan_id'], item['utterance'])) if 'scanqa' in caption_source: anno_file_list = ['annotations/qa/ScanQA_v1.0_train.json', 'annotations/qa/ScanQA_v1.0_val.json'] for anno_file in anno_file_list: anno_file = os.path.join(self.base_dir, anno_file) json_data = json.load(open(anno_file, 'r', encoding='utf-8')) for item in json_data: if item['scene_id'] in self.scannet_scan_ids: for i in range(len(item['answers'])): lang_data.append(('scannet', item['scene_id'], item['question'] + " " + item['answers'][i])) if 'sgrefer' in caption_source: for anno_type in cfg.sgrefer.anno_type: anno_file = os.path.join(self.base_dir, f'annotations/refer/ssg_ref_{anno_type}.json') json_data = json.load(open(anno_file, 'r', encoding='utf-8')) for item in json_data: if item['scan_id'] in self.scannet_scan_ids: lang_data.append(('scannet', item['scan_id'], item['utterance'])) if 'sgcaption' in caption_source: for anno_type in cfg.sgcaption.anno_type: anno_file = os.path.join(self.base_dir, f'annotations/refer/ssg_caption_{anno_type}.json') json_data = json.load(open(anno_file, 'r', encoding='utf-8')) for item in json_data: if item['scan_id'] in self.scannet_scan_ids: lang_data.append(('scannet', item['scan_id'], item['utterance'])) return lang_data def _load_split(self, cfg, split, use_multi_process = False): if use_multi_process and split in ['train']: split_file = os.path.join(self.base_dir, 'annotations/splits/scannetv2_'+ split + "_sort.json") with open(split_file, 'r') as f: scannet_scan_ids = json.load(f) else: split_file = os.path.join(self.base_dir, 'annotations/splits/scannetv2_'+ split + ".txt") scannet_scan_ids = {x.strip() for x in open(split_file, 'r', encoding="utf-8")} 
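# Assumed layout of the split files parsed above (illustrative, inferred from
# the parsing code): the '.txt' variant holds one scan id per line, e.g.
#   scene0000_00
#   scene0707_00
# while the '_sort.json' variant holds the ids as a JSON list and is used when
# use_multi_process is enabled for the train split.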
scannet_scan_ids = sorted(scannet_scan_ids) if cfg.debug.flag and cfg.debug.debug_size != -1: scannet_scan_ids = list(scannet_scan_ids)[:cfg.debug.debug_size] return scannet_scan_ids def _load_inst_info(self, scan_id): # inst_labels = json.load(open(os.path.join(self.base_dir, 'scan_data', # 'instance_id_to_label', # f'{scan_id}.json'), encoding="utf-8")) inst_labels = torch.load(os.path.join(self.base_dir, 'scan_data', 'instance_id_to_label', f'{scan_id}.pth')) inst_labels = [self.cat2int[i] for i in inst_labels.values()] inst_loc_path = os.path.join(self.base_dir, 'scan_data', 'instance_id_to_loc', f'{scan_id}.npy') if os.path.exists(inst_loc_path): inst_locs = np.load(inst_loc_path) else: inst_locs = None inst_color_path = os.path.join(self.base_dir, 'scan_data', 'instance_id_to_gmm_color', f'{scan_id}.json') if os.path.exists(inst_color_path): inst_colors = json.load(open(inst_color_path, encoding="utf-8")) inst_colors = [np.concatenate( [np.array(x['weights'])[:, None], np.array(x['means'])], axis=1).astype(np.float32) for x in inst_colors] else: inst_colors = None return inst_labels, inst_locs, inst_colors def _obj_processing_post(self, obj_pcds, obj_labels, is_need_bbox=False, rot_aug=True): # rotate obj rot_matrix = build_rotate_mat(self.split, rot_aug) # normalize pc and calculate location obj_fts = [] obj_locs = [] obj_boxes = [] for obj_pcd in obj_pcds: # build locs if rot_matrix is not None: obj_pcd[:, :3] = np.matmul(obj_pcd[:, :3], rot_matrix.transpose()) obj_center = obj_pcd[:, :3].mean(0) obj_size = obj_pcd[:, :3].max(0) - obj_pcd[:, :3].min(0) obj_locs.append(np.concatenate([obj_center, obj_size], 0)) # build box if is_need_bbox: obj_box_center = (obj_pcd[:, :3].max(0) + obj_pcd[:, :3].min(0)) / 2 obj_box_size = obj_pcd[:, :3].max(0) - obj_pcd[:, :3].min(0) obj_boxes.append(np.concatenate([obj_box_center, obj_box_size], 0)) # subsample pcd_idxs = np.random.choice(len(obj_pcd), size=self.num_points, replace=len(obj_pcd) < self.num_points) obj_pcd = obj_pcd[pcd_idxs] # normalize obj_pcd[:, :3] = obj_pcd[:, :3] - obj_pcd[:, :3].mean(0) max_dist = np.max(np.sqrt(np.sum(obj_pcd[:, :3]**2, 1))) if max_dist < 1e-6: # take care of tiny point-clouds, i.e., padding max_dist = 1 obj_pcd[:, :3] = obj_pcd[:, :3] / max_dist obj_fts.append(obj_pcd) # convert to torch obj_fts = torch.from_numpy(np.stack(obj_fts, 0)) obj_locs = torch.from_numpy(np.array(obj_locs)) obj_boxes = torch.from_numpy(np.array(obj_boxes)) obj_labels = torch.LongTensor(obj_labels) assert obj_labels.shape[0] == obj_locs.shape[0] assert obj_fts.shape[0] == obj_locs.shape[0] return obj_fts, obj_locs, obj_boxes, obj_labels def _obj_processing_aug(self, obj_pcds, obj_labels, is_need_bbox=False): # augment objects if self.augmentor: data_dict = self.augmentor.forward({'obj_pcds': obj_pcds, 'num_points': self.num_points}) obj_pcds = data_dict['obj_pcds'] if isinstance(obj_pcds, list): obj_pcds = torch.Tensor(np.array(obj_pcds)) obj_sizes = torch.Tensor(np.array(data_dict['obj_sizes'])) xyz = obj_pcds[:, :, :3] center = xyz.mean(1) xyz_min = xyz.min(1).values xyz_max = xyz.max(1).values box_center = (xyz_min + xyz_max) / 2 size = torch.Tensor(obj_sizes) # size = xyz_max - xyz_min obj_locs = torch.cat([center, size], dim=1) obj_boxes = torch.cat([box_center, size], dim=1) # centering obj_pcds[:, :, :3].sub_(obj_pcds[:, :, :3].mean(1, keepdim=True)) # normalization max_dist = (obj_pcds[:, :, :3]**2).sum(2).sqrt().max(1).values max_dist.clamp_(min=1e-6) obj_pcds[:, :, :3].div_(max_dist[:, None, None]) # convert to torch 
obj_labels = torch.LongTensor(obj_labels) assert obj_labels.shape[0] == obj_locs.shape[0] return obj_pcds, obj_locs, obj_boxes, obj_labels def _scene_processing_aug(self, obj_pcds, bg_pcds, obj_labels, is_need_bbox=False): obj_len = len(obj_pcds) # sample background points fg_points_num = len(obj_pcds) * self.num_points assert fg_points_num < self.max_pcd_num_points bg_points_num = min(self.max_pcd_num_points - fg_points_num, self.bg_points_num) assert len(bg_pcds) > 0 assert bg_points_num > 0 bg_points_indices = np.random.choice(len(bg_pcds), size=bg_points_num, replace=len(bg_pcds) < bg_points_num) bg_pcds = bg_pcds[bg_points_indices] # augment objects if self.augmentor: data_dict = self.augmentor.forward({'obj_pcds': obj_pcds, 'bg_pcds': torch.Tensor(bg_pcds), 'num_points': self.num_points}) obj_pcds = data_dict['obj_pcds'] if isinstance(obj_pcds, list): obj_pcds = torch.Tensor(np.array(obj_pcds)) obj_sizes = torch.Tensor(np.array(data_dict['obj_sizes'])) bg_pcds = data_dict['bg_pcds'] assert len(obj_pcds) * obj_pcds[0].shape[0] == fg_points_num scene_pcds = np.vstack([np.array(obj_pcds.reshape(-1, 6)), np.array(bg_pcds)]) xyz = obj_pcds[:, :, :3] center = xyz.mean(1) xyz_min = xyz.min(1).values xyz_max = xyz.max(1).values box_center = (xyz_min + xyz_max) / 2 size = torch.Tensor(obj_sizes) # size = xyz_max - xyz_min obj_locs = torch.cat([center, size], dim=1) obj_boxes = torch.cat([box_center, size], dim=1) # centering obj_pcds[:, :, :3].sub_(obj_pcds[:, :, :3].mean(1, keepdim=True)) # normalization max_dist = (obj_pcds[:, :, :3]**2).sum(2).sqrt().max(1).values max_dist.clamp_(min=1e-6) obj_pcds[:, :, :3].div_(max_dist[:, None, None]) # generate obj point indices masks obj_pcds_masks = [] offset = 0 for _j in range(obj_len): mask = np.arange(self.num_points) + offset assert len(mask) == len(obj_pcds[_j]) obj_pcds_masks.append(mask) offset += self.num_points # convert to torch obj_labels = torch.LongTensor(obj_labels) obj_pcds_masks = torch.from_numpy(np.array(obj_pcds_masks)) assert obj_labels.shape[0] == obj_locs.shape[0] assert obj_pcds_masks.shape[0] == obj_locs.shape[0] return obj_locs, obj_boxes, obj_labels, obj_pcds_masks, scene_pcds def _get_pooling_obj_feature(self, args, mv_info_all, sampled_frame_names, scan_id): obj_dict = {} for i in range(len(sampled_frame_names)): frame_info = mv_info_all[sampled_frame_names[i]] inst_all = [x for x in frame_info['instance_info'] if x['is_need_process']] for one_inst in inst_all: tmp_inst_id = one_inst['org_inst_id'] feat = one_inst[args.inst_feat_type] feat = feat[0] if len(feat) == 1 else feat inst_id = self.label_converter.orgInstID_to_id[tmp_inst_id] if inst_id in obj_dict.keys(): obj_dict[inst_id]['feat'].append(feat) assert self.scan_data[scan_id]['inst_labels'][inst_id] == obj_dict[inst_id]['label'] else: obj_pcd = self.scan_data[scan_id]['obj_pcds'][inst_id] if self.rot_matrix is not None: obj_pcd[:, :3] = np.matmul(obj_pcd[:, :3], self.rot_matrix.transpose()) obj_center = obj_pcd[:, :3].mean(0) obj_size = obj_pcd[:, :3].max(0) - obj_pcd[:, :3].min(0) obj_loc = np.concatenate([obj_center, obj_size], 0) obj_box_center = (obj_pcd[:, :3].max(0) + obj_pcd[:, :3].min(0)) / 2 obj_box_size = obj_pcd[:, :3].max(0) - obj_pcd[:, :3].min(0) obj_box = np.concatenate([obj_box_center, obj_box_size], 0) obj_dict[inst_id] = { 'feat': [feat], 'location': obj_loc, 'label': self.scan_data[scan_id]['inst_labels'][inst_id], 'box' : obj_box, } if args.pooling_strategy == 'average_all': for key in obj_dict.keys(): feat_all = 
np.array(obj_dict[key]['feat']) if args.pooling_strategy == 'average_all': obj_dict[key]['feat'] = np.mean(feat_all, axis = 0) return obj_dict def init_dataset_params(self, dataset_cfg): if dataset_cfg is None: dataset_cfg = {} self.pc_type = dataset_cfg.get('pc_type', 'gt') self.sem_type = dataset_cfg.get('sem_type', '607') self.max_obj_len = dataset_cfg.get('max_obj_len', 80) self.num_points = dataset_cfg.get('num_points', 1024) self.filter_lang = dataset_cfg.get('filter_lang', False) self.rot_aug = dataset_cfg.get('rot_aug', True) self.train_duplicate = dataset_cfg.get('train_duplicate', 1) self.load_multiview_info = self.cfg.data.get('load_multiview_info', False) self.load_mask3d_voxel = self.cfg.data.get('load_mask3d_voxel', False) self.process_num = self.cfg.data.get('process_num', 20) assert self.pc_type in ['gt', 'pred'] assert self.sem_type in ['607'] def init_scan_data(self): self.scan_data = self._load_scannet(self.scan_ids, self.pc_type, load_inst_info=self.split!='test', load_multiview_info = self.load_multiview_info, load_mask3d_voxel = self.load_mask3d_voxel, process_num = self.process_num ) # build unique multiple look up for scan_id in self.scan_ids: inst_labels = self.scan_data[scan_id]['inst_labels'] self.scan_data[scan_id]['label_count'] = collections.Counter( [self.label_converter.id_to_scannetid[l] for l in inst_labels]) def get_scene(self, scan_id, tgt_object_id_list, tgt_object_name_list, sentence): if not isinstance(tgt_object_id_list, list): tgt_object_id_list = [tgt_object_id_list] if not isinstance(tgt_object_name_list, list): tgt_object_name_list = [tgt_object_name_list] tgt_obj_boxes = [np.concatenate(convert_pc_to_box(self.scan_data[scan_id]["obj_pcds"][i])) for i in tgt_object_id_list] # load pcds and labels if self.pc_type == 'gt': obj_pcds = self.scan_data[scan_id]['obj_pcds'] # N, 6 obj_labels = self.scan_data[scan_id]['inst_labels'] # N elif self.pc_type == 'pred': obj_pcds = self.scan_data[scan_id]['obj_pcds_pred'] obj_labels = self.scan_data[scan_id]['inst_labels_pred'] # get obj labels by matching gt_obj_labels = self.scan_data[scan_id]['inst_labels'] # N obj_center = self.scan_data[scan_id]['obj_center'] obj_box_size = self.scan_data[scan_id]['obj_box_size'] obj_center_pred = self.scan_data[scan_id]['obj_center_pred'] obj_box_size_pred = self.scan_data[scan_id]['obj_box_size_pred'] for i, _ in enumerate(obj_center_pred): for j, _ in enumerate(obj_center): if eval_ref_one_sample(construct_bbox_corners(obj_center[j], obj_box_size[j]), construct_bbox_corners(obj_center_pred[i], obj_box_size_pred[i])) >= 0.25: obj_labels[i] = gt_obj_labels[j] break # filter out background or language # do not filter for predicted labels, because these labels are not accurate excluded_labels = ['wall', 'floor', 'ceiling'] def keep_obj(i, obj_label): if self.pc_type != 'gt' or i in tgt_object_id_list: return True category = self.int2cat[obj_label] if category in excluded_labels: return False if self.filter_lang and category not in sentence: return False return True selected_obj_idxs = [i for i, obj_label in enumerate(obj_labels) if keep_obj(i, obj_label)] # build tgt object id and box if self.pc_type == 'gt': tgt_object_label_list = [obj_labels[x] for x in tgt_object_id_list] tgt_object_id_iou25_list = tgt_object_id_list tgt_object_id_iou50_list = tgt_object_id_list # for i, _ in enumerate(tgt_object_label_list): # assert self.int2cat[tgt_object_label_list[i]] == tgt_object_name_list[i] elif self.pc_type == 'pred': tgt_object_label_list = [self.cat2int[x] for x in 
tgt_object_name_list] tgt_object_id_list_matched = [] tgt_object_id_iou25_list = [] tgt_object_id_iou50_list = [] for cur_id in tgt_object_id_list: gt_pcd = self.scan_data[scan_id]["obj_pcds"][cur_id] gt_center, gt_box_size = convert_pc_to_box(gt_pcd) max_iou = -1 for i in selected_obj_idxs: obj_center, obj_box_size = convert_pc_to_box(obj_pcds[i]) iou = eval_ref_one_sample(construct_bbox_corners(obj_center, obj_box_size), construct_bbox_corners(gt_center, gt_box_size)) if iou > max_iou: max_iou = iou tgt_object_id_matched = i # find tgt iou 25 if iou >= 0.25: tgt_object_id_iou25_list.append(i) # find tgt iou 50 if iou >= 0.5: tgt_object_id_iou50_list.append(i) tgt_object_id_list_matched.append(tgt_object_id_matched) tgt_object_id_list = tgt_object_id_list_matched tgt_object_id_list = list(set(tgt_object_id_list)) tgt_object_id_iou25_list = list(set(tgt_object_id_iou25_list)) tgt_object_id_iou50_list = list(set(tgt_object_id_iou50_list)) # crop objects to max_obj_len if self.max_obj_len < len(selected_obj_idxs): pre_selected_obj_idxs = selected_obj_idxs # select target first if len(tgt_object_id_list) > 0: selected_obj_idxs = tgt_object_id_list[:] selected_obj_idxs.extend(tgt_object_id_iou25_list) selected_obj_idxs.extend(tgt_object_id_iou50_list) selected_obj_idxs = list(set(selected_obj_idxs)) # select object with same semantic class with tgt_object remained_obj_idx = [] for i in pre_selected_obj_idxs: label = obj_labels[i] if i not in selected_obj_idxs: if label in tgt_object_label_list: selected_obj_idxs.append(i) else: remained_obj_idx.append(i) if len(selected_obj_idxs) >= self.max_obj_len: break if len(selected_obj_idxs) < self.max_obj_len: random.shuffle(remained_obj_idx) selected_obj_idxs += remained_obj_idx[:(self.max_obj_len - len(selected_obj_idxs))] # assert len(selected_obj_idxs) == self.max_obj_len # reorganize ids tgt_object_id_list = [selected_obj_idxs.index(id) for id in tgt_object_id_list] tgt_object_id_iou25_list = [selected_obj_idxs.index(id) for id in tgt_object_id_iou25_list] tgt_object_id_iou50_list = [selected_obj_idxs.index(id) for id in tgt_object_id_iou50_list] # build unique multiple is_multiple = sum([self.scan_data[scan_id]['label_count'][self.label_converter.id_to_scannetid[x]] for x in tgt_object_label_list]) > 1 obj_pcds = [obj_pcds[id] for id in selected_obj_idxs] obj_labels = [obj_labels[id] for id in selected_obj_idxs] obj_fts, obj_locs, obj_boxes, obj_labels = self._obj_processing_post(obj_pcds, obj_labels, is_need_bbox=True, rot_aug=self.rot_aug) data_dict = { "scan_id": scan_id, "tgt_object_id": torch.LongTensor(tgt_object_id_list), "tgt_object_label": torch.LongTensor(tgt_object_label_list), "tgt_obj_boxes": tgt_obj_boxes, # only use it for evaluation, because it is w/o augmentation. 
"obj_fts": obj_fts, "obj_locs": obj_locs, "obj_labels": obj_labels, "obj_boxes": obj_boxes, "tgt_object_id_iou25": torch.LongTensor(tgt_object_id_iou25_list), "tgt_object_id_iou50": torch.LongTensor(tgt_object_id_iou50_list), 'is_multiple': is_multiple } if 'multiview_info' in self.scan_data[scan_id]: mv_out_dict = self._get_multiview_info(scan_id) obj_mv_fts = [mv_out_dict[oid]['feat'] if oid in mv_out_dict else np.zeros_like(next(iter(mv_out_dict.values()))['feat']) for oid in selected_obj_idxs] data_dict['obj_mv_fts'] = torch.from_numpy(np.array(obj_mv_fts)).float() if 'mask3d_voxel' in self.scan_data[scan_id]: voxel_out_dict = self.scan_data[scan_id]['mask3d_voxel'] obj_voxel_fts = [voxel_out_dict[id] if id in voxel_out_dict else np.zeros_like(next(iter(voxel_out_dict.values()))) for id in selected_obj_idxs] data_dict['obj_voxel_fts'] = torch.from_numpy(np.array(obj_voxel_fts)).float() return data_dict ================================================ FILE: data/datasets/scannet_old.py ================================================ import os import collections import json import random import jsonlines from tqdm import tqdm import numpy as np import albumentations as A import volumentations as V import torch from torch.utils.data import Dataset from pathlib import Path from copy import deepcopy from ..build import DATASET_REGISTRY from ..data_utils import convert_pc_to_box, ScanQAAnswer, SQA3DAnswer, construct_bbox_corners, \ eval_ref_one_sample, is_explicitly_view_dependent, get_sqa_question_type from .scannet_base import ScanNetBase @DATASET_REGISTRY.register() class ScanNetSQA3D(ScanNetBase): r""" questions json file: dict_keys(['info', 'license', 'data_type', 'data_subtype', 'task_type', 'questions']) 'questions': List 'questions'[0]: { 'scene_id': 'scene0050_00', 'situation': 'I am standing by the ottoman on my right facing a couple of toolboxes.', 'alternative_situation': [ 'I just placed two backpacks on the ottoman on my right side before I went to play the piano in front of me to the right.', 'I stood up from the ottoman and walked over to the piano ahead of me.' 
], 'question': 'What instrument in front of me is ebony and ivory?', 'question_id': 220602000002 } annotations json file: dict_keys(['info', 'license', 'data_type', 'data_subtype', 'annotations']) 'annotations': List 'annotations'[0]: { 'scene_id': 'scene0050_00', 'question_type': 'N/A', 'answer_type': 'other', 'question_id': 220602000002, 'answers': [{'answer': 'piano', 'answer_confidence': 'yes', 'answer_id': 1}], 'rotation': {'_x': 0, '_y': 0, '_z': -0.9995736030415032, '_w': -0.02919952230128897}, 'position': {'x': 0.7110268899979686, 'y': -0.03219739162793617, 'z': 0} } """ def __init__(self, cfg, split): super().__init__(cfg, split) self.pc_type = cfg.data.args.pc_type self.sem_type = cfg.data.args.sem_type self.max_obj_len = cfg.data.args.max_obj_len - 1 self.num_points = cfg.data.args.num_points self.filter_lang = cfg.data.args.filter_lang self.rot_aug = cfg.data.args.rot_aug self.use_unanswer = cfg.data.get(self.__class__.__name__).get(split).use_unanswer assert self.pc_type in ['gt', 'pred'] assert self.sem_type in ['607'] assert self.split in ['train', 'val', 'test'] if self.split == 'train': self.pc_type = 'gt' # use test set for validation elif self.split == 'val': self.split = 'test' print(f"Loading ScanNet SQA3D {split}-set language") # build answer self.num_answers, self.answer_vocab, self.answer_cands = self.build_answer() # load annotations lang_data, self.scan_ids, self.scan_to_item_idxs = self._load_lang() if cfg.debug.flag: self.lang_data = [] self.scan_ids = sorted(list(self.scan_ids))[:cfg.debug.debug_size] for item in lang_data: if item['scene_id'] in self.scan_ids: self.lang_data.append(item) else: self.lang_data = lang_data # load question engine self.questions_map = self._load_question() print(f"Finish loading ScanNet SQA3D {split}-set language") # load scans print(f"Loading ScanNet SQA3D {split}-set scans") self.scan_data = self._load_scannet(self.scan_ids, self.pc_type, self.pc_type == 'gt') print(f"Finish loading ScanNet SQA3D {split}-set data") def __getitem__(self, index): item = self.lang_data[index] item_id = item['question_id'] scan_id = item['scene_id'] tgt_object_id_list = [] tgt_object_name_list = [] answer_list = [answer['answer'] for answer in item['answers']] answer_id_list = [self.answer_vocab.stoi(answer) for answer in answer_list if self.answer_vocab.stoi(answer) >= 0] if self.split == 'train': # augment with random situation for train situation = random.choice(self.questions_map[scan_id][item_id]['situation']) else: # fix for eval situation = self.questions_map[scan_id][item_id]['situation'][0] question = self.questions_map[scan_id][item_id]['question'] concat_sentence = situation + question question_type = get_sqa_question_type(question) # load pcds and labels if self.pc_type == 'gt': obj_pcds = self.scan_data[scan_id]['obj_pcds'] # N, 6 obj_labels = self.scan_data[scan_id]['inst_labels'] # N elif self.pc_type == 'pred': obj_pcds = self.scan_data[scan_id]['obj_pcds_pred'] obj_labels = self.scan_data[scan_id]['inst_labels_pred'] # filter out background or language if self.filter_lang: if self.pc_type == 'gt': selected_obj_idxs = [i for i, obj_label in enumerate(obj_labels) if (self.int2cat[obj_label] not in ['wall', 'floor', 'ceiling']) and (self.int2cat[obj_label] in concat_sentence)] for _id in tgt_object_id_list: if _id not in selected_obj_idxs: selected_obj_idxs.append(_id) else: selected_obj_idxs = [i for i in range(len(obj_pcds))] else: if self.pc_type == 'gt': selected_obj_idxs = [i for i, obj_label in enumerate(obj_labels) if 
(self.int2cat[obj_label] not in ['wall', 'floor', 'ceiling'])] else: selected_obj_idxs = [i for i in range(len(obj_pcds))] obj_pcds = [obj_pcds[id] for id in selected_obj_idxs] obj_labels = [obj_labels[id] for id in selected_obj_idxs] # build tgt object id and box if self.pc_type == 'gt': tgt_object_id_list = [selected_obj_idxs.index(x) for x in tgt_object_id_list] tgt_object_label_list = [obj_labels[x] for x in tgt_object_id_list] for i in range(len(tgt_object_label_list)): assert self.int2cat[tgt_object_label_list[i]] == tgt_object_name_list[i] elif self.pc_type == 'pred': # build gt box gt_center = [] gt_box_size = [] for cur_id in tgt_object_id_list: gt_pcd = self.scan_data[scan_id]["obj_pcds"][cur_id] center, box_size = convert_pc_to_box(gt_pcd) gt_center.append(center) gt_box_size.append(box_size) # start filtering tgt_object_id_list = [] tgt_object_label_list = [] for i in range(len(obj_pcds)): obj_center, obj_box_size = convert_pc_to_box(obj_pcds[i]) for j in range(len(gt_center)): if eval_ref_one_sample(construct_bbox_corners(obj_center, obj_box_size), construct_bbox_corners(gt_center[j], gt_box_size[j])) >= 0.25: tgt_object_id_list.append(i) tgt_object_label_list.append(self.cat2int[tgt_object_name_list[j]]) break assert(len(obj_pcds) == len(obj_labels)) # crop objects if self.max_obj_len < len(obj_labels): selected_obj_idxs = tgt_object_id_list.copy() remained_obj_idx = [] for kobj, klabel in enumerate(obj_labels): if kobj not in tgt_object_id_list: if klabel in tgt_object_label_list: selected_obj_idxs.append(kobj) else: remained_obj_idx.append(kobj) if len(selected_obj_idxs) == self.max_obj_len: break if len(selected_obj_idxs) < self.max_obj_len: random.shuffle(remained_obj_idx) selected_obj_idxs += remained_obj_idx[:(self.max_obj_len - len(selected_obj_idxs))] obj_pcds = [obj_pcds[i] for i in selected_obj_idxs] obj_labels = [obj_labels[i] for i in selected_obj_idxs] tgt_object_id_list = [i for i in range(len(tgt_object_id_list))] assert len(obj_pcds) == self.max_obj_len # rebuild tgt_object_id if len(tgt_object_id_list) == 0: tgt_object_id_list.append(len(obj_pcds)) tgt_object_label_list.append(5) obj_fts, obj_locs, obj_boxes, obj_labels = self._obj_processing_post(obj_pcds, obj_labels, is_need_bbox=True, rot_aug=self.rot_aug) # convert answer format answer_label = torch.zeros(self.num_answers).long() for _id in answer_id_list: answer_label[_id] = 1 # tgt object id tgt_object_id = torch.zeros(len(obj_fts) + 1).long() # add 1 for pad place holder for _id in tgt_object_id_list: tgt_object_id[_id] = 1 # tgt object sematic if self.sem_type == '607': tgt_object_label = torch.zeros(607).long() else: raise NotImplementedError("semantic type " + self.sem_type) for _id in tgt_object_label_list: tgt_object_label[_id] = 1 data_dict = { "situation": situation, "situation_pos": item['position'], "situation_rot": item['rotation'], "question": question, "sentence": concat_sentence, "scan_dir": os.path.join(self.base_dir, 'scans'), "scan_id": scan_id, "answer": "[answer_seq]".join(answer_list), "answer_label": answer_label, # A "tgt_object_id": torch.LongTensor(tgt_object_id), # N "tgt_object_label": torch.LongTensor(tgt_object_label), # L "obj_fts": obj_fts, "obj_locs": obj_locs, "obj_labels": obj_labels, "obj_boxes": obj_boxes, # N, 6 "data_idx": item_id, "sqa_type": question_type } return data_dict def build_answer(self): answer_data = json.load( open(os.path.join(self.base_dir, 'annotations/sqa_task/answer_dict.json'), encoding='utf-8') )[0] answer_counter = [] for data in 
answer_data.keys(): answer_counter.append(data) answer_counter = collections.Counter(sorted(answer_counter)) num_answers = len(answer_counter) answer_cands = answer_counter.keys() answer_vocab = SQA3DAnswer(answer_cands) print(f"total answers is {num_answers}") return num_answers, answer_vocab, answer_cands def _load_lang(self): lang_data = [] scan_ids = set() scan_to_item_idxs = collections.defaultdict(list) anno_file = os.path.join(self.base_dir, f'annotations/sqa_task/balanced/v1_balanced_sqa_annotations_{self.split}_scannetv2.json') json_data = json.load(open(anno_file, 'r', encoding='utf-8'))['annotations'] for item in json_data: if self.use_unanswer or (len(set(item['answers']) & set(self.answer_cands)) > 0): scan_ids.add(item['scene_id']) scan_to_item_idxs[item['scene_id']].append(len(lang_data)) lang_data.append(item) print(f'{self.split} unanswerable question {len(json_data) - len(lang_data)},' + f'answerable question {len(lang_data)}') return lang_data, scan_ids, scan_to_item_idxs def _load_question(self): questions_map = {} anno_file = os.path.join(self.base_dir, f'annotations/sqa_task/balanced/v1_balanced_questions_{self.split}_scannetv2.json') json_data = json.load(open(anno_file, 'r', encoding='utf-8'))['questions'] for item in json_data: if item['scene_id'] not in questions_map.keys(): questions_map[item['scene_id']] = {} questions_map[item['scene_id']][item['question_id']] = { 'situation': [item['situation']] + item['alternative_situation'], # list of sentences 'question': item['question'] # sentence } return questions_map @DATASET_REGISTRY.register() class ScanNetScanQAOld(ScanNetBase): def __init__(self, cfg, split): super(ScanNetScanQAOld, self).__init__(cfg, split) self.pc_type = cfg.data.args.pc_type self.sem_type = cfg.data.args.sem_type self.max_obj_len = cfg.data.args.max_obj_len - 1 self.num_points = cfg.data.args.num_points self.filter_lang = cfg.data.args.filter_lang self.use_unanswer = cfg.data.get(self.__class__.__name__).get(split).use_unanswer assert self.pc_type in ['gt', 'pred'] assert self.sem_type in ['607'] assert self.split in ['train', 'val', 'test'] if self.split == 'train': self.pc_type = 'gt' # TODO: hack test split to be the same as val if self.split == 'test': self.split = cfg.data.ScanNetScanQAOld.test.test_file self.is_test = ('test' in self.split) print(f"Loading ScanNet ScanQA {split}-set language") self.num_answers, self.answer_vocab, self.answer_cands = self.build_answer() lang_data, self.scan_ids, self.scan_to_item_idxs = self._load_lang() if cfg.debug.flag and cfg.debug.debug_size != -1: self.lang_data = [] self.scan_ids = sorted(list(self.scan_ids))[:cfg.debug.debug_size] for item in lang_data: if item['scene_id'] in self.scan_ids: self.lang_data.append(item) else: self.lang_data = lang_data print(f"Finish loading ScanNet ScanQA {split}-set language") print(f"Loading ScanNet ScanQA {split}-set scans") self.scan_data = self._load_scannet(self.scan_ids, self.pc_type, load_inst_info=('test' not in self.split)) print(f"Finish loading ScanNet ScanQA {split}-set data") def __getitem__(self, index): """Data dict post-processing, for example, filtering, crop, nomalization, rotation, etc. 
Args: index (int): _description_ """ item = self.lang_data[index] item_id = item['question_id'] # item_id = ''.join([i for i in item_id if i.isdigit()]) # item_id = int(item_id[:-1].lstrip('0') + item_id[-1]) scan_id = item['scene_id'] if not self.is_test: tgt_object_id_list = item['object_ids'] tgt_object_name_list = item['object_names'] answer_list = item['answers'] answer_id_list = [self.answer_vocab.stoi(answer) for answer in answer_list if self.answer_vocab.stoi(answer) >= 0] else: tgt_object_id_list = [] tgt_object_name_list = [] answer_list = [] answer_id_list = [] question = item['question'] # load pcds and labels if self.pc_type == 'gt': obj_pcds = self.scan_data[scan_id]['obj_pcds'] # N, 6 obj_labels = self.scan_data[scan_id]['inst_labels'] # N elif self.pc_type == 'pred': obj_pcds = self.scan_data[scan_id]['obj_pcds_pred'] obj_labels = self.scan_data[scan_id]['inst_labels_pred'] # get obj labels by matching if not self.is_test: gt_obj_labels = self.scan_data[scan_id]['inst_labels'] # N obj_center = self.scan_data[scan_id]['obj_center'] obj_box_size = self.scan_data[scan_id]['obj_box_size'] obj_center_pred = self.scan_data[scan_id]['obj_center_pred'] obj_box_size_pred = self.scan_data[scan_id]['obj_box_size_pred'] for i, _ in enumerate(obj_center_pred): for j, _ in enumerate(obj_center): if eval_ref_one_sample(construct_bbox_corners(obj_center[j], obj_box_size[j]), construct_bbox_corners(obj_center_pred[i], obj_box_size_pred[i])) >= 0.25: obj_labels[i] = gt_obj_labels[j] break # filter out background or language if self.filter_lang: if self.pc_type == 'gt': selected_obj_idxs = [i for i, obj_label in enumerate(obj_labels) if (self.int2cat[obj_label] not in ['wall', 'floor', 'ceiling']) and (self.int2cat[obj_label] in question)] for _id in tgt_object_id_list: if _id not in selected_obj_idxs: selected_obj_idxs.append(_id) else: selected_obj_idxs = [i for i in range(len(obj_pcds))] else: if self.pc_type == 'gt': selected_obj_idxs = [i for i, obj_label in enumerate(obj_labels) if (self.int2cat[obj_label] not in ['wall', 'floor', 'ceiling'])] else: selected_obj_idxs = [i for i in range(len(obj_pcds))] obj_pcds = [obj_pcds[id] for id in selected_obj_idxs] obj_labels = [obj_labels[id] for id in selected_obj_idxs] # build tgt object id and box if self.pc_type == 'gt': tgt_object_id_list = [selected_obj_idxs.index(x) for x in tgt_object_id_list] tgt_object_label_list = [obj_labels[x] for x in tgt_object_id_list] for i, _ in enumerate(tgt_object_label_list): assert self.int2cat[tgt_object_label_list[i]] == tgt_object_name_list[i] elif self.pc_type == 'pred': # build gt box gt_center = [] gt_box_size = [] for cur_id in tgt_object_id_list: gt_pcd = self.scan_data[scan_id]["obj_pcds"][cur_id] center, box_size = convert_pc_to_box(gt_pcd) gt_center.append(center) gt_box_size.append(box_size) # start filtering tgt_object_id_list = [] tgt_object_label_list = [] for i, _ in enumerate(obj_pcds): obj_center, obj_box_size = convert_pc_to_box(obj_pcds[i]) for j, _ in enumerate(gt_center): if eval_ref_one_sample(construct_bbox_corners(obj_center, obj_box_size), construct_bbox_corners(gt_center[j], gt_box_size[j])) >= 0.25: tgt_object_id_list.append(i) tgt_object_label_list.append(self.cat2int[tgt_object_name_list[j]]) break assert(len(obj_pcds) == len(obj_labels)) # crop objects if self.max_obj_len < len(obj_labels): selected_obj_idxs = tgt_object_id_list.copy() remained_obj_idx = [] for kobj, klabel in enumerate(obj_labels): if kobj not in tgt_object_id_list: if klabel in tgt_object_label_list: 
selected_obj_idxs.append(kobj) else: remained_obj_idx.append(kobj) if len(selected_obj_idxs) == self.max_obj_len: break if len(selected_obj_idxs) < self.max_obj_len: random.shuffle(remained_obj_idx) selected_obj_idxs += remained_obj_idx[:(self.max_obj_len - len(selected_obj_idxs))] obj_pcds = [obj_pcds[i] for i in selected_obj_idxs] obj_labels = [obj_labels[i] for i in selected_obj_idxs] tgt_object_id_list = [i for i in range(len(tgt_object_id_list))] assert len(obj_pcds) == self.max_obj_len # rebuild tgt_object_id if len(tgt_object_id_list) == 0: tgt_object_id_list.append(len(obj_pcds)) tgt_object_label_list.append(5) obj_fts, obj_locs, obj_boxes, obj_labels = self._obj_processing_post(obj_pcds, obj_labels, is_need_bbox=True) # convert answer format answer_label = torch.zeros(self.num_answers) for _id in answer_id_list: answer_label[_id] = 1 # tgt object id tgt_object_id = torch.zeros(len(obj_fts) + 1) # add 1 for pad place holder for _id in tgt_object_id_list: tgt_object_id[_id] = 1 # tgt object sematic if self.sem_type == '607': tgt_object_label = torch.zeros(607) else: raise NotImplementedError("semantic type " + self.sem_type) for _id in tgt_object_label_list: tgt_object_label[_id] = 1 data_dict = { "sentence": question, "scan_dir": os.path.join(self.base_dir, 'scans'), "scan_id": scan_id, "answers": "[answer_seq]".join(answer_list), "answer_label": answer_label.float(), # A "tgt_object_id": tgt_object_id.float(), # N "tgt_object_label": tgt_object_label.float(), # L "obj_fts": obj_fts, "obj_locs": obj_locs, "obj_labels": obj_labels, "obj_boxes": obj_boxes, # N, 6 "data_idx": item_id } return data_dict def _load_lang(self): lang_data = [] scan_ids = set() scan_to_item_idxs = collections.defaultdict(list) anno_file = os.path.join(self.base_dir, f'annotations/qa/ScanQA_v1.0_{self.split}.json') json_data = json.load(open(anno_file, 'r', encoding='utf-8')) for item in json_data: if self.use_unanswer or (len(set(item['answers']) & set(self.answer_cands)) > 0): scan_ids.add(item['scene_id']) scan_to_item_idxs[item['scene_id']].append(len(lang_data)) lang_data.append(item) print(f'{self.split} unanswerable question {len(json_data) - len(lang_data)},' + f'answerable question {len(lang_data)}') return lang_data, scan_ids, scan_to_item_idxs def build_answer(self): train_data = json.load(open(os.path.join(self.base_dir, 'annotations/qa/ScanQA_v1.0_train.json'), encoding='utf-8')) answer_counter = sum([data['answers'] for data in train_data], []) answer_counter = collections.Counter(sorted(answer_counter)) num_answers = len(answer_counter) answer_cands = answer_counter.keys() answer_vocab = ScanQAAnswer(answer_cands) print(f"total answers is {num_answers}") return num_answers, answer_vocab, answer_cands ================================================ FILE: data/datasets/structure3d.py ================================================ import collections from ..build import DATASET_REGISTRY from .base import ScanBase @DATASET_REGISTRY.register() class S3DPretrainObj(ScanBase): def __init__(self, cfg, split): super(S3DPretrainObj, self).__init__(cfg, split) self.base_dir = cfg.data.s3d_base self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False) if self.load_scene_pcds: self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None) assert self.max_pcd_num_points is not None self.bg_points_num = cfg.data.args.get('bg_points_num', 1000) self.scan_ids = sorted(list(self._load_split(self.split))) if cfg.debug.flag and cfg.debug.debug_size != -1: self.scan_ids = 
self.scan_ids[:cfg.debug.debug_size]
        print(f"Loading Structure3D {split}-set scans")
        self.scan_data = self._load_scan(self.scan_ids)
        self.scan_ids = sorted(list(self.scan_data.keys()))
        print(f"Finish loading Structure3D {split}-set scans of length {len(self.scan_ids)}")

    def __len__(self):
        return len(self.scan_ids)

    def __getitem__(self, index):
        """Data dict post-processing, for example, filtering, crop, normalization, rotation, etc.

        Args:
            index (int): index of the scan to fetch.
        """
        data_dict = self._getitem_obj_pretrain(index)
        dataset = 's3d'  # Structured3D
        data_dict['source'] = dataset
        return data_dict


@DATASET_REGISTRY.register()
class S3DSpatialRefer(ScanBase):
    def __init__(self, cfg, split):
        super(S3DSpatialRefer, self).__init__(cfg, split)
        self.base_dir = cfg.data.s3d_base
        self.max_obj_len = cfg.data.args.max_obj_len - 1
        self.filter_lang = cfg.data.args.filter_lang
        self.load_scene_pcds = cfg.data.args.get('load_scene_pcds', False)
        if self.load_scene_pcds:
            self.max_pcd_num_points = cfg.data.args.get('max_pcd_num_points', None)
            assert self.max_pcd_num_points is not None
            self.bg_points_num = cfg.data.args.get('bg_points_num', 1000)
        split_cfg = cfg.data.get(self.__class__.__name__).get(split)
        all_scan_ids = self._load_split(self.split)
        print(f"Loading Structure3D SpatialRefer {split}-set language")
        self.lang_data, self.scan_ids = self._load_lang(split_cfg, all_scan_ids)
        print(f"Finish loading Structure3D SpatialRefer {split}-set language of size {self.__len__()}")
        print(f"Loading Structure3D {split}-set scans")
        self.scan_data = self._load_scan(self.scan_ids)
        print(f"Finish loading Structure3D {split}-set scans")
        # build unique multiple look up
        for scan_id in self.scan_ids:
            inst_labels = self.scan_data[scan_id]['inst_labels']
            self.scan_data[scan_id]['label_count'] = collections.Counter([l for l in inst_labels])
            self.scan_data[scan_id]['label_count_multi'] = collections.Counter(
                [self.label_converter.id_to_scannetid[l] for l in inst_labels])

    def __len__(self):
        return len(self.lang_data)

    def __getitem__(self, index):
        """Data dict post-processing, for example, filtering, crop, normalization, rotation, etc.

        Args:
            index (int): index of the referral item to fetch.
        """
        data_dict = self._getitem_refer(index)
        return data_dict

================================================
FILE: evaluator/__init__.py
================================================
from .pretrain_eval import *
from .referit3d_eval import *
from .scanrefer_eval import *
from .scanqa_eval import *
from .objcls_eval import *
from .sqa3d_eval import *


================================================
FILE: evaluator/build.py
================================================
import json

import numpy as np
from omegaconf import open_dict
from fvcore.common.registry import Registry

from common.misc import gather_dict

EVALUATOR_REGISTRY = Registry("EVALUATOR")


class BaseEvaluator():
    def __init__(self, cfg, accelerator):
        self.accelerator = accelerator
        self.best_result = -np.inf
        self.save = cfg.eval.save
        # subclasses are expected to set self.save_dir and self.target_metric
        # before calling super().__init__
        self.save_dir.mkdir(parents=True, exist_ok=True)
        self.reset()

    def reset(self):
        self.eval_results = []
        self.eval_dict = {}

    def batch_metrics(self, data_dict, include_count=False):
        raise NotImplementedError("Per batch metrics calculation is required for evaluation")

    def update(self, data_dict):
        metrics = self.batch_metrics(data_dict, include_count=True)
        for key in metrics.keys():
            if key not in self.eval_dict:
                self.eval_dict[key] = []
            self.eval_dict[key].append(metrics[key])

    def record(self):
        self.eval_dict = gather_dict(self.accelerator, self.eval_dict)
        for k, metrics in self.eval_dict.items():
            if not isinstance(metrics, list):
                continue
            # metrics is a list of (value, count) pairs; reduce to a weighted average
            total_value = sum(x[0] for x in metrics)
            total_count = sum(x[1] for x in metrics)
            self.eval_dict[k] = total_value / max(total_count, 1)
        if self.save and self.accelerator.is_main_process:
            with (self.save_dir / "results.json").open("w") as f:
                json.dump(self.eval_results, f)
        self.eval_dict['target_metric'] = self.eval_dict[self.target_metric]
        if self.eval_dict["target_metric"] > self.best_result:
            is_best = True
            self.best_result = self.eval_dict["target_metric"]
        else:
            is_best = False
        self.eval_dict['best_result'] = self.best_result
        return is_best, self.eval_dict


def get_eval(name, cfg, accelerator, **kwargs):
    """Get an evaluator or a list of evaluators."""
    if isinstance(name, str):
        evaluator = EVALUATOR_REGISTRY.get(name)(cfg, accelerator, **kwargs)
    else:
        evaluator = [EVALUATOR_REGISTRY.get(i)(cfg, accelerator, **kwargs) for i in name]
    return evaluator


def build_eval(cfg, accelerator, **kwargs):
    if cfg.eval.get("train", None) is not None:
        train_eval = get_eval(cfg.eval.train.name, cfg, accelerator, **kwargs)
        val_eval = get_eval(cfg.eval.val.name, cfg, accelerator, **kwargs)
        return {"train": train_eval, "val": val_eval}
    elif cfg.eval.get("name", None) is not None:
        return get_eval(cfg.eval.name, cfg, accelerator, **kwargs)
    else:
        with open_dict(cfg):
            cfg.eval.name = [cfg.data.get(dataset).evaluator for dataset in cfg.data.val]
        return get_eval(cfg.eval.name, cfg, accelerator, **kwargs)


================================================
FILE: evaluator/objcls_eval.py
================================================
import torch
from pathlib import Path

from evaluator.build import EVALUATOR_REGISTRY, BaseEvaluator


@EVALUATOR_REGISTRY.register()
class PretrainObjEval(BaseEvaluator):
    def __init__(self, cfg, accelerator, **kwargs):
        self.target_metric = "accuracy"
        self.save_dir = Path(cfg.exp_dir) / "eval_results" / self.__class__.__name__
        super().__init__(cfg, accelerator, **kwargs)

    def batch_metrics(self, data_dict, include_count=False):
        metrics = {}
        logits = data_dict["obj_logits"][data_dict["obj_masks"]].view(-1,
data_dict["obj_logits"].shape[-1]) labels = data_dict["obj_labels"][data_dict["obj_masks"]].view(-1) _, pred = torch.max(logits, 1) metrics["accuracy"] = ((pred == labels.view(-1)).sum().item(), labels.shape[0]) if not include_count: for key, v in metrics.items(): metrics[key] = v[0] / max(v[1], 1) return metrics ================================================ FILE: evaluator/pretrain_eval.py ================================================ import torch import numpy as np from evaluator.build import EVALUATOR_REGISTRY, BaseEvaluator @EVALUATOR_REGISTRY.register() class PretrainEval(BaseEvaluator): def __init__(self, cfg, accelerator, **kwargs): self.cfg = cfg self.eval_dict = { "target_metric": [], "og_acc": [], "lang_cls_acc_mask": [], "obj_cls_post_acc": [], "obj_cls_pre_acc": [], "obj_cls_raw_acc": [], "obj_cls_pre_acc_unmask": [], "obj_cls_pre_acc_mask": [], "obj_cls_post_acc_unmask": [], "obj_cls_post_acc_mask": [] } self.accelerator = accelerator self.device = self.accelerator.device self.total_count = 0 self.best_result = -np.inf def batch_metrics(self, data_dict): metrics = {} txt_token_mask = (data_dict['masked_lm_labels'] != -1) if 'tgt_object_id' in data_dict.keys(): metrics['og_acc'] = (torch.argmax(data_dict['og3d_logits'], dim=-1) == data_dict['tgt_object_id'].squeeze( 1)).sum().item() / float(len(data_dict['tgt_object_id'])) metrics['lang_cls_acc_mask'] = torch.sum( torch.argmax(data_dict['txt_lm_cls_logits'], dim=2)[txt_token_mask] == data_dict['masked_lm_labels'][ txt_token_mask]).item() / float(txt_token_mask.sum().item() + 1e-8) if 'obj_cls_post_logits' in data_dict.keys(): metrics['obj_cls_post_acc'] = torch.sum( torch.argmax(data_dict['obj_cls_post_logits'], dim=2)[data_dict['obj_masks']] == data_dict["obj_labels"][ data_dict['obj_masks']]).item() / float(data_dict['obj_masks'].sum().item() + 1e-8) metrics['obj_cls_post_acc_unmask'] = torch.sum( torch.argmax(data_dict['obj_cls_post_logits'], dim=2)[ data_dict['obj_masks'] * data_dict['obj_sem_masks']] == data_dict["obj_labels"][data_dict['obj_masks'] * data_dict['obj_sem_masks']]).item() / float( (data_dict['obj_masks'] * data_dict['obj_sem_masks']).sum().item() + 1e-8) metrics['obj_cls_post_acc_mask'] = torch.sum(torch.argmax(data_dict['obj_cls_post_logits'], dim=2)[ data_dict['obj_masks'] * data_dict[ 'obj_sem_masks'].logical_not()] == data_dict["obj_labels"][ data_dict['obj_masks'] * data_dict[ 'obj_sem_masks'].logical_not()]).item() / float( (data_dict['obj_masks'] * data_dict['obj_sem_masks'].logical_not()).sum().item() + 1e-8) if 'obj_cls_raw_logits' in data_dict.keys(): metrics['obj_cls_raw_acc'] = torch.sum( torch.argmax(data_dict['obj_cls_raw_logits'], dim=2)[data_dict['obj_masks']] == data_dict["obj_labels"][ data_dict['obj_masks']]).item() / float(data_dict['obj_masks'].sum().item() + 1e-8) if 'obj_cls_pre_logits' in data_dict.keys(): metrics['obj_cls_pre_acc'] = torch.sum( torch.argmax(data_dict['obj_cls_pre_logits'], dim=2)[data_dict['obj_masks']] == data_dict["obj_labels"][ data_dict['obj_masks']]).item() / float(data_dict['obj_masks'].sum().item() + 1e-8) metrics['obj_cls_pre_acc_unmask'] = torch.sum( torch.argmax(data_dict['obj_cls_pre_logits'], dim=2)[data_dict['obj_masks'] * data_dict['obj_sem_masks']] == data_dict["obj_labels"][data_dict['obj_masks'] * data_dict['obj_sem_masks']]).item() / float( (data_dict['obj_masks'] * data_dict['obj_sem_masks']).sum().item() + 1e-8) metrics['obj_cls_pre_acc_mask'] = torch.sum(torch.argmax(data_dict['obj_cls_pre_logits'], dim=2)[ data_dict['obj_masks'] * 
        all_acc = [v for k, v in metrics.items()]
        metrics["target_metric"] = float(sum(all_acc)) / len(all_acc)
        metrics["total_count"] = data_dict["txt_lm_cls_logits"].shape[0]
        return metrics

    def update(self, data_dict):
        metrics = self.batch_metrics(data_dict)
        self.total_count += metrics["total_count"]
        for key in self.eval_dict.keys():
            if key not in metrics.keys():
                continue
            self.eval_dict[key].append(float(metrics[key]) * metrics["total_count"])

    def record(self):
        # Average
        for k, v in self.eval_dict.items():
            self.eval_dict[k] = sum(v) / self.total_count
        if self.eval_dict["target_metric"] > self.best_result:
            is_best = True
            self.best_result = self.eval_dict["target_metric"]
        else:
            is_best = False
        return is_best, self.eval_dict

    def reset(self):
        for key in self.eval_dict.keys():
            self.eval_dict[key] = []
        self.total_count = 0
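The evaluators in this package share one convention: with include_count=True,
batch_metrics() returns (correct_count, sample_count) pairs, and
BaseEvaluator.record() sums both sides across batches before dividing, so the
final metric is an exact dataset-level ratio rather than a mean of batch means.
A minimal, self-contained sketch of that reduction (toy numbers, not repo code):

# Toy illustration of the (value, count) accumulation used by BaseEvaluator.
eval_dict = {}
for batch in [{'acc': (3, 4)}, {'acc': (1, 2)}]:      # two fake batches
    for key, vc in batch.items():
        eval_dict.setdefault(key, []).append(vc)
for key, metrics in eval_dict.items():
    total_value = sum(x[0] for x in metrics)
    total_count = sum(x[1] for x in metrics)
    eval_dict[key] = total_value / max(total_count, 1)
assert abs(eval_dict['acc'] - 4 / 6) < 1e-9           # not (0.75 + 0.5) / 2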
================================================
FILE: evaluator/referit3d_eval.py
================================================
from pathlib import Path

import torch

from evaluator.build import EVALUATOR_REGISTRY, BaseEvaluator


@EVALUATOR_REGISTRY.register()
class ReferIt3DEval(BaseEvaluator):
    def __init__(self, cfg, accelerator, **kwargs):
        self.target_metric = 'og_acc'
        self.save_dir = Path(cfg.exp_dir) / "eval_results" / self.__class__.__name__
        super().__init__(cfg, accelerator, **kwargs)

    def batch_metrics(self, data_dict, include_count=False):
        # Per-scene eval
        if len(data_dict['og3d_logits'].shape) == 3:
            data_dict['tgt_object_id'] = data_dict['tgt_object_id'].flatten(0, 1).unsqueeze(1)
            data_dict['is_hard'] = data_dict['is_hard'].flatten(0, 1)
            data_dict['is_view_dependent'] = data_dict['is_view_dependent'].flatten(0, 1)
            data_dict['og3d_logits'] = data_dict['og3d_logits'].flatten(0, 1)

        metrics = {}
        og_pred = torch.argmax(data_dict['og3d_logits'], dim=-1)
        total_count = len(og_pred)
        # Easy and hard counts
        hard_count = data_dict['is_hard'].sum().item()
        easy_count = total_count - hard_count
        # View-dependent and view-independent counts
        view_dep_count = data_dict['is_view_dependent'].sum().item()
        view_indep_count = total_count - view_dep_count
        # Correct counts
        correct_preds = data_dict['tgt_object_id'].flatten() == og_pred
        correct = correct_preds.sum().item()
        # Correct counts for easy and hard
        hard_correct = (correct_preds & data_dict['is_hard']).sum().item()
        easy_correct = correct - hard_correct
        # Correct counts for view-dependent and view-independent
        view_dep_correct = (correct_preds & data_dict['is_view_dependent']).sum().item()
        view_indep_correct = correct - view_dep_correct

        metrics['og_acc_easy'] = (easy_correct, easy_count)
        metrics['og_acc_hard'] = (hard_correct, hard_count)
        metrics['og_acc_view_dep'] = (view_dep_correct, view_dep_count)
        metrics['og_acc_view_indep'] = (view_indep_correct, view_indep_count)
        metrics['og_acc'] = (og_pred == data_dict['tgt_object_id'].squeeze(1)).sum().item()
        if 'txt_cls_logits' in data_dict:
            metrics['txt_acc'] = (torch.argmax(data_dict['txt_cls_logits'], dim=1)
                                  == data_dict["tgt_object_label"].squeeze(1)).sum().item()

        # get obj cls acc
        gt = data_dict['obj_labels']
        mask = data_dict['obj_masks']
        for key in data_dict.keys():
            if key.endswith('logits') and data_dict[key].ndim == 3 and data_dict[key].shape[:2] == data_dict['obj_labels'].shape:
                new_key = key.replace('logits', 'acc')
                pred = torch.argmax(data_dict[key], dim=2)
                metrics[new_key] = ((pred[mask] == gt[mask]).sum().item(), data_dict['obj_masks'].sum().item())

        for key in metrics:
            if isinstance(metrics[key], tuple):
                # already has count
                continue
            metrics[key] = (metrics[key], total_count)

        if self.save:
            item_ids = data_dict['data_idx']
            for i in range(len(item_ids)):
                self.eval_results.append({
                    "scene_id": item_ids[i],
                    "bbox": data_dict['obj_boxes'][i][og_pred[i]].cpu().numpy().tolist(),
                    "correct": og_pred[i].item() == data_dict['tgt_object_id'][i].item()
                })

        if not include_count:
            for key, v in metrics.items():
                metrics[key] = v[0] / max(v[1], 1)
        return metrics


================================================
FILE: evaluator/scanqa_eval.py
================================================
import os
import json
import collections
from pathlib import Path

import torch

from evaluator.build import EVALUATOR_REGISTRY, BaseEvaluator
from data.data_utils import ScanQAAnswer, clean_answer
from common.box_utils import get_3d_box


@EVALUATOR_REGISTRY.register()
class ScanQAEval(BaseEvaluator):
    def __init__(self, cfg, accelerator, **kwargs):
        self.target_metric = 'ans1_acc'
        self.save_dir = Path(cfg.exp_dir) / "eval_results" / self.__class__.__name__
        super().__init__(cfg, accelerator, **kwargs)
        if self.save:
            train_data = json.load(open(os.path.join(cfg.data.scan_family_base,
                                                     'annotations/qa/ScanQA_v1.0_train.json'), encoding='utf-8'))
            answer_counter = sum([data['answers'] for data in train_data], [])
            answer_counter = collections.Counter(sorted(answer_counter))
            answer_cands = answer_counter.keys()
            self.answer_vocab = ScanQAAnswer(answer_cands)

    def batch_metrics(self, data_dict, include_count=False):
        metrics = {}
        total_count = len(data_dict['answer_scores'])
        # ans
        choice_1 = data_dict['answer_scores'].argmax(dim=-1)
        choice_10 = torch.topk(data_dict['answer_scores'].detach(), 10, -1)[1]
        correct1 = 0
        correct10 = 0
        for i in range(data_dict['answer_label'].shape[0]):
            if data_dict['answer_label'][i, choice_1[i]] == 1:
                correct1 += 1
            for j in range(10):
                if data_dict['answer_label'][i, choice_10[i, j]] == 1:
                    correct10 += 1
                    break
        metrics['ans1_acc'] = correct1
        metrics['ans10_acc'] = correct10

        # get obj cls acc
        for key in data_dict.keys():
            if key.endswith('logits') and data_dict[key].ndim == 3 and data_dict[key].shape[:2] == data_dict['obj_labels'].shape:
                new_key = key.replace('logits', 'acc')
                pred = torch.argmax(data_dict[key], dim=2)
                gt = data_dict['obj_labels']
                mask = data_dict['obj_masks']
                metrics[new_key] = ((pred[mask] == gt[mask]).sum().item(), data_dict['obj_masks'].sum().item())

        for key in metrics:
            if isinstance(metrics[key], tuple):
                # already has count
                continue
            metrics[key] = (metrics[key], total_count)

        if self.save:
            og3d_pred = torch.argmax(data_dict['og3d_logits'], dim=1)  # hoisted out of the loop
            for i in range(total_count):
                answer_top10 = [self.answer_vocab.itos(choice_10[i, j].item()) for j in range(10)]
                box = data_dict['obj_boxes'][i, og3d_pred[i]].cpu().numpy()
                box_center = box[0:3]
                box_size = box[3:6]
                pred_data = {
                    "scene_id": data_dict["scan_id"][i],
                    "question_id": data_dict["data_idx"][i],
                    "answer_top10": answer_top10,
                    "bbox": get_3d_box(box_center, box_size).tolist()
                }
                self.eval_results.append(pred_data)

        if not include_count:
            for key, v in metrics.items():
                metrics[key] = v[0] / max(v[1], 1)
        return metrics


@EVALUATOR_REGISTRY.register()
class ScanQAGenEval(ScanQAEval):
    def __init__(self, cfg, accelerator, **kwargs):
        super().__init__(cfg, accelerator, **kwargs)

    def batch_metrics(self, data_dict,
include_count=False): metrics = {} answer_preds = [clean_answer(a) for a in data_dict['answer_pred']] answer_gts = [list(map(clean_answer, a)) for a in data_dict['answers']] correct = len([1 for pred, gts in zip(answer_preds, answer_gts) if pred in gts]) metrics['ans1_acc'] = (correct, len(answer_preds)) if not include_count: for key, v in metrics.items(): metrics[key] = v[0] / max(v[1], 1) return metrics ================================================ FILE: evaluator/scanrefer_eval.py ================================================ from pathlib import Path import torch from evaluator.build import EVALUATOR_REGISTRY, BaseEvaluator @EVALUATOR_REGISTRY.register() class ScanReferEval(BaseEvaluator): def __init__(self, cfg, accelerator, **kwargs): self.target_metric = 'og_acc_iou25' self.save_dir = Path(cfg.exp_dir) / "eval_results" / self.__class__.__name__ super().__init__(cfg, accelerator, **kwargs) def batch_metrics(self, data_dict, include_count=False): # Per-scene eval if len(data_dict['tgt_object_id_iou25'].shape) == 3: data_dict['tgt_object_id_iou25'] = data_dict['tgt_object_id_iou25'].flatten(0, 1) data_dict['tgt_object_id_iou50'] = data_dict['tgt_object_id_iou50'].flatten(0, 1) data_dict['tgt_object_id'] = data_dict['tgt_object_id'].flatten(0, 1).unsqueeze(1) data_dict['is_multiple'] = data_dict['is_multiple'].flatten(0, 1) data_dict['og3d_logits'] = data_dict['og3d_logits'].flatten(0, 1) metrics = {} og_pred = torch.argmax(data_dict['og3d_logits'], dim=-1) total_count = len(og_pred) multiple_count = data_dict['is_multiple'].sum().item() unique_count = total_count - multiple_count # Correct counts for iou25 and iou50 iou25_correct_mask = data_dict['tgt_object_id_iou25'][torch.arange(len(og_pred)), og_pred].to(bool) iou50_correct_mask = data_dict['tgt_object_id_iou50'][torch.arange(len(og_pred)), og_pred].to(bool) iou25_correct = iou25_correct_mask.sum().item() iou50_correct = iou50_correct_mask.sum().item() # Correct counts for unique and multiple iou25 and iou50 iou25_multiple_correct = (iou25_correct_mask & data_dict['is_multiple']).sum().item() iou25_unique_correct = iou25_correct - iou25_multiple_correct iou50_multiple_correct = (iou50_correct_mask & data_dict['is_multiple']).sum().item() iou50_unique_correct = iou50_correct - iou50_multiple_correct metrics['og_acc_iou25'] = iou25_correct metrics['og_acc_iou50'] = iou50_correct metrics['og_acc_iou25_unique'] = iou25_unique_correct metrics['og_acc_iou50_unique'] = iou50_unique_correct metrics['og_acc_iou25_multiple'] = iou25_multiple_correct metrics['og_acc_iou50_multiple'] = iou50_multiple_correct metrics['og_acc'] = (og_pred == data_dict['tgt_object_id'].squeeze(1)).sum().item() if 'txt_cls_logits' in data_dict: metrics['txt_acc'] = (torch.argmax(data_dict['txt_cls_logits'], dim=1) == data_dict["tgt_object_label"].squeeze(1)).sum().item() # get obj cls acc gt = data_dict['obj_labels'] mask = data_dict['obj_masks'] for key in data_dict.keys(): if key.endswith('logits') and data_dict[key].ndim == 3 and data_dict[key].shape[:2] == data_dict['obj_labels'].shape: new_key = key.replace('logits', 'acc') pred = torch.argmax(data_dict[key], dim=2) metrics[new_key] = ((pred[mask] == gt[mask]).sum().item(), mask.sum().item()) for key in metrics: if isinstance(metrics[key], tuple): # already has count continue if 'unique' in key: metrics[key] = (metrics[key], unique_count) elif 'multiple' in key: metrics[key] = (metrics[key], multiple_count) else: metrics[key] = (metrics[key], total_count) if self.save: item_ids = data_dict['data_idx'] 
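            # one record per query: the predicted box and whether the argmax
            # object matched the annotated target (for qualitative review)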
for i in range(len(item_ids)):
                self.eval_results.append({
                    "scene_id": item_ids[i],
                    "bbox": data_dict['obj_boxes'][i][og_pred[i]].cpu().numpy().tolist(),
                    "correct": og_pred[i].item() == data_dict['tgt_object_id'][i].item()
                })

        if not include_count:
            for key, v in metrics.items():
                metrics[key] = v[0] / max(v[1], 1)
        return metrics


================================================
FILE: evaluator/sqa3d_eval.py
================================================
import os
import json
import collections
from pathlib import Path

import numpy as np
import torch

from data.data_utils import SQA3DAnswer
from evaluator.build import EVALUATOR_REGISTRY


@EVALUATOR_REGISTRY.register()
class SQA3DEval():
    # 0: what, 1: is, 2: how, 3: can, 4: which, 5: others
    def __init__(self, cfg, task_name):
        self.eval_dict = {
            'target_metric': [], 'obj_cls_raw_acc': [], 'ans1_acc': [], 'ans10_acc': [],
            'type0_acc': [], 'type1_acc': [], 'type2_acc': [],
            'type3_acc': [], 'type4_acc': [], 'type5_acc': []
        }
        # run
        self.total_count = 0
        self.type_count = {
            'type0_count': 1e-10, 'type1_count': 1e-10, 'type2_count': 1e-10,
            'type3_count': 1e-10, 'type4_count': 1e-10, 'type5_count': 1e-10
        }
        self.best_result = -np.inf
        self.base_dir = cfg.data.scan_family_base
        answer_data = json.load(
            open(os.path.join(self.base_dir, 'annotations/sqa_task/answer_dict.json'), encoding='utf-8')
        )[0]
        answer_counter = collections.Counter(sorted(answer_data.keys()))
        answer_cands = answer_counter.keys()
        self.answer_vocab = SQA3DAnswer(answer_cands)

        self.save = cfg.eval.save
        if self.save:
            self.eval_results = []
            self.save_dir = Path(cfg.exp_dir) / "eval_results" / task_name
            self.save_dir.mkdir(parents=True, exist_ok=True)

    def update(self, data_dict):
        metrics = self.batch_metrics(data_dict)
        batch_count = metrics['total_count']
        self.total_count += batch_count
        for key in metrics:
            if 'type' in key and 'count' in key:
                self.type_count[key] += metrics[key]
        if self.save:
            for i in range(metrics["total_count"]):
                self.eval_results.append({
                    # vision
                    "source": data_dict['source'][i],
                    "scan_id": data_dict['scan_id'][i],
                    "anchor": data_dict['anchor_locs'][i],
                    'anchor_ort': data_dict['anchor_orientation'][i],
                    # language
                    "instruction": data_dict['prompt_after_obj'][i],
                    "response_gt": data_dict['answer_list'][i].split('[answer_seq]'),
                    "response_pred": data_dict['output_text'][i]
                })
        # save eval dict
        for key in self.eval_dict.keys():
            if 'type' in key:
                self.eval_dict[key].append(float(metrics[key]) * metrics['type' + key[4] + '_count'])
            else:
                self.eval_dict[key].append(float(metrics[key]) * batch_count)

    def batch_metrics(self, data_dict):
        metrics = {}
        # ans
        choice_1 = data_dict['answer_scores'].argmax(dim=-1)
        choice_10 = torch.topk(data_dict['answer_scores'].detach(), 10, -1)[1]
        correct1 = 0
        correct10 = 0
        correct_type = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
        count_type = {0: 1e-10, 1: 1e-10, 2: 1e-10, 3: 1e-10, 4: 1e-10, 5: 1e-10}
        for i in range(data_dict['answer_label'].shape[0]):
            count_type[data_dict['sqa_type'][i].item()] += 1
            if data_dict['answer_label'][i, choice_1[i]] == 1:
                correct1 += 1
                correct_type[data_dict['sqa_type'][i].item()] += 1
            for j in range(10):
                if data_dict['answer_label'][i, choice_10[i, j]] == 1:
                    correct10 += 1
                    break
        metrics['ans1_acc'] = correct1 / float(len(choice_1))
        metrics['ans10_acc'] = correct10 / float(len(choice_1))
        # metrics['answer_top10'] = [
        #     # TODO: add this answer vocabulary in dataloader
        #     [self.answer_vocab.itos(choice_10[i, j].item()) for j in range(10)] for i in
        #     range(choice_10.shape[0])
        # ]
        metrics['obj_cls_raw_acc'] = torch.sum(
            torch.argmax(data_dict['obj_cls_raw_logits'], dim=2)[data_dict['obj_masks']]
            == data_dict["obj_labels"][data_dict['obj_masks']]).item() / float(data_dict['obj_masks'].sum().item())
        # question type acc
        for key in count_type.keys():
            metrics['type' + str(key) + '_acc'] = correct_type[key] / count_type[key]
            metrics['type' + str(key) + '_count'] = count_type[key]
        metrics['target_metric'] = metrics['ans1_acc']
        metrics["total_count"] = data_dict["answer_scores"].shape[0]
        return metrics

    def reset(self):
        for key in self.eval_dict.keys():
            self.eval_dict[key] = []
        self.total_count = 0
        self.type_count = {
            'type0_count': 1e-10, 'type1_count': 1e-10, 'type2_count': 1e-10,
            'type3_count': 1e-10, 'type4_count': 1e-10, 'type5_count': 1e-10
        }
        if self.save:
            self.eval_results = []

    def record(self, split='val'):
        # record
        for k, v in self.eval_dict.items():
            if k == "answer_top10":
                continue
            if 'type' in k:
                self.eval_dict[k] = sum(v) / self.type_count['type' + k[4] + '_count']
            else:
                self.eval_dict[k] = sum(v) / self.total_count
        if self.eval_dict["target_metric"] > self.best_result:
            is_best = True
            self.best_result = self.eval_dict["target_metric"]
        else:
            is_best = False
        if self.save and (is_best or split == 'test'):
            torch.save(self.eval_results, str(self.save_dir / 'results.pt'))
        return is_best, self.eval_dict
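SQA3D reports accuracy per question type, where the type ids follow the
comment above SQA3DEval (0: what, 1: is, 2: how, 3: can, 4: which, 5: others).
The repo's data pipeline supplies data_dict['sqa_type']; a hypothetical
first-word classifier with the same convention would look like:

# Hypothetical helper (not in this repo): question string -> SQA3D type id.
def question_type(question: str) -> int:
    words = question.strip().lower().split()
    first = words[0] if words else ''
    return {'what': 0, 'is': 1, 'how': 2, 'can': 3, 'which': 4}.get(first, 5)

assert question_type("What color is the chair?") == 0
assert question_type("Could I reach the shelf?") == 5  # falls into 'others'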
================================================
FILE: launch.py
================================================
import argparse

import common.launch_utils as lu


def parse_args():
    def str2bool(v):
        if v.lower() in ('yes', 'true', 't', 'y', '1'):
            return True
        elif v.lower() in ('no', 'false', 'f', 'n', '0'):
            return False
        else:
            raise argparse.ArgumentTypeError('Unsupported value encountered')

    parser = argparse.ArgumentParser()
    # General settings
    parser.add_argument("--mode", default="submitit", type=str, help="Launch mode (submitit | accelerate | python)")
    parser.add_argument("--debug", default=False, type=str2bool, help="Debug mode (True | False)")

    # Slurm settings
    parser.add_argument("--name", default="masaccio", type=str, help="Name of the job")
    parser.add_argument("--run_file", default="run.py", type=str, help="File position of launcher file")
    parser.add_argument("--job_dir", default="jobs/%j", type=str, help="Directory to save the job logs")
    parser.add_argument("--num_nodes", default=1, type=int, help="Number of nodes to use in SLURM")
    parser.add_argument("--gpu_per_node", default=2, type=int, help="Number of gpus to use in each node")
    parser.add_argument("--cpu_per_task", default=32, type=int, help="Number of cpus to use for each gpu")
    parser.add_argument("--qos", default="level0", type=str, help="Qos of the job")
    parser.add_argument("--partition", default="gpu", type=str, help="Partition of the job")
    parser.add_argument("--account", default="research", type=str, help="Account of the job")
    parser.add_argument("--mem_per_gpu", default=80, type=int, help="Memory allocated for each gpu in GB")
    parser.add_argument("--time", default=24, type=int, help="Time allocated for the job in hours")
    parser.add_argument("--port", default=1234, type=int, help="Default port for distributed training")
    parser.add_argument("--nodelist", default="", type=str, help="Default node id for distributed training")

    # Accelerate settings
    parser.add_argument("--mixed_precision", default="no", type=str,
                        help="Mixed precision training, options (no | fp16 | bf16)")

    # Additional Training settings
    parser.add_argument("--config", default="configs/default.yaml", type=str, help="Path to the config file")
    parser.add_argument("opts", default=None, nargs=argparse.REMAINDER,
                        help="Additional options to change configuration")
    return parser.parse_args()


def main():
    args = parse_args()
    getattr(lu, f"{args.mode}_launch")(args)
    print("launched")


if __name__ == "__main__":
    main()
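main() resolves the launcher by name, so "--mode submitit" calls
lu.submitit_launch(args) and "--mode python" calls lu.python_launch(args).
A tiny sketch of that getattr dispatch, with a stand-in namespace instead of
common.launch_utils (whose implementation is not shown in this section):

import argparse
import types

# Stand-in for common.launch_utils, purely to illustrate the dispatch in main().
lu_demo = types.SimpleNamespace(
    python_launch=lambda args: print("local run:", args.config),
    accelerate_launch=lambda args: print("accelerate run:", args.config),
)
args = argparse.Namespace(mode="python", config="configs/final/debug.yaml")
getattr(lu_demo, f"{args.mode}_launch")(args)  # -> local run: configs/final/debug.yaml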
default="configs/default.yaml", type=str, help="Path to the config file") parser.add_argument("opts", default=None, nargs=argparse.REMAINDER, help="Additional options to change configureation") return parser.parse_args() def main(): args = parse_args() getattr(lu, f"{args.mode}_launch")(args) print("launched") if __name__ == "__main__": main() ================================================ FILE: model/__init__.py ================================================ from .objcls import * from .openvocab import * ================================================ FILE: model/build.py ================================================ import torch.nn as nn from fvcore.common.registry import Registry MODEL_REGISTRY = Registry("model") class BaseModel(nn.Module): def __init__(self, cfg): super().__init__() def get_opt_params(self): raise NotImplementedError("Function to obtain all default parameters for optimization") def build_model(cfg): model = MODEL_REGISTRY.get(cfg.model.name)(cfg) return model ================================================ FILE: model/objcls.py ================================================ import torch import torch.nn as nn import json from pathlib import Path import clip from transformers import BertConfig, BertModel, BertTokenizer from einops import rearrange from model.build import MODEL_REGISTRY, BaseModel from modules.layers.pointnet import PointNetPP from modules.utils import get_mlp_head from optim.utils import no_decay_param_group @MODEL_REGISTRY.register() class ObjCls(BaseModel): def __init__(self, cfg): super().__init__(cfg) self.cfg = cfg self.model_name = cfg.model.get("model_name", "pointnext") self.language_type = cfg.model.get("language_type", "clip") self.pre_extract_path = cfg.model.get("pre_extract_path", None) cls_in_channel = 512 if self.language_type == "clip" else 768 self.point_feature_extractor = PointNetPP( sa_n_points=[32, 16, None], sa_n_samples=[32, 32, None], sa_radii=[0.2, 0.4, None], sa_mlps=[[3, 64, 64, 128], [128, 128, 128, 256], [256, 256, 512, cls_in_channel]], ) if cfg.num_gpu > 1: self.point_feature_extractor = torch.nn.SyncBatchNorm.convert_sync_batchnorm(self.point_feature_extractor) if not cfg.model.open_vocab: cls_hidden = cfg.model.get("cls_hidden", 1024) num_classes = cfg.model.num_classes self.cls_head = get_mlp_head(cls_in_channel, cls_hidden, num_classes) else: if self.pre_extract_path is not None: file_name = f"scannet_607_{'clip-ViT-B16' if self.language_type == 'clip' else 'bert-base-uncased'}_id.pth" self.register_buffer("text_embeds", torch.load(Path(self.pre_extract_path) / file_name).float()) else: self.int2cat = json.load(open(cfg.model.vocab_path, "r")) if self.language_type == "clip": self.clip_head = clip.load("ViT-B/16") self.text_embeds = self.clip_head.encode_text(clip.tokenize(self.int2cat)).detach() elif self.language_type == "bert": self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True) self.bert_config = BertConfig( hidden_size=768, num_hidden_layers=3, num_attention=12, type_vocab_size=2 ) self.model = BertModel.from_pretrained("bert-base-uncased", config=self.bert_config) self.encoded_input = self.tokenizer( self.int2cat, padding=True, truncation=True, add_special_tokens=True, return_tensors="pt" ) self.text_embeds = self.model(**self.encoded_input).last_hidden_state self.text_embeds = self.text_embeds.detach() else: raise NotImplementedError self.dropout = nn.Dropout(0.1) def forward(self, data_dict): # prepare dict if 'cur_step' not in data_dict.keys(): 
================================================
FILE: model/openvocab.py
================================================
import numpy as np
import torch
import torch.nn as nn
from einops import einsum

from model.build import MODEL_REGISTRY, BaseModel
from modules.build import build_module
from optim.utils import no_decay_param_group


@MODEL_REGISTRY.register()
class OpenVocab(BaseModel):
    def __init__(self, cfg):
        super().__init__(cfg)
        self.cfg = cfg
        self.lang_encoder = build_module("language", self.cfg.model.language)
        self.point_encoder = build_module("vision", self.cfg.model.vision)
        self.unified_encoder = build_module("grounding", self.cfg.model.grounding)
        self.head_list = self.cfg.model.heads.head_list
        for head in self.head_list:
            setattr(self, head, build_module("heads", getattr(self.cfg.model.heads, head)))
        self.use_scene_cap = self.cfg.data.args.get("use_scene_cap", False)
        if self.use_scene_cap:
            self.object_pool = lambda x: x.mean(dim=1)

    def forward(self, data_dict):
        # prepare dict
        if 'cur_step' not in data_dict.keys():
            data_dict['cur_step'] = 1
            data_dict['total_steps'] = 1

        # basic feature extractor
        # point_features_pre_spatial is point features before spatial reasoning
        lang_basic_features = self.lang_encoder(data_dict['txt_ids'], data_dict['txt_masks'])
        if self.use_scene_cap:
            scene_txt_ids = data_dict['scene_txt_ids']
            scene_txt_masks = data_dict['scene_txt_masks']
            scene_lang_basic_features = self.lang_encoder(scene_txt_ids, scene_txt_masks)
            data_dict['scene_text_embed'] = scene_lang_basic_features[:, 0]
        if "Scene" not in self.cfg.model.vision.name:
            point_basic_features, point_features_pre, obj_cls_raw_logits = self.point_encoder(
                data_dict['obj_fts'].float(), data_dict['obj_locs'], data_dict['obj_masks'],
                data_dict['obj_sem_masks'], data_dict['obj_labels'],
                data_dict['cur_step'], data_dict['total_steps'])
        else:
            point_basic_features, point_features_pre, obj_cls_raw_logits = self.point_encoder(data_dict)

        if self.use_scene_cap:
            scene_feature = self.object_pool(point_basic_features)
            data_dict["scene_embed"] = scene_feature

        if self.cfg.model.inter == "before":
            data_dict["inter_text_embed"] = lang_basic_features[:, 0]
            data_dict["inter_obj_embeds"] = point_basic_features

        # unified language entity transformer
        language_fuse_feature, point_fuse_feature = 
self.unified_encoder(lang_basic_features, data_dict['txt_masks'], point_basic_features, data_dict['obj_locs'], data_dict['obj_masks']) if self.cfg.model.inter != "before": data_dict["inter_text_embed"] = language_fuse_feature[:, 0] data_dict["inter_obj_embeds"] = point_fuse_feature # # TODO: check if this is correct and if an additional mlp head is needed language_summarize_feature = language_fuse_feature[:, 0] data_dict["intra_text_embed"] = language_summarize_feature data_dict["intra_obj_embeds"] = point_fuse_feature data_dict['obj_cls_raw_logits'] = obj_cls_raw_logits data_dict['og3d_logits'] = einsum(point_fuse_feature, language_summarize_feature, "b o d, b d -> b o") # task head if getattr(self, "ground_head", None) is not None: txt_cls_logits, obj_cls_post_logits, obj_cls_pre_logits, og3d_logits = self.ground_head(language_fuse_feature, point_fuse_feature, point_features_pre, data_dict['obj_masks']) data_dict['txt_cls_logits'] = txt_cls_logits data_dict['obj_cls_post_logits'] = obj_cls_post_logits data_dict['obj_cls_pre_logits'] = obj_cls_pre_logits # reload og3d_logits for head concatenated finetuning data_dict['og3d_logits'] = og3d_logits if getattr(self, "qa_head", None) is not None: answer_scores = self.qa_head(point_fuse_feature, data_dict['obj_masks'], language_fuse_feature, data_dict['txt_masks']) data_dict['answer_scores'] = answer_scores if getattr(self, "pretrain_head", None) is not None: output = self.pretrain_head(language_fuse_feature, point_fuse_feature) if isinstance(output, tuple): txt_lm_cls_logits, obj_lm_cls_logits = output data_dict['obj_cls_post_logits'] = obj_lm_cls_logits else: txt_lm_cls_logits = output data_dict['txt_lm_cls_logits'] = txt_lm_cls_logits return data_dict def get_opt_params(self): def get_lr(cfg, default_lr): return default_lr if cfg.get("lr") is None else cfg.get("lr") optimizer_grouped_parameters = [] optimizer_grouped_parameters += no_decay_param_group(self.lang_encoder.named_parameters(), get_lr(self.cfg.model.language, self.cfg.solver.lr)) optimizer_grouped_parameters += no_decay_param_group(self.point_encoder.named_parameters(), get_lr(self.cfg.model.vision, self.cfg.solver.lr)) optimizer_grouped_parameters += no_decay_param_group(self.unified_encoder.named_parameters(), get_lr(self.cfg.model.grounding, self.cfg.solver.lr)) if "ground_head" in self.head_list: optimizer_grouped_parameters += no_decay_param_group( self.ground_head.named_parameters(), get_lr(self.cfg.model.heads.ground_head, self.cfg.solver.lr) ) if "qa_head" in self.head_list: optimizer_grouped_parameters += no_decay_param_group( self.qa_head.named_parameters(), get_lr(self.cfg.model.heads.qa_head, self.cfg.solver.lr) ) if "pretrain_head" in self.head_list: optimizer_grouped_parameters += no_decay_param_group( self.pretrain_head.named_parameters(), get_lr(self.cfg.model.heads.pretrain_head, self.cfg.solver.lr) ) return optimizer_grouped_parameters @MODEL_REGISTRY.register() class OpenVocabPerScene(BaseModel): def __init__(self, cfg): super().__init__(cfg) self.cfg = cfg self.lang_encoder = build_module("language", self.cfg.model.language) self.point_encoder = build_module("vision", self.cfg.model.vision) self.unified_encoder = build_module("grounding", self.cfg.model.grounding) self.head_list = self.cfg.model.heads.head_list for head in self.head_list: setattr(self, head, build_module("heads", getattr(self.cfg.model.heads, head))) def forward(self, data_dict): # prepare dict if 'cur_step' not in data_dict.keys(): data_dict['cur_step'] = 1 data_dict['total_steps'] = 1 
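        # Per-scene mode: 'txt_ids' arrives as (B, L, T) with L sentences per
        # scene. The scene's object features are shared by all L queries, so
        # below they are repeated along L and both streams are flattened to a
        # (B * L, ...) batch before the unified encoder.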
use_per_scene = (len(data_dict['txt_ids'].shape) == 3)
        if use_per_scene:
            B, L, _ = data_dict['txt_ids'].shape
            B, O = data_dict['obj_masks'].shape

        # basic feature extractor
        # point_features_pre_spatial is point features before spatial reasoning
        txt_ids = data_dict['txt_ids'].view(B * L, -1) if use_per_scene else data_dict['txt_ids']
        txt_masks = data_dict['txt_masks'].view(B * L, -1) if use_per_scene else data_dict['txt_masks']
        lang_basic_features = self.lang_encoder(txt_ids, txt_masks)  # (B, L), D

        if "Scene" not in self.cfg.model.vision.name:
            point_basic_features, point_features_pre, obj_cls_raw_logits = self.point_encoder(
                data_dict['obj_fts'].float(), data_dict['obj_locs'], data_dict['obj_masks'],
                data_dict['obj_sem_masks'], data_dict['obj_labels'],
                data_dict['cur_step'], data_dict['total_steps'])
        else:
            point_basic_features, point_features_pre, obj_cls_raw_logits = self.point_encoder(data_dict)
        point_basic_features = point_basic_features.unsqueeze(1).repeat(1, L, 1, 1) \
            if use_per_scene else point_basic_features
        point_basic_features = point_basic_features.view(B * L, O, point_basic_features.shape[-1]) \
            if use_per_scene else point_basic_features

        if use_per_scene:
            obj_locs = data_dict['obj_locs'].unsqueeze(1).repeat(1, L, 1, 1)
            obj_locs = obj_locs.view(B * L, O, obj_locs.shape[-1])
            obj_masks = data_dict['obj_masks'].unsqueeze(1).repeat(1, L, 1)
            obj_masks = obj_masks.view(B * L, O)
        else:
            obj_locs = data_dict['obj_locs']
            obj_masks = data_dict['obj_masks']

        if self.cfg.model.inter == "before":
            data_dict["inter_text_embed"] = lang_basic_features[:, 0]
            data_dict["inter_obj_embeds"] = point_basic_features

        # unified language entity transformer
        language_fuse_feature, point_fuse_feature = self.unified_encoder(lang_basic_features, txt_masks,
                                                                         point_basic_features, obj_locs, obj_masks)
        if self.cfg.model.inter != "before":
            data_dict["inter_text_embed"] = language_fuse_feature[:, 0]
            data_dict["inter_obj_embeds"] = point_fuse_feature

        # # TODO: check if this is correct and if an additional mlp head is needed
        language_summarize_feature = language_fuse_feature[:, 0]
        data_dict["intra_text_embed"] = language_summarize_feature
        data_dict["intra_obj_embeds"] = point_fuse_feature
        data_dict['obj_cls_raw_logits'] = obj_cls_raw_logits
        data_dict['og3d_logits'] = einsum(point_fuse_feature, language_summarize_feature, "b o d, b d -> b o")
        if use_per_scene:
            data_dict['og3d_logits'] = data_dict['og3d_logits'].view(B, L, O)

        # # task head
        # if getattr(self, "ground_head", None) is not None:
        #     txt_cls_logits, obj_cls_post_logits, obj_cls_pre_logits, og3d_logits = self.ground_head(
        #         language_fuse_feature, point_fuse_feature, point_features_pre, data_dict['obj_masks'])
        #     data_dict['txt_cls_logits'] = txt_cls_logits
        #     data_dict['obj_cls_post_logits'] = obj_cls_post_logits
        #     data_dict['obj_cls_pre_logits'] = obj_cls_pre_logits
        #     # reload og3d_logits for head concatenated finetuning
        #     data_dict['og3d_logits'] = og3d_logits
        if getattr(self, "qa_head", None) is not None:
            # use the (possibly flattened) local masks so shapes match point_fuse_feature
            answer_scores = self.qa_head(point_fuse_feature, obj_masks,
                                         language_fuse_feature, txt_masks)
            data_dict['answer_scores'] = answer_scores
        if getattr(self, "pretrain_head", None) is not None:
            output = self.pretrain_head(language_fuse_feature, point_fuse_feature)
            if isinstance(output, tuple):
                txt_lm_cls_logits, obj_lm_cls_logits = output
                data_dict['obj_cls_post_logits'] = obj_lm_cls_logits
            else:
                txt_lm_cls_logits = output
            data_dict['txt_lm_cls_logits'] = txt_lm_cls_logits
        return data_dict

    def get_opt_params(self):
        def get_lr(cfg, 
default_lr): return default_lr if cfg.get("lr") is None else cfg.get("lr") optimizer_grouped_parameters = [] optimizer_grouped_parameters += no_decay_param_group(self.lang_encoder.named_parameters(), get_lr(self.cfg.model.language, self.cfg.solver.lr)) optimizer_grouped_parameters += no_decay_param_group(self.point_encoder.named_parameters(), get_lr(self.cfg.model.vision, self.cfg.solver.lr)) optimizer_grouped_parameters += no_decay_param_group(self.unified_encoder.named_parameters(), get_lr(self.cfg.model.grounding, self.cfg.solver.lr)) if "ground_head" in self.head_list: optimizer_grouped_parameters += no_decay_param_group( self.ground_head.named_parameters(), get_lr(self.cfg.model.heads.ground_head, self.cfg.solver.lr) ) if "qa_head" in self.head_list: optimizer_grouped_parameters += no_decay_param_group( self.qa_head.named_parameters(), get_lr(self.cfg.model.heads.qa_head, self.cfg.solver.lr) ) if "pretrain_head" in self.head_list: optimizer_grouped_parameters += no_decay_param_group( self.pretrain_head.named_parameters(), get_lr(self.cfg.model.heads.pretrain_head, self.cfg.solver.lr) ) return optimizer_grouped_parameters ================================================ FILE: modules/__init__.py ================================================ from .language import * from .vision import * from .grounding import * from .heads import * ================================================ FILE: modules/build.py ================================================ from fvcore.common.registry import Registry from common.type_utils import cfg2dict VISION_REGISTRY = Registry("vision") LANGUAGE_REGISTRY = Registry("language") GROUNDING_REGISTRY = Registry("grounding") HEADS_REGISTRY = Registry("heads") def build_module(module_type, cfg): if module_type == "vision": return VISION_REGISTRY.get(cfg.name)(cfg, **cfg2dict(cfg.args)) elif module_type == "language": return LANGUAGE_REGISTRY.get(cfg.name)(cfg, **cfg2dict(cfg.args)) elif module_type == "grounding": return GROUNDING_REGISTRY.get(cfg.name)(cfg, **cfg2dict(cfg.args)) elif module_type == "heads": return HEADS_REGISTRY.get(cfg.name)(cfg, **cfg2dict(cfg.args)) else: raise NotImplementedError(f"module type {module_type} not implemented") def build_module_by_name(cfg): module_registries = [VISION_REGISTRY, LANGUAGE_REGISTRY, GROUNDING_REGISTRY, HEADS_REGISTRY] for registry in module_registries: if cfg.name in registry: print(f"Using {cfg.name} module from Registry {registry._name}") kwargs = cfg2dict(cfg.args) if hasattr(cfg, "args") else {} return registry.get(cfg.name)(cfg, **kwargs) raise NotImplementedError(f"Unknown module: {cfg.name}") ================================================ FILE: modules/grounding/__init__.py ================================================ from .unified_encoder import * ================================================ FILE: modules/grounding/unified_encoder.py ================================================ import torch import torch.nn as nn from modules.build import GROUNDING_REGISTRY from modules.layers.transformers import (TransformerDecoderLayer, TransformerEncoderLayer, TransformerSpatialDecoderLayer) from modules.utils import layer_repeat, calc_pairwise_locs from modules.weights import _init_weights_bert @GROUNDING_REGISTRY.register() class EntitySpatialCrossEncoder(nn.Module): """ spatial_dim: spatial feature dim, used to modify attention dim_loc: """ def __init__(self, cfg, hidden_size=768, num_attention_heads=12, spatial_dim=5, num_layers=4, dim_loc=6, pairwise_rel_type='center'): super().__init__() 
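        # Spatial decoder stack: object self-attention is modulated by pairwise
        # location features ('cond' fusion predicts per-head spatial weights
        # from the query embeddings), while cross-attention reads the text.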
decoder_layer = TransformerSpatialDecoderLayer(hidden_size, num_attention_heads, dim_feedforward=2048, dropout=0.1, activation='gelu', spatial_dim=spatial_dim, spatial_multihead=True, spatial_attn_fusion='cond') self.layers = layer_repeat(decoder_layer, num_layers) loc_layer = nn.Sequential( nn.Linear(dim_loc, hidden_size), nn.LayerNorm(hidden_size), ) self.loc_layers = layer_repeat(loc_layer, 1) self.pairwise_rel_type = pairwise_rel_type self.spatial_dim = spatial_dim self.spatial_dist_norm = True self.apply(_init_weights_bert) def forward( self, txt_embeds, txt_masks, obj_embeds, obj_locs, obj_masks, output_attentions=False, output_hidden_states=False, **kwargs ): pairwise_locs = calc_pairwise_locs( obj_locs[:, :, :3], obj_locs[:, :, 3:], pairwise_rel_type=self.pairwise_rel_type ) out_embeds = obj_embeds for i, layer in enumerate(self.layers): query_pos = self.loc_layers[0](obj_locs) out_embeds = out_embeds + query_pos out_embeds, self_attn_matrices, cross_attn_matrices = layer( out_embeds, txt_embeds, pairwise_locs, tgt_key_padding_mask=obj_masks.logical_not(), memory_key_padding_mask=txt_masks.logical_not(), ) return txt_embeds, out_embeds @GROUNDING_REGISTRY.register() class UnifiedSpatialCrossEncoderV1(nn.Module): """ spatial_dim: spatial feature dim, used to modify attention dim_loc: """ def __init__(self, cfg, hidden_size=768, num_attention_heads=12, spatial_dim=5, num_layers=4, dim_loc=6, pairwise_rel_type='center'): super().__init__() pc_encoder_layer = TransformerSpatialDecoderLayer(hidden_size, num_attention_heads, dim_feedforward=2048, dropout=0.1, activation='gelu', spatial_dim=spatial_dim, spatial_multihead=True, spatial_attn_fusion='cond') lang_encoder_layer = TransformerDecoderLayer(hidden_size, num_attention_heads) self.pc_encoder = layer_repeat(pc_encoder_layer, num_layers) self.lang_encoder = layer_repeat(lang_encoder_layer, num_layers) loc_layer = nn.Sequential( nn.Linear(dim_loc, hidden_size), nn.LayerNorm(hidden_size), ) self.loc_layers = layer_repeat(loc_layer, 1) self.pairwise_rel_type = pairwise_rel_type self.spatial_dim = spatial_dim self.spatial_dist_norm = True self.apply(_init_weights_bert) def forward( self, txt_embeds, txt_masks, obj_embeds, obj_locs, obj_masks, output_attentions=False, output_hidden_states=False, **kwargs ): pairwise_locs = calc_pairwise_locs( obj_locs[:, :, :3], obj_locs[:, :, 3:], pairwise_rel_type=self.pairwise_rel_type ) for i, (pc_layer, lang_layer) in enumerate(zip(self.pc_encoder, self.lang_encoder)): query_pos = self.loc_layers[0](obj_locs) obj_embeds = obj_embeds + query_pos obj_embeds_out, self_attn_matrices, cross_attn_matrices = pc_layer( obj_embeds, txt_embeds, pairwise_locs, tgt_key_padding_mask=obj_masks.logical_not(), memory_key_padding_mask=txt_masks.logical_not(), ) txt_embeds_out, self_attn_matrices, cross_attn_matrices = lang_layer( txt_embeds, obj_embeds, tgt_key_padding_mask=txt_masks.logical_not(), memory_key_padding_mask=obj_masks.logical_not(), ) obj_embeds = obj_embeds_out txt_embeds = txt_embeds_out return txt_embeds, obj_embeds @GROUNDING_REGISTRY.register() class UnifiedSpatialCrossEncoderV2(nn.Module): """ spatial_dim: spatial feature dim, used to modify attention dim_loc: """ def __init__(self, cfg, hidden_size=768, dim_feedforward=2048, num_attention_heads=12, num_layers=4, dim_loc=6): super().__init__() # unfied encoder unified_encoder_layer = TransformerEncoderLayer(hidden_size, num_attention_heads, dim_feedforward=dim_feedforward) self.unified_encoder = layer_repeat(unified_encoder_layer, num_layers) # loc 
layer
        loc_layer = nn.Sequential(
            nn.Linear(dim_loc, hidden_size),
            nn.LayerNorm(hidden_size),
        )
        self.loc_layers = layer_repeat(loc_layer, 1)
        # token embedding
        self.token_type_embeddings = nn.Embedding(2, hidden_size)
        self.apply(_init_weights_bert)

    def forward(
            self, txt_embeds, txt_masks, obj_embeds, obj_locs, obj_masks,
            output_attentions=False, output_hidden_states=False, **kwargs
    ):
        txt_len = txt_embeds.shape[1]
        obj_len = obj_embeds.shape[1]

        for i, unified_layer in enumerate(self.unified_encoder):
            # add embeddings for points (device-aware instead of hard-coded .cuda())
            query_pos = self.loc_layers[0](obj_locs)
            pc_token_type_ids = torch.ones(obj_embeds.shape[0:2], dtype=torch.long, device=obj_embeds.device)
            pc_type_embeds = self.token_type_embeddings(pc_token_type_ids)
            obj_embeds = obj_embeds + query_pos + pc_type_embeds

            # add embeddings for languages
            lang_token_type_ids = torch.zeros(txt_embeds.shape[0:2], dtype=torch.long, device=txt_embeds.device)
            lang_type_embeds = self.token_type_embeddings(lang_token_type_ids)
            txt_embeds = txt_embeds + lang_type_embeds

            # fuse embeddings
            joint_embeds = torch.cat((txt_embeds, obj_embeds), dim=1)
            joint_masks = torch.cat((txt_masks, obj_masks), dim=1)

            # transformer
            joint_embeds, self_attn_matrices = unified_layer(joint_embeds,
                                                             tgt_key_padding_mask=joint_masks.logical_not())

            # split
            txt_embeds, obj_embeds = torch.split(joint_embeds, [txt_len, obj_len], dim=1)

        return txt_embeds, obj_embeds


if __name__ == '__main__':
    x = UnifiedSpatialCrossEncoderV2(None).cuda()  # cfg is unused in __init__
    txt_embeds = torch.zeros((3, 10, 768)).cuda()
    txt_masks = torch.ones((3, 10)).cuda()
    obj_embeds = torch.zeros((3, 10, 768)).cuda()
    obj_locs = torch.ones((3, 10, 6)).cuda()
    obj_masks = torch.ones((3, 10)).cuda()
    x(txt_embeds, txt_masks, obj_embeds, obj_locs, obj_masks)


================================================
FILE: modules/heads/__init__.py
================================================
from .grounding_head import *
from .pretrain_head import *
from .qa_head import *


================================================
FILE: modules/heads/grounding_head.py
================================================
import torch.nn as nn

from modules.build import HEADS_REGISTRY
from modules.utils import get_mlp_head


@HEADS_REGISTRY.register()
class GroundHeadV1(nn.Module):
    def __init__(self, cfg, input_size=768, hidden_size=768, sem_cls_size=607,
                 dropout=0.3, detach_all_aux_loss=False):
        super().__init__()
        self.og3d_head = get_mlp_head(
            input_size, hidden_size, 1, dropout=dropout
        )
        self.txt_clf_head = get_mlp_head(
            input_size, hidden_size, sem_cls_size, dropout=dropout
        )
        self.obj3d_clf_head = get_mlp_head(
            input_size, hidden_size, sem_cls_size, dropout=dropout
        )
        self.obj3d_clf_pre_head = get_mlp_head(
            input_size, hidden_size, sem_cls_size, dropout=dropout
        )
        self.detach_all_aux_loss = detach_all_aux_loss

    def forward(self, txt_embeds, obj_embeds, obj_pre_embeds, obj_masks, **kwargs):
        og3d_logits = self.og3d_head(obj_embeds).squeeze(2)
        og3d_logits = og3d_logits.masked_fill_(obj_masks.logical_not(), -float('inf'))
        if self.detach_all_aux_loss:
            txt_embeds = txt_embeds.detach()
            obj_embeds = obj_embeds.detach()
            obj_pre_embeds = obj_pre_embeds.detach()
        txt_cls_logits = self.txt_clf_head(txt_embeds[:, 0])
        obj_cls_logits = self.obj3d_clf_head(obj_embeds)
        obj_cls_pre_logits = self.obj3d_clf_pre_head(obj_pre_embeds)
        return txt_cls_logits, obj_cls_logits, obj_cls_pre_logits, og3d_logits


@HEADS_REGISTRY.register()
class GroundHead(nn.Module):
    def __init__(self, cfg, input_size=768, hidden_size=768, dropout=0.3):
        super().__init__()
        self.og3d_head = get_mlp_head(
            input_size, hidden_size, 1, dropout=dropout
        )

    def forward(self, 
obj_embeds, obj_masks=None, **kwargs): og3d_logits = self.og3d_head(obj_embeds).squeeze(2) if obj_masks is not None: og3d_logits = og3d_logits.masked_fill_(obj_masks.logical_not(), -float('inf')) return og3d_logits ================================================ FILE: modules/heads/pretrain_head.py ================================================ import torch import torch.nn as nn from modules.build import HEADS_REGISTRY from modules.utils import get_activation_fn class BertPredictionHeadTransform(nn.Module): def __init__(self, hidden_size, hidden_act='gelu'): super().__init__() self.dense = nn.Linear(hidden_size, hidden_size) self.transform_act_fn = get_activation_fn(hidden_act) self.LayerNorm = nn.LayerNorm(hidden_size) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states class BertLMPredictionHead(nn.Module): def __init__(self, hidden_size, vocab_size): super().__init__() self.transform = BertPredictionHeadTransform(hidden_size=hidden_size, hidden_act='gelu') self.decoder = nn.Linear(hidden_size, vocab_size, bias=False) self.bias = nn.Parameter(torch.zeros(vocab_size)) def forward(self, hidden_states): hidden_states = self.transform(hidden_states) hidden_states = self.decoder(hidden_states) + self.bias return hidden_states @HEADS_REGISTRY.register() class PretrainHeadV1(nn.Module): def __init__(self, cfg, hidden_size=768, vocab_size=30522): super().__init__() self.lm_pred_head = BertLMPredictionHead(hidden_size, vocab_size) def forward(self, txt_embeds, **kwargs): txt_lm_cls_logits = self.lm_pred_head(txt_embeds) return txt_lm_cls_logits @HEADS_REGISTRY.register() class OVPretrainHead(nn.Module): def __init__(self, cfg, hidden_size=768, vocab_size=30522, obj_vocab_size=607): super().__init__() self.lm_pred_head = BertLMPredictionHead(hidden_size, vocab_size) self.obj_pred_head = BertLMPredictionHead(hidden_size, obj_vocab_size) def forward(self, txt_embeds, obj_embeds, **kwargs): txt_lm_cls_logits = self.lm_pred_head(txt_embeds) obj_lm_cls_logits = self.obj_pred_head(obj_embeds) return (txt_lm_cls_logits, obj_lm_cls_logits) ================================================ FILE: modules/heads/qa_head.py ================================================ import torch import torch.nn.functional as F from torch import nn from modules.build import HEADS_REGISTRY class FC(nn.Module): def __init__(self, in_size, out_size, pdrop=0., use_gelu=True): super(FC, self).__init__() self.pdrop = pdrop self.use_gelu = use_gelu self.linear = nn.Linear(in_size, out_size) if use_gelu: # self.relu = nn.Relu(inplace=True) self.gelu = nn.GELU() if pdrop > 0: self.dropout = nn.Dropout(pdrop) def forward(self, x): x = self.linear(x) if self.use_gelu: # x = self.relu(x) x = self.gelu(x) if self.pdrop > 0: x = self.dropout(x) return x class MLP(nn.Module): def __init__(self, in_size, mid_size, out_size, pdrop=0., use_gelu=True): super().__init__() self.fc = FC(in_size, mid_size, pdrop=pdrop, use_gelu=use_gelu) self.linear = nn.Linear(mid_size, out_size) def forward(self, x): return self.linear(self.fc(x)) class AttFlat(nn.Module): def __init__(self, hidden_size, flat_mlp_size=512, flat_glimpses=1, flat_out_size=1024, pdrop=0.1): super().__init__() self.mlp = MLP( in_size=hidden_size, mid_size=flat_mlp_size, out_size=flat_glimpses, pdrop=pdrop, use_gelu=True ) self.flat_glimpses = flat_glimpses self.linear_merge = nn.Linear( hidden_size * flat_glimpses, flat_out_size ) def 
forward(self, x, x_mask): att = self.mlp(x) if x_mask is not None: # att = att.masked_fill(x_mask.squeeze(1).squeeze(1).unsqueeze(2), -1e9) att = att.masked_fill(x_mask.unsqueeze(2), -1e9) att = F.softmax(att, dim=1) att_list = [] for i in range(self.flat_glimpses): att_list.append( torch.sum(att[:, :, i: i + 1] * x, dim=1) ) x_atted = torch.cat(att_list, dim=1) x_atted = self.linear_merge(x_atted) return x_atted @HEADS_REGISTRY.register() class QAHeadV1(nn.Module): def __init__(self, cfg, hidden_size=768, mlp_size=256, glimpse=1, flat_out_size=512, num_answers=8864): super().__init__() self.attflat_visual = AttFlat(hidden_size, mlp_size, glimpse, flat_out_size, 0.1) self.attflat_lang = AttFlat(hidden_size, mlp_size, glimpse, flat_out_size, 0.1) self.answer_cls = nn.Sequential( nn.Linear(flat_out_size, hidden_size), nn.GELU(), nn.Dropout(0.3), nn.Linear(hidden_size, num_answers) ) self.fusion_norm = nn.LayerNorm(flat_out_size) def forward(self, obj_embeds, obj_masks, txt_embeds, txt_masks, **kwargs): object_feat = self.attflat_visual(obj_embeds, obj_masks.logical_not()) lang_feat = self.attflat_lang(txt_embeds, txt_masks.logical_not()) fuse_feat = self.fusion_norm(lang_feat + object_feat) answer_scores = self.answer_cls(fuse_feat) return answer_scores ================================================ FILE: modules/language/__init__.py ================================================ from .bert import * from .clip import * ================================================ FILE: modules/language/bert.py ================================================ import torch.nn as nn from transformers import BertConfig, BertModel, BertTokenizer from modules.build import LANGUAGE_REGISTRY @LANGUAGE_REGISTRY.register() class BERTLanguageEncoder(nn.Module): def __init__(self, cfg, weights="bert-base-uncased", hidden_size=768, num_hidden_layers=4, num_attention_heads=12, type_vocab_size=2): super().__init__() self.tokenizer = BertTokenizer.from_pretrained( weights, do_lower_case=True ) self.bert_config = BertConfig( hidden_size=hidden_size, num_hidden_layers=num_hidden_layers, num_attention_heads=num_attention_heads, type_vocab_size=type_vocab_size ) self.model = BertModel.from_pretrained( weights, config=self.bert_config ) def forward(self, txt_ids, txt_masks, **kwargs): return self.model(txt_ids, txt_masks).last_hidden_state ================================================ FILE: modules/language/clip.py ================================================ from contextlib import nullcontext import torch import torch.nn as nn from transformers import CLIPTextModelWithProjection from modules.build import LANGUAGE_REGISTRY from modules.utils import get_mlp_head @LANGUAGE_REGISTRY.register() class CLIPLanguageEncoder(nn.Module): def __init__(self, cfg, weights="openai/clip-vit-large-patch14", output_dim=768, freeze_backbone=True, use_projection=False, dropout=0.1): super().__init__() self.context = torch.no_grad if freeze_backbone else nullcontext self.model = CLIPTextModelWithProjection.from_pretrained(weights) self.use_projection = use_projection if use_projection: self.projection = get_mlp_head(self.model.config.hidden_size, output_dim, output_dim, dropout=dropout) #self.attention = nn.MultiheadAttention(embed_dim=768, num_heads=12, batch_first=True) def forward(self, txt_ids, txt_masks): with self.context(): txt = self.model(txt_ids, txt_masks).last_hidden_state txt = self.model.text_projection(txt) txt = torch.nn.functional.normalize(txt, p=2, dim=2) #txt = self.attention(txt, txt, txt, 
key_padding_mask=txt_masks.logical_not())[0]
        if self.use_projection:
            txt = self.projection(txt)
        return txt


================================================
FILE: modules/layers/pointnet.py
================================================
import torch.nn as nn

from modules.third_party.pointnet2.pointnet2_modules import PointnetSAModule


def break_up_pc(pc):
    """
    Split the pointcloud into xyz positions and features tensors.
    This method is taken from VoteNet codebase (https://github.com/facebookresearch/votenet)
    @param pc: pointcloud [N, 3 + C]
    :return: the xyz tensor and the feature tensor
    """
    xyz = pc[..., 0:3].contiguous()
    features = (
        pc[..., 3:].transpose(1, 2).contiguous()
        if pc.size(-1) > 3 else None
    )
    return xyz, features


class PointNetPP(nn.Module):
    """
    Pointnet++ encoder.
    For the hyper parameters please refer to the paper (https://arxiv.org/abs/1706.02413)
    """

    def __init__(self, sa_n_points: list, sa_n_samples: list, sa_radii: list, sa_mlps: list,
                 bn=True, use_xyz=True):
        super().__init__()

        n_sa = len(sa_n_points)
        if not (n_sa == len(sa_n_samples) == len(sa_radii) == len(sa_mlps)):
            raise ValueError('Lens of given hyper-params are not compatible')

        self.encoder = nn.ModuleList()
        for i in range(n_sa):
            self.encoder.append(PointnetSAModule(
                npoint=sa_n_points[i],
                nsample=sa_n_samples[i],
                radius=sa_radii[i],
                mlp=sa_mlps[i],
                bn=bn,
                use_xyz=use_xyz,
            ))

        out_n_points = sa_n_points[-1] if sa_n_points[-1] is not None else 1
        self.fc = nn.Linear(out_n_points * sa_mlps[-1][-1], sa_mlps[-1][-1])

    def forward(self, features):
        """
        @param features: B x N_objects x N_Points x 3 + C
        """
        xyz, features = break_up_pc(features)
        for i in range(len(self.encoder)):
            xyz, features = self.encoder[i](xyz, features)
        return self.fc(features.view(features.size(0), -1))


================================================
FILE: modules/layers/transformers.py
================================================
from typing import Optional

import einops
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor, nn

from modules.utils import get_activation_fn


class CrossAttentionLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", k_dim=None, v_dim=None, prenorm=True):
        super().__init__()
        if k_dim is None:
            k_dim = d_model
        if v_dim is None:
            v_dim = d_model
        self.prenorm = prenorm
        self.multihead_attn = nn.MultiheadAttention(
            d_model, nhead, dropout=dropout, batch_first=True, kdim=k_dim, vdim=v_dim
        )
        # Implementation of Feedforward modules
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = get_activation_fn(activation)

    def forward(
            self, tgt, memory,
            tgt_mask: Optional[Tensor] = None,
            memory_mask: Optional[Tensor] = None,
            tgt_key_padding_mask: Optional[Tensor] = None,
            memory_key_padding_mask: Optional[Tensor] = None,
    ):
        tgt2 = tgt
        if self.prenorm:
            tgt2 = self.norm1(tgt2)
        tgt2, cross_attn_matrices = self.multihead_attn(
            query=tgt2, key=memory, value=memory,
            attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask
        )
        tgt = tgt + self.dropout2(tgt2)
        if not self.prenorm:
            tgt = self.norm1(tgt)
        # FFN reads the residual stream; norm3 is applied first in the pre-norm case
        tgt2 = self.norm3(tgt) if self.prenorm else tgt
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        if not 
self.prenorm: tgt = self.norm3(tgt) return tgt, cross_attn_matrices class TransformerDecoderLayer(nn.Module): def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"): super().__init__() self.self_attn = nn.MultiheadAttention( d_model, nhead, dropout=dropout, batch_first=True ) self.multihead_attn = nn.MultiheadAttention( d_model, nhead, dropout=dropout, batch_first=True ) # Implementation of Feedforward modules self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.dropout3 = nn.Dropout(dropout) self.activation = get_activation_fn(activation) def forward( self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, ): tgt2 = self.norm1(tgt) tgt2, self_attn_matrices = self.self_attn( query=tgt2, key=tgt2, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask ) tgt = tgt + self.dropout1(tgt2) tgt2 = self.norm2(tgt) tgt2, cross_attn_matrices = self.multihead_attn( query=tgt2, key=memory, value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask ) tgt = tgt + self.dropout2(tgt2) tgt2 = self.norm3(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) tgt = tgt + self.dropout3(tgt2) return tgt, self_attn_matrices, cross_attn_matrices class TransformerEncoderLayer(nn.Module): def __init__(self, d_model, nhead, dim_feedforward=2048, batch_first=True, dropout=0.1, activation="relu", prenorm=False): super().__init__() self.self_attn = nn.MultiheadAttention( d_model, nhead, dropout=dropout, batch_first=batch_first ) # Implementation of Feedforward modules self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.activation = get_activation_fn(activation) self.prenorm = prenorm def forward( self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, ): tgt2 = tgt if self.prenorm: tgt2 = self.norm1(tgt2) tgt2, self_attn_matrices = self.self_attn( query=tgt2, key=tgt2, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask ) tgt = tgt + self.dropout1(tgt2) if not self.prenorm: tgt = self.norm1(tgt) if self.prenorm: tgt = self.norm2(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout2(tgt2) if not self.prenorm: tgt = self.norm2(tgt) return tgt, self_attn_matrices class MultiHeadAttentionSpatial(nn.Module): def __init__( self, d_model, n_head, dropout=0.1, spatial_multihead=True, spatial_dim=5, spatial_attn_fusion='mul', ): super().__init__() assert d_model % n_head == 0, 'd_model: %d, n_head: %d' % (d_model, n_head) self.n_head = n_head self.d_model = d_model self.d_per_head = d_model // n_head self.spatial_multihead = spatial_multihead self.spatial_dim = spatial_dim self.spatial_attn_fusion = spatial_attn_fusion self.w_qs = nn.Linear(d_model, d_model) self.w_ks = nn.Linear(d_model, d_model) self.w_vs = nn.Linear(d_model, d_model) self.fc = nn.Linear(d_model, d_model) self.spatial_n_head = n_head if 
spatial_multihead else 1 if self.spatial_attn_fusion in ['mul', 'bias', 'add']: self.pairwise_loc_fc = nn.Linear(spatial_dim, self.spatial_n_head) elif self.spatial_attn_fusion == 'ctx': self.pairwise_loc_fc = nn.Linear(spatial_dim, d_model) elif self.spatial_attn_fusion == 'cond': self.lang_cond_fc = nn.Linear(d_model, self.spatial_n_head * (spatial_dim + 1)) else: raise NotImplementedError('unsupported spatial_attn_fusion %s' % (self.spatial_attn_fusion)) def forward(self, q, k, v, pairwise_locs, key_padding_mask=None, txt_embeds=None): residual = q q = einops.rearrange(self.w_qs(q), 'b l (head k) -> head b l k', head=self.n_head) k = einops.rearrange(self.w_ks(k), 'b t (head k) -> head b t k', head=self.n_head) v = einops.rearrange(self.w_vs(v), 'b t (head v) -> head b t v', head=self.n_head) attn = torch.einsum('hblk,hbtk->hblt', q, k) / np.sqrt(q.shape[-1]) if self.spatial_attn_fusion in ['mul', 'bias', 'add']: loc_attn = self.pairwise_loc_fc(pairwise_locs) loc_attn = einops.rearrange(loc_attn, 'b l t h -> h b l t') if self.spatial_attn_fusion == 'mul': loc_attn = F.relu(loc_attn) if not self.spatial_multihead: loc_attn = einops.repeat(loc_attn, 'h b l t -> (h nh) b l t', nh=self.n_head) elif self.spatial_attn_fusion == 'ctx': loc_attn = self.pairwise_loc_fc(pairwise_locs) loc_attn = einops.rearrange(loc_attn, 'b l t (h k) -> h b l t k', h=self.n_head) loc_attn = torch.einsum('hblk,hbltk->hblt', q, loc_attn) / np.sqrt(q.shape[-1]) elif self.spatial_attn_fusion == 'cond': spatial_weights = self.lang_cond_fc(residual) spatial_weights = einops.rearrange(spatial_weights, 'b l (h d) -> h b l d', h=self.spatial_n_head, d=self.spatial_dim + 1) if self.spatial_n_head == 1: spatial_weights = einops.repeat(spatial_weights, '1 b l d -> h b l d', h=self.n_head) spatial_bias = spatial_weights[..., :1] spatial_weights = spatial_weights[..., 1:] loc_attn = torch.einsum('hbld,bltd->hblt', spatial_weights, pairwise_locs) + spatial_bias loc_attn = torch.sigmoid(loc_attn) if key_padding_mask is not None: mask = einops.repeat(key_padding_mask, 'b t -> h b l t', h=self.n_head, l=q.size(2)) attn = attn.masked_fill(mask, -np.inf) if self.spatial_attn_fusion in ['mul', 'cond']: loc_attn = loc_attn.masked_fill(mask, 0) else: loc_attn = loc_attn.masked_fill(mask, -np.inf) if self.spatial_attn_fusion == 'add': fused_attn = (torch.softmax(attn, 3) + torch.softmax(loc_attn, 3)) / 2 else: if self.spatial_attn_fusion in ['mul', 'cond']: fused_attn = torch.log(torch.clamp(loc_attn, min=1e-6)) + attn else: fused_attn = loc_attn + attn fused_attn = torch.softmax(fused_attn, 3) assert not torch.isnan(fused_attn).any(), print(fused_attn) output = torch.einsum('hblt,hbtv->hblv', fused_attn, v) output = einops.rearrange(output, 'head b l v -> b l (head v)') output = self.fc(output) return output, fused_attn class TransformerSpatialDecoderLayer(TransformerDecoderLayer): def __init__( self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", spatial_multihead=True, spatial_dim=5, spatial_attn_fusion='mul' ): super().__init__( d_model, nhead, dim_feedforward=dim_feedforward, dropout=dropout, activation=activation ) del self.self_attn self.self_attn = MultiHeadAttentionSpatial( d_model, nhead, dropout=dropout, spatial_multihead=spatial_multihead, spatial_dim=spatial_dim, spatial_attn_fusion=spatial_attn_fusion, ) def forward( self, tgt, memory, tgt_pairwise_locs: Optional[Tensor] = None, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] =
None, memory_key_padding_mask: Optional[Tensor] = None, ): tgt2 = self.norm1(tgt) tgt2, self_attn_matrices = self.self_attn( tgt2, tgt2, tgt2, tgt_pairwise_locs, key_padding_mask=tgt_key_padding_mask ) tgt = tgt + self.dropout1(tgt2) tgt2 = self.norm2(tgt) tgt2, cross_attn_matrices = self.multihead_attn( query=tgt2, key=memory, value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask ) tgt = tgt + self.dropout2(tgt2) tgt2 = self.norm3(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) tgt = tgt + self.dropout3(tgt2) return tgt, self_attn_matrices, cross_attn_matrices class TransformerSpatialEncoderLayer(TransformerEncoderLayer): def __init__( self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", spatial_multihead=True, spatial_dim=5, spatial_attn_fusion='mul' ): super().__init__( d_model, nhead, dim_feedforward=dim_feedforward, dropout=dropout, activation=activation ) del self.self_attn self.self_attn = MultiHeadAttentionSpatial( d_model, nhead, dropout=dropout, spatial_multihead=spatial_multihead, spatial_dim=spatial_dim, spatial_attn_fusion=spatial_attn_fusion, ) def forward( self, tgt, tgt_pairwise_locs, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, ): tgt2 = tgt tgt2, self_attn_matrices = self.self_attn( tgt2, tgt2, tgt2, tgt_pairwise_locs, key_padding_mask=tgt_key_padding_mask ) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) return tgt, self_attn_matrices ================================================ FILE: modules/third_party/__init__.py ================================================ ================================================ FILE: modules/third_party/pointnet2/_ext_src/include/ball_query.h ================================================ #pragma once #include <torch/extension.h> at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, const int nsample); ================================================ FILE: modules/third_party/pointnet2/_ext_src/include/cuda_utils.h ================================================ #ifndef _CUDA_UTILS_H #define _CUDA_UTILS_H #include <ATen/ATen.h> #include <ATen/cuda/CUDAContext.h> #include <cmath> #include <cuda.h> #include <cuda_runtime.h> #include <vector> #define TOTAL_THREADS 512 inline int opt_n_threads(int work_size) { const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0); return max(min(1 << pow_2, TOTAL_THREADS), 1); } inline dim3 opt_block_config(int x, int y) { const int x_threads = opt_n_threads(x); const int y_threads = max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1); dim3 block_config(x_threads, y_threads, 1); return block_config; } #define CUDA_CHECK_ERRORS() \ do { \ cudaError_t err = cudaGetLastError(); \ if (cudaSuccess != err) { \ fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n", \ cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \ __FILE__); \ exit(-1); \ } \ } while (0) #endif ================================================ FILE: modules/third_party/pointnet2/_ext_src/include/group_points.h ================================================ #pragma once #include <torch/extension.h> at::Tensor group_points(at::Tensor points, at::Tensor idx); at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n); ================================================ FILE: modules/third_party/pointnet2/_ext_src/include/interpolate.h ================================================ #pragma once #include <torch/extension.h> #include <vector> std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows); at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, at::Tensor weight); at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tensor weight, const int m); ================================================ FILE: modules/third_party/pointnet2/_ext_src/include/sampling.h ================================================ #pragma once #include <torch/extension.h> at::Tensor gather_points(at::Tensor points, at::Tensor idx); at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, const int n); at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples); ================================================ FILE: modules/third_party/pointnet2/_ext_src/include/utils.h ================================================ #pragma once #include <ATen/cuda/CUDAContext.h> #include <torch/extension.h> #define CHECK_CUDA(x) \ do { \ AT_ASSERT(x.is_cuda(), #x " must be a CUDA tensor"); \ } while (0) #define CHECK_CONTIGUOUS(x) \ do { \ AT_ASSERT(x.is_contiguous(), #x " must be a contiguous tensor"); \ } while (0) #define CHECK_IS_INT(x) \ do { \ AT_ASSERT(x.scalar_type() == at::ScalarType::Int, \ #x " must be an int tensor"); \ } while (0) #define CHECK_IS_FLOAT(x) \ do { \ AT_ASSERT(x.scalar_type() == at::ScalarType::Float, \ #x " must be a float tensor"); \ } while (0) ================================================ FILE: modules/third_party/pointnet2/_ext_src/src/ball_query.cpp ================================================ #include "ball_query.h" #include "utils.h" void query_ball_point_kernel_wrapper(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx); at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, const int nsample) { CHECK_CONTIGUOUS(new_xyz); CHECK_CONTIGUOUS(xyz); CHECK_IS_FLOAT(new_xyz); CHECK_IS_FLOAT(xyz); if (new_xyz.is_cuda()) { CHECK_CUDA(xyz); } at::Tensor idx = torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample}, at::device(new_xyz.device()).dtype(at::ScalarType::Int)); if (new_xyz.is_cuda()) { query_ball_point_kernel_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1), radius, nsample, new_xyz.data_ptr<float>(), xyz.data_ptr<float>(), idx.data_ptr<int>()); } else { AT_ASSERT(false, "CPU not supported"); } return idx; } ================================================ FILE: modules/third_party/pointnet2/_ext_src/src/ball_query_gpu.cu ================================================ #include <math.h> #include <stdio.h> #include <stdlib.h> #include "cuda_utils.h" // input: new_xyz(b, m, 3) xyz(b, n, 3) // output: idx(b, m, nsample) __global__ void query_ball_point_kernel(int b, int n, int m, float radius, int nsample, const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) { int batch_index = blockIdx.x; xyz += batch_index * n * 3; new_xyz += batch_index * m * 3; idx += m * nsample * batch_index; int index = threadIdx.x; int stride = blockDim.x; float radius2 = radius * radius; for (int j = index; j < m; j += stride) { float new_x = new_xyz[j * 3 + 0]; float new_y = new_xyz[j * 3 + 1]; float new_z = new_xyz[j * 3 + 2]; for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) { float x = xyz[k * 3 + 0]; float y = xyz[k * 3 + 1]; float z = xyz[k * 3 + 2]; float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); if (d2 < radius2) { if (cnt == 0) { for (int l = 0; l < nsample; ++l) { idx[j * nsample + l] = k; } } idx[j * nsample + cnt] = k; ++cnt; } } } } void query_ball_point_kernel_wrapper(int b, int n, int m, float radius, int nsample, const float *new_xyz, const float *xyz, int *idx) { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); query_ball_point_kernel<<<b, opt_n_threads(m), 0, stream>>>( b, n, m, radius, nsample, new_xyz, xyz, idx); CUDA_CHECK_ERRORS(); } ================================================ FILE: modules/third_party/pointnet2/_ext_src/src/bindings.cpp ================================================ #include "ball_query.h" #include "group_points.h" #include "interpolate.h" #include "sampling.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("gather_points", &gather_points); m.def("gather_points_grad", &gather_points_grad); m.def("furthest_point_sampling", &furthest_point_sampling); m.def("three_nn", &three_nn); m.def("three_interpolate", &three_interpolate); m.def("three_interpolate_grad", &three_interpolate_grad); m.def("ball_query", &ball_query); m.def("group_points", &group_points); m.def("group_points_grad", &group_points_grad); } ================================================ FILE: modules/third_party/pointnet2/_ext_src/src/group_points.cpp ================================================ #include "group_points.h" #include "utils.h" void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample, const float *points, const int *idx, float *out); void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints, int nsample, const float *grad_out, const int *idx, float *grad_points); at::Tensor group_points(at::Tensor points, at::Tensor idx) { CHECK_CONTIGUOUS(points); CHECK_CONTIGUOUS(idx); CHECK_IS_FLOAT(points); CHECK_IS_INT(idx); if (points.is_cuda()) { CHECK_CUDA(idx); } at::Tensor output = torch::zeros({points.size(0), points.size(1), idx.size(1), idx.size(2)}, at::device(points.device()).dtype(at::ScalarType::Float)); if (points.is_cuda()) { group_points_kernel_wrapper(points.size(0), points.size(1), points.size(2), idx.size(1), idx.size(2), points.data_ptr<float>(), idx.data_ptr<int>(), output.data_ptr<float>()); } else { AT_ASSERT(false, "CPU not supported"); } return output; } at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n) { CHECK_CONTIGUOUS(grad_out); CHECK_CONTIGUOUS(idx); CHECK_IS_FLOAT(grad_out); CHECK_IS_INT(idx); if (grad_out.is_cuda()) { CHECK_CUDA(idx); } at::Tensor output = torch::zeros({grad_out.size(0), grad_out.size(1), n}, at::device(grad_out.device()).dtype(at::ScalarType::Float)); if (grad_out.is_cuda()) { group_points_grad_kernel_wrapper( grad_out.size(0), grad_out.size(1), n, idx.size(1), idx.size(2), grad_out.data_ptr<float>(), idx.data_ptr<int>(), output.data_ptr<float>()); } else { AT_ASSERT(false, "CPU not supported"); } return output; } ================================================ FILE: modules/third_party/pointnet2/_ext_src/src/group_points_gpu.cu ================================================ #include <stdio.h> #include <stdlib.h> #include "cuda_utils.h" // input: points(b, c, n) idx(b, npoints, nsample) // output: out(b, c, npoints, nsample) __global__ void group_points_kernel(int b, int c, int n, int npoints, int nsample, const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) { int batch_index = blockIdx.x; points += batch_index * n * c; idx += batch_index * npoints * nsample; out += batch_index * npoints * nsample * c; const int index = threadIdx.y * blockDim.x + threadIdx.x; const int stride = blockDim.y * blockDim.x; for (int i = index; i < c * npoints; i += stride) { const int l = i / npoints; const int j = i % npoints; for (int k = 0; k < nsample; ++k) { int ii = idx[j * nsample + k]; out[(l * npoints + j) * nsample + k] = points[l * n + ii]; } } } void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample, const float *points, const int *idx, float *out) { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); group_points_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>( b, c, n, npoints, nsample, points, idx, out); CUDA_CHECK_ERRORS(); } // input: grad_out(b, c, npoints, nsample), idx(b, npoints, nsample) // output: grad_points(b, c, n) __global__ void group_points_grad_kernel(int b, int c, int n, int npoints, int nsample, const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) { int batch_index = blockIdx.x; grad_out += batch_index * npoints * nsample * c; idx += batch_index * npoints * nsample; grad_points += batch_index * n * c; const int index = threadIdx.y * blockDim.x + threadIdx.x; const int stride = blockDim.y * blockDim.x; for (int i = index; i < c * npoints; i += stride) { const int l = i / npoints; const int j = i % npoints; for (int k = 0; k < nsample; ++k) { int ii = idx[j * nsample + k]; atomicAdd(grad_points + l * n + ii, grad_out[(l * npoints + j) * nsample + k]); } } } void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints, int nsample, const float *grad_out, const int *idx, float *grad_points) { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); group_points_grad_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>( b, c, n, npoints, nsample, grad_out, idx, grad_points); CUDA_CHECK_ERRORS(); } ================================================ FILE: modules/third_party/pointnet2/_ext_src/src/interpolate.cpp ================================================ #include "interpolate.h" #include "utils.h" void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx); void three_interpolate_kernel_wrapper(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out); void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, const float *grad_out, const int *idx, const float *weight, float *grad_points); std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows) { CHECK_CONTIGUOUS(unknowns); CHECK_CONTIGUOUS(knows); CHECK_IS_FLOAT(unknowns); CHECK_IS_FLOAT(knows); if (unknowns.is_cuda()) { CHECK_CUDA(knows); } at::Tensor idx = torch::zeros({unknowns.size(0), unknowns.size(1), 3}, at::device(unknowns.device()).dtype(at::ScalarType::Int)); at::Tensor dist2 = torch::zeros({unknowns.size(0), unknowns.size(1), 3}, at::device(unknowns.device()).dtype(at::ScalarType::Float)); if (unknowns.is_cuda()) { three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1), unknowns.data_ptr<float>(), knows.data_ptr<float>(), dist2.data_ptr<float>(), idx.data_ptr<int>()); } else { AT_ASSERT(false, "CPU not supported"); } return {dist2, idx}; } at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, at::Tensor weight) { CHECK_CONTIGUOUS(points); CHECK_CONTIGUOUS(idx); CHECK_CONTIGUOUS(weight); CHECK_IS_FLOAT(points); CHECK_IS_INT(idx); CHECK_IS_FLOAT(weight); if (points.is_cuda()) { CHECK_CUDA(idx); CHECK_CUDA(weight); } at::Tensor output = torch::zeros({points.size(0), points.size(1), idx.size(1)}, at::device(points.device()).dtype(at::ScalarType::Float)); if (points.is_cuda()) { three_interpolate_kernel_wrapper( points.size(0), points.size(1), points.size(2), idx.size(1), points.data_ptr<float>(), idx.data_ptr<int>(), weight.data_ptr<float>(), output.data_ptr<float>()); } else { AT_ASSERT(false, "CPU not supported"); } return output; } at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, at::Tensor weight, const int m) { CHECK_CONTIGUOUS(grad_out); CHECK_CONTIGUOUS(idx); CHECK_CONTIGUOUS(weight); CHECK_IS_FLOAT(grad_out); CHECK_IS_INT(idx); CHECK_IS_FLOAT(weight); if (grad_out.is_cuda()) { CHECK_CUDA(idx); CHECK_CUDA(weight); } at::Tensor output = torch::zeros({grad_out.size(0), grad_out.size(1), m}, at::device(grad_out.device()).dtype(at::ScalarType::Float)); if (grad_out.is_cuda()) { three_interpolate_grad_kernel_wrapper( grad_out.size(0), grad_out.size(1), grad_out.size(2), m, grad_out.data_ptr<float>(), idx.data_ptr<int>(), weight.data_ptr<float>(), output.data_ptr<float>()); } else { AT_ASSERT(false, "CPU not supported"); } return output; } ================================================ FILE: modules/third_party/pointnet2/_ext_src/src/interpolate_gpu.cu ================================================ #include <math.h> #include <stdio.h> #include <stdlib.h> #include "cuda_utils.h" // input: unknown(b, n, 3) known(b, m, 3) // output: dist2(b, n, 3), idx(b, n, 3) __global__ void three_nn_kernel(int b, int n, int m, const float *__restrict__ unknown, const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) { int batch_index = blockIdx.x; unknown += batch_index * n * 3; known += batch_index * m * 3; dist2 += batch_index * n * 3; idx += batch_index * n * 3; int index = threadIdx.x; int stride = blockDim.x; for (int j = index; j < n; j += stride) { float ux = unknown[j * 3 + 0]; float uy = unknown[j * 3 + 1]; float uz = unknown[j * 3 + 2]; double best1 = 1e40, best2 = 1e40, best3 = 1e40; int besti1 = 0, besti2 = 0, besti3 = 0; for (int k = 0; k < m; ++k) { float x = known[k * 3 + 0]; float y = known[k * 3 + 1]; float z = known[k * 3 + 2]; float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); if (d < best1) { best3 = best2; besti3 = besti2; best2 = best1; besti2 = besti1; best1 = d; besti1 = k; } else if (d < best2) { best3 = best2; besti3 = besti2; best2 = d; besti2 = k; } else if (d < best3) { best3 = d; besti3 = k; } } dist2[j * 3 + 0] = best1; dist2[j * 3 + 1] = best2; dist2[j * 3 + 2] = best3; idx[j * 3 + 0] = besti1; idx[j * 3 + 1] = besti2; idx[j * 3 + 2] = besti3; } } void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown, const float *known, float *dist2, int *idx) { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); three_nn_kernel<<<b, opt_n_threads(n), 0, stream>>>(b, n, m, unknown, known, dist2, idx); CUDA_CHECK_ERRORS(); } // input: points(b, c, m), idx(b, n, 3), weight(b, n, 3) // output: out(b, c, n) __global__ void three_interpolate_kernel(int b, int c, int m, int n, const float *__restrict__ points, const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) { int batch_index = blockIdx.x; points += batch_index * m * c; idx += batch_index * n * 3; weight += batch_index * n * 3; out += batch_index * n * c; const int index = threadIdx.y * blockDim.x + threadIdx.x; const int stride = blockDim.y * blockDim.x; for (int i = index; i < c * n; i += stride) { const int l = i / n; const int j = i % n; float w1 = weight[j * 3 + 0]; float w2 = weight[j * 3 + 1]; float w3 = weight[j * 3 + 2]; int i1 = idx[j * 3 + 0]; int i2 = idx[j * 3 + 1]; int i3 = idx[j * 3 + 2]; out[i] = points[l * m + i1] * w1 + points[l * m + i2] * w2 + points[l * m + i3] * w3; } } void three_interpolate_kernel_wrapper(int b, int c, int m, int n, const float *points, const int *idx, const float *weight, float *out) { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); three_interpolate_kernel<<<b, opt_block_config(n, c), 0, stream>>>( b, c, m, n, points, idx, weight, out); CUDA_CHECK_ERRORS(); } // input: grad_out(b, c, n), idx(b, n, 3), weight(b, n, 3) // output: grad_points(b, c, m) __global__ void three_interpolate_grad_kernel( int b, int c, int n, int m, const float *__restrict__ grad_out, const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) { int batch_index = blockIdx.x; grad_out += batch_index * n * c; idx += batch_index * n * 3; weight += batch_index * n * 3; grad_points += batch_index * m * c; const int index = threadIdx.y * blockDim.x + threadIdx.x; const int stride = blockDim.y * blockDim.x; for (int i = index; i < c * n; i += stride) { const int l = i / n; const int j = i % n; float w1 = weight[j * 3 + 0]; float w2 = weight[j * 3 + 1]; float w3 = weight[j * 3 + 2]; int i1 = idx[j * 3 + 0]; int i2 = idx[j * 3 + 1]; int i3 = idx[j * 3 + 2]; atomicAdd(grad_points + l * m + i1, grad_out[i] * w1); atomicAdd(grad_points + l * m + i2, grad_out[i] * w2); atomicAdd(grad_points + l * m + i3, grad_out[i] * w3); } } void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, const float *grad_out, const int *idx, const float *weight, float *grad_points) { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); three_interpolate_grad_kernel<<<b, opt_block_config(n, c), 0, stream>>>( b, c, n, m, grad_out, idx, weight, grad_points); CUDA_CHECK_ERRORS(); } ================================================ FILE: modules/third_party/pointnet2/_ext_src/src/sampling.cpp ================================================ #include "sampling.h" #include "utils.h" void gather_points_kernel_wrapper(int b, int c, int n, int npoints, const float *points, const int *idx, float *out); void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints, const float *grad_out, const int *idx, float *grad_points); void furthest_point_sampling_kernel_wrapper(int b, int n, int m, const float *dataset, float *temp, int *idxs); at::Tensor gather_points(at::Tensor points, at::Tensor idx) { CHECK_CONTIGUOUS(points); CHECK_CONTIGUOUS(idx); CHECK_IS_FLOAT(points); CHECK_IS_INT(idx); if (points.is_cuda()) { CHECK_CUDA(idx); } at::Tensor output = torch::zeros({points.size(0), points.size(1), idx.size(1)}, at::device(points.device()).dtype(at::ScalarType::Float)); if (points.is_cuda()) { gather_points_kernel_wrapper(points.size(0), points.size(1), points.size(2), idx.size(1), points.data_ptr<float>(), idx.data_ptr<int>(), output.data_ptr<float>()); } else { AT_ASSERT(false, "CPU not supported"); } return output; } at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, const int n) { CHECK_CONTIGUOUS(grad_out); CHECK_CONTIGUOUS(idx); CHECK_IS_FLOAT(grad_out); CHECK_IS_INT(idx); if (grad_out.is_cuda()) { CHECK_CUDA(idx); } at::Tensor output = torch::zeros({grad_out.size(0), grad_out.size(1), n}, at::device(grad_out.device()).dtype(at::ScalarType::Float)); if (grad_out.is_cuda()) { gather_points_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), n, idx.size(1), grad_out.data_ptr<float>(), idx.data_ptr<int>(), output.data_ptr<float>()); } else { AT_ASSERT(false, "CPU not supported"); } return output; } at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples) { CHECK_CONTIGUOUS(points); CHECK_IS_FLOAT(points); at::Tensor output = torch::zeros({points.size(0), nsamples}, at::device(points.device()).dtype(at::ScalarType::Int)); at::Tensor tmp = torch::full({points.size(0), points.size(1)}, 1e10, at::device(points.device()).dtype(at::ScalarType::Float)); if (points.is_cuda()) { furthest_point_sampling_kernel_wrapper( points.size(0), points.size(1), nsamples, points.data_ptr<float>(), tmp.data_ptr<float>(), output.data_ptr<int>()); } else { AT_ASSERT(false, "CPU not supported"); } return output; }
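For reference, a minimal Python usage sketch for the sampling ops declared above (an illustration added to this extract, not a file in the repository): it assumes the extension has been built so that pointnet2._ext is importable (see setup.py further below) and that a CUDA device is available, since the C++ entry points raise on CPU tensors. Shapes follow the docstrings in pointnet2_utils.py.

import torch
from modules.third_party.pointnet2.pointnet2_utils import (
    furthest_point_sample,  # FPS: (B, N, 3) -> (B, npoint) int indices
    gather_operation,       # gather: (B, C, N) + (B, npoint) -> (B, C, npoint)
)

xyz = torch.rand(2, 1024, 3).cuda()          # (B, N, 3) point coordinates
inds = furthest_point_sample(xyz, 128)       # (B, 128) sampled point indices
feats = xyz.transpose(1, 2).contiguous()     # channel-first view, (B, 3, N)
centroids = gather_operation(feats, inds)    # (B, 3, 128) sampled centroids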
================================================ FILE: modules/third_party/pointnet2/_ext_src/src/sampling_gpu.cu ================================================ #include <stdio.h> #include <stdlib.h> #include "cuda_utils.h" // input: points(b, c, n) idx(b, m) // output: out(b, c, m) __global__ void gather_points_kernel(int b, int c, int n, int m, const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) { for (int i = blockIdx.x; i < b; i += gridDim.x) { for (int l = blockIdx.y; l < c; l += gridDim.y) { for (int j = threadIdx.x; j < m; j += blockDim.x) { int a = idx[i * m + j]; out[(i * c + l) * m + j] = points[(i * c + l) * n + a]; } } } } void gather_points_kernel_wrapper(int b, int c, int n, int npoints, const float *points, const int *idx, float *out) { gather_points_kernel<<<dim3(b, c, 1), opt_n_threads(npoints), 0, at::cuda::getCurrentCUDAStream()>>>(b, c, n, npoints, points, idx, out); CUDA_CHECK_ERRORS(); } // input: grad_out(b, c, m) idx(b, m) // output: grad_points(b, c, n) __global__ void gather_points_grad_kernel(int b, int c, int n, int m, const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) { for (int i = blockIdx.x; i < b; i += gridDim.x) { for (int l = blockIdx.y; l < c; l += gridDim.y) { for (int j = threadIdx.x; j < m; j += blockDim.x) { int a = idx[i * m + j]; atomicAdd(grad_points + (i * c + l) * n + a, grad_out[(i * c + l) * m + j]); } } } } void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints, const float *grad_out, const int *idx, float *grad_points) { gather_points_grad_kernel<<<dim3(b, c, 1), opt_n_threads(npoints), 0, at::cuda::getCurrentCUDAStream()>>>( b, c, n, npoints, grad_out, idx, grad_points); CUDA_CHECK_ERRORS(); } __device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, int idx1, int idx2) { const float v1 = dists[idx1], v2 = dists[idx2]; const int i1 = dists_i[idx1], i2 = dists_i[idx2]; dists[idx1] = max(v1, v2); dists_i[idx1] = v2 > v1 ? i2 : i1; } // Input dataset: (b, n, 3), tmp: (b, n) // Output idxs (b, m) template <unsigned int block_size> __global__ void furthest_point_sampling_kernel( int b, int n, int m, const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) { if (m <= 0) return; __shared__ float dists[block_size]; __shared__ int dists_i[block_size]; int batch_index = blockIdx.x; dataset += batch_index * n * 3; temp += batch_index * n; idxs += batch_index * m; int tid = threadIdx.x; const int stride = block_size; int old = 0; if (threadIdx.x == 0) idxs[0] = old; __syncthreads(); for (int j = 1; j < m; j++) { int besti = 0; float best = -1; float x1 = dataset[old * 3 + 0]; float y1 = dataset[old * 3 + 1]; float z1 = dataset[old * 3 + 2]; for (int k = tid; k < n; k += stride) { float x2, y2, z2; x2 = dataset[k * 3 + 0]; y2 = dataset[k * 3 + 1]; z2 = dataset[k * 3 + 2]; float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); if (mag <= 1e-3) continue; float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); float d2 = min(d, temp[k]); temp[k] = d2; besti = d2 > best ? k : besti; best = d2 > best ? d2 : best; } dists[tid] = best; dists_i[tid] = besti; __syncthreads(); if (block_size >= 512) { if (tid < 256) { __update(dists, dists_i, tid, tid + 256); } __syncthreads(); } if (block_size >= 256) { if (tid < 128) { __update(dists, dists_i, tid, tid + 128); } __syncthreads(); } if (block_size >= 128) { if (tid < 64) { __update(dists, dists_i, tid, tid + 64); } __syncthreads(); } if (block_size >= 64) { if (tid < 32) { __update(dists, dists_i, tid, tid + 32); } __syncthreads(); } if (block_size >= 32) { if (tid < 16) { __update(dists, dists_i, tid, tid + 16); } __syncthreads(); } if (block_size >= 16) { if (tid < 8) { __update(dists, dists_i, tid, tid + 8); } __syncthreads(); } if (block_size >= 8) { if (tid < 4) { __update(dists, dists_i, tid, tid + 4); } __syncthreads(); } if (block_size >= 4) { if (tid < 2) { __update(dists, dists_i, tid, tid + 2); } __syncthreads(); } if (block_size >= 2) { if (tid < 1) { __update(dists, dists_i, tid, tid + 1); } __syncthreads(); } old = dists_i[0]; if (tid == 0) idxs[j] = old; } } void furthest_point_sampling_kernel_wrapper(int b, int n, int m, const float *dataset, float *temp, int *idxs) { unsigned int n_threads = opt_n_threads(n); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); switch (n_threads) { case 512: furthest_point_sampling_kernel<512> <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; case 256: furthest_point_sampling_kernel<256> <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; case 128: furthest_point_sampling_kernel<128> <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; case 64: furthest_point_sampling_kernel<64> <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; case 32: furthest_point_sampling_kernel<32> <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; case 16: furthest_point_sampling_kernel<16> <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; case 8: furthest_point_sampling_kernel<8> <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; case 4: furthest_point_sampling_kernel<4> <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; case 2: furthest_point_sampling_kernel<2> <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; case 1: furthest_point_sampling_kernel<1> <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); break; default: furthest_point_sampling_kernel<512> <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); } CUDA_CHECK_ERRORS(); } ================================================ FILE: modules/third_party/pointnet2/_version.py ================================================ __version__ = "3.0.0" ================================================ FILE: modules/third_party/pointnet2/pointnet2_modules.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. ''' Pointnet2 layers. Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch Extended with the following: 1. Uniform sampling in each local region (sample_uniformly) 2. Return sampled points indices to support votenet.
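Example (a sketch added for this extract, assuming a CUDA device since the
underlying ops are GPU-only; shapes follow the module docstrings below):

    >>> sa = PointnetSAModule(npoint=512, radius=0.2, nsample=32, mlp=[3, 64, 128]).cuda()
    >>> xyz = torch.rand(8, 1024, 3).cuda()     # (B, N, 3) point coordinates
    >>> feats = torch.rand(8, 3, 1024).cuda()   # (B, C, N) features, C == mlp[0]
    >>> new_xyz, new_feats = sa(xyz, feats)     # (8, 512, 3), (8, 128, 512)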
''' import torch import torch.nn as nn import torch.nn.functional as F import os import sys BASE_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(BASE_DIR) import pointnet2_utils import pytorch_utils as pt_utils from typing import List class _PointnetSAModuleBase(nn.Module): def __init__(self): super().__init__() self.npoint = None self.groupers = None self.mlps = None def forward(self, xyz: torch.Tensor, features: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): r""" Parameters ---------- xyz : torch.Tensor (B, N, 3) tensor of the xyz coordinates of the features features : torch.Tensor (B, N, C) tensor of the descriptors of the the features Returns ------- new_xyz : torch.Tensor (B, npoint, 3) tensor of the new features' xyz new_features : torch.Tensor (B, npoint, \sum_k(mlps[k][-1])) tensor of the new_features descriptors """ new_features_list = [] xyz_flipped = xyz.transpose(1, 2).contiguous() new_xyz = pointnet2_utils.gather_operation( xyz_flipped, pointnet2_utils.furthest_point_sample(xyz, self.npoint) ).transpose(1, 2).contiguous() if self.npoint is not None else None for i in range(len(self.groupers)): new_features = self.groupers[i]( xyz, new_xyz, features ) # (B, C, npoint, nsample) new_features = self.mlps[i]( new_features ) # (B, mlp[-1], npoint, nsample) new_features = F.max_pool2d( new_features, kernel_size=[1, new_features.size(3)] ) # (B, mlp[-1], npoint, 1) new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) new_features_list.append(new_features) return new_xyz, torch.cat(new_features_list, dim=1) class PointnetSAModuleMSG(_PointnetSAModuleBase): r"""Pointnet set abstrction layer with multiscale grouping Parameters ---------- npoint : int Number of features radii : list of float32 list of radii to group with nsamples : list of int32 Number of samples in each ball query mlps : list of list of int32 Spec of the pointnet before the global max_pool for each scale bn : bool Use batchnorm """ def __init__( self, *, npoint: int, radii: List[float], nsamples: List[int], mlps: List[List[int]], bn: bool = True, use_xyz: bool = True, sample_uniformly: bool = False ): super().__init__() assert len(radii) == len(nsamples) == len(mlps) self.npoint = npoint self.groupers = nn.ModuleList() self.mlps = nn.ModuleList() for i in range(len(radii)): radius = radii[i] nsample = nsamples[i] self.groupers.append( pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, sample_uniformly=sample_uniformly) if npoint is not None else pointnet2_utils.GroupAll(use_xyz) ) mlp_spec = mlps[i] if use_xyz: mlp_spec[0] += 3 self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) class PointnetSAModule(PointnetSAModuleMSG): r"""Pointnet set abstrction layer Parameters ---------- npoint : int Number of features radius : float Radius of ball nsample : int Number of samples in the ball query mlp : list Spec of the pointnet before the global max_pool bn : bool Use batchnorm """ def __init__( self, *, mlp: List[int], npoint: int = None, radius: float = None, nsample: int = None, bn: bool = True, use_xyz: bool = True ): super().__init__( mlps=[mlp], npoint=npoint, radii=[radius], nsamples=[nsample], bn=bn, use_xyz=use_xyz ) class PointnetSAModuleVotes(nn.Module): ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG with extra support for returning point indices for getting their GT votes ''' def __init__( self, *, mlp: List[int], npoint: int = None, radius: float = None, nsample: int = None, bn: bool = True, use_xyz: bool = True, pooling: str = 'max', sigma: float 
= None, # for RBF pooling normalize_xyz: bool = False, # noramlize local XYZ with radius sample_uniformly: bool = False, ret_unique_cnt: bool = False ): super().__init__() self.npoint = npoint self.radius = radius self.nsample = nsample self.pooling = pooling self.mlp_module = None self.use_xyz = use_xyz self.sigma = sigma if self.sigma is None: self.sigma = self.radius/2 self.normalize_xyz = normalize_xyz self.ret_unique_cnt = ret_unique_cnt if npoint is not None: self.grouper = pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, ret_grouped_xyz=True, normalize_xyz=normalize_xyz, sample_uniformly=sample_uniformly, ret_unique_cnt=ret_unique_cnt) else: self.grouper = pointnet2_utils.GroupAll(use_xyz, ret_grouped_xyz=True) mlp_spec = mlp if use_xyz and len(mlp_spec)>0: mlp_spec[0] += 3 self.mlp_module = pt_utils.SharedMLP(mlp_spec, bn=bn) def forward(self, xyz: torch.Tensor, features: torch.Tensor = None, inds: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): r""" Parameters ---------- xyz : torch.Tensor (B, N, 3) tensor of the xyz coordinates of the features features : torch.Tensor (B, C, N) tensor of the descriptors of the the features inds : torch.Tensor (B, npoint) tensor that stores index to the xyz points (values in 0-N-1) Returns ------- new_xyz : torch.Tensor (B, npoint, 3) tensor of the new features' xyz new_features : torch.Tensor (B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors inds: torch.Tensor (B, npoint) tensor of the inds """ xyz_flipped = xyz.transpose(1, 2).contiguous() if inds is None: inds = pointnet2_utils.furthest_point_sample(xyz, self.npoint) else: assert(inds.shape[1] == self.npoint) new_xyz = pointnet2_utils.gather_operation( xyz_flipped, inds ).transpose(1, 2).contiguous() if self.npoint is not None else None if not self.ret_unique_cnt: grouped_features, grouped_xyz = self.grouper( xyz, new_xyz, features ) # (B, C, npoint, nsample) else: grouped_features, grouped_xyz, unique_cnt = self.grouper( xyz, new_xyz, features ) # (B, C, npoint, nsample), (B,3,npoint,nsample), (B,npoint) new_features = self.mlp_module( grouped_features ) # (B, mlp[-1], npoint, nsample) if self.pooling == 'max': new_features = F.max_pool2d( new_features, kernel_size=[1, new_features.size(3)] ) # (B, mlp[-1], npoint, 1) elif self.pooling == 'avg': new_features = F.avg_pool2d( new_features, kernel_size=[1, new_features.size(3)] ) # (B, mlp[-1], npoint, 1) elif self.pooling == 'rbf': # Use radial basis function kernel for weighted sum of features (normalized by nsample and sigma) # Ref: https://en.wikipedia.org/wiki/Radial_basis_function_kernel rbf = torch.exp(-1 * grouped_xyz.pow(2).sum(1,keepdim=False) / (self.sigma**2) / 2) # (B, npoint, nsample) new_features = torch.sum(new_features * rbf.unsqueeze(1), -1, keepdim=True) / float(self.nsample) # (B, mlp[-1], npoint, 1) new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) if not self.ret_unique_cnt: return new_xyz, new_features, inds else: return new_xyz, new_features, inds, unique_cnt class PointnetSAModuleMSGVotes(nn.Module): ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG with extra support for returning point indices for getting their GT votes ''' def __init__( self, *, mlps: List[List[int]], npoint: int, radii: List[float], nsamples: List[int], bn: bool = True, use_xyz: bool = True, sample_uniformly: bool = False ): super().__init__() assert(len(mlps) == len(nsamples) == len(radii)) self.npoint = npoint self.groupers = nn.ModuleList() self.mlps = nn.ModuleList() for i in 
range(len(radii)): radius = radii[i] nsample = nsamples[i] self.groupers.append( pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, sample_uniformly=sample_uniformly) if npoint is not None else pointnet2_utils.GroupAll(use_xyz) ) mlp_spec = mlps[i] if use_xyz: mlp_spec[0] += 3 self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) def forward(self, xyz: torch.Tensor, features: torch.Tensor = None, inds: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): r""" Parameters ---------- xyz : torch.Tensor (B, N, 3) tensor of the xyz coordinates of the features features : torch.Tensor (B, C, C) tensor of the descriptors of the the features inds : torch.Tensor (B, npoint) tensor that stores index to the xyz points (values in 0-N-1) Returns ------- new_xyz : torch.Tensor (B, npoint, 3) tensor of the new features' xyz new_features : torch.Tensor (B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors inds: torch.Tensor (B, npoint) tensor of the inds """ new_features_list = [] xyz_flipped = xyz.transpose(1, 2).contiguous() if inds is None: inds = pointnet2_utils.furthest_point_sample(xyz, self.npoint) new_xyz = pointnet2_utils.gather_operation( xyz_flipped, inds ).transpose(1, 2).contiguous() if self.npoint is not None else None for i in range(len(self.groupers)): new_features = self.groupers[i]( xyz, new_xyz, features ) # (B, C, npoint, nsample) new_features = self.mlps[i]( new_features ) # (B, mlp[-1], npoint, nsample) new_features = F.max_pool2d( new_features, kernel_size=[1, new_features.size(3)] ) # (B, mlp[-1], npoint, 1) new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) new_features_list.append(new_features) return new_xyz, torch.cat(new_features_list, dim=1), inds class PointnetFPModule(nn.Module): r"""Propigates the features of one set to another Parameters ---------- mlp : list Pointnet module parameters bn : bool Use batchnorm """ def __init__(self, *, mlp: List[int], bn: bool = True): super().__init__() self.mlp = pt_utils.SharedMLP(mlp, bn=bn) def forward( self, unknown: torch.Tensor, known: torch.Tensor, unknow_feats: torch.Tensor, known_feats: torch.Tensor ) -> torch.Tensor: r""" Parameters ---------- unknown : torch.Tensor (B, n, 3) tensor of the xyz positions of the unknown features known : torch.Tensor (B, m, 3) tensor of the xyz positions of the known features unknow_feats : torch.Tensor (B, C1, n) tensor of the features to be propigated to known_feats : torch.Tensor (B, C2, m) tensor of features to be propigated Returns ------- new_features : torch.Tensor (B, mlp[-1], n) tensor of the features of the unknown features """ if known is not None: dist, idx = pointnet2_utils.three_nn(unknown, known) dist_recip = 1.0 / (dist + 1e-8) norm = torch.sum(dist_recip, dim=2, keepdim=True) weight = dist_recip / norm interpolated_feats = pointnet2_utils.three_interpolate( known_feats, idx, weight ) else: interpolated_feats = known_feats.expand( *known_feats.size()[0:2], unknown.size(1) ) if unknow_feats is not None: new_features = torch.cat([interpolated_feats, unknow_feats], dim=1) #(B, C2 + C1, n) else: new_features = interpolated_feats new_features = new_features.unsqueeze(-1) new_features = self.mlp(new_features) return new_features.squeeze(-1) class PointnetLFPModuleMSG(nn.Module): ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG learnable feature propagation layer.''' def __init__( self, *, mlps: List[List[int]], radii: List[float], nsamples: List[int], post_mlp: List[int], bn: bool = True, use_xyz: bool = True, 
sample_uniformly: bool = False ): super().__init__() assert(len(mlps) == len(nsamples) == len(radii)) self.post_mlp = pt_utils.SharedMLP(post_mlp, bn=bn) self.groupers = nn.ModuleList() self.mlps = nn.ModuleList() for i in range(len(radii)): radius = radii[i] nsample = nsamples[i] self.groupers.append( pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, sample_uniformly=sample_uniformly) ) mlp_spec = mlps[i] if use_xyz: mlp_spec[0] += 3 self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) def forward(self, xyz2: torch.Tensor, xyz1: torch.Tensor, features2: torch.Tensor, features1: torch.Tensor) -> torch.Tensor: r""" Propagate features from xyz1 to xyz2. Parameters ---------- xyz2 : torch.Tensor (B, N2, 3) tensor of the xyz coordinates of the features xyz1 : torch.Tensor (B, N1, 3) tensor of the xyz coordinates of the features features2 : torch.Tensor (B, C2, N2) tensor of the descriptors of the the features features1 : torch.Tensor (B, C1, N1) tensor of the descriptors of the the features Returns ------- new_features1 : torch.Tensor (B, \sum_k(mlps[k][-1]), N1) tensor of the new_features descriptors """ new_features_list = [] for i in range(len(self.groupers)): new_features = self.groupers[i]( xyz1, xyz2, features1 ) # (B, C1, N2, nsample) new_features = self.mlps[i]( new_features ) # (B, mlp[-1], N2, nsample) new_features = F.max_pool2d( new_features, kernel_size=[1, new_features.size(3)] ) # (B, mlp[-1], N2, 1) new_features = new_features.squeeze(-1) # (B, mlp[-1], N2) if features2 is not None: new_features = torch.cat([new_features, features2], dim=1) #(B, mlp[-1] + C2, N2) new_features = new_features.unsqueeze(-1) new_features = self.post_mlp(new_features) new_features_list.append(new_features) return torch.cat(new_features_list, dim=1).squeeze(-1) if __name__ == "__main__": from torch.autograd import Variable torch.manual_seed(1) torch.cuda.manual_seed_all(1) xyz = Variable(torch.randn(2, 9, 3).cuda(), requires_grad=True) xyz_feats = Variable(torch.randn(2, 9, 6).cuda(), requires_grad=True) test_module = PointnetSAModuleMSG( npoint=2, radii=[5.0, 10.0], nsamples=[6, 3], mlps=[[9, 3], [9, 6]] ) test_module.cuda() print(test_module(xyz, xyz_feats)) for _ in range(1): _, new_features = test_module(xyz, xyz_feats) new_features.backward( torch.cuda.FloatTensor(*new_features.size()).fill_(1) ) print(new_features) print(xyz.grad) ================================================ FILE: modules/third_party/pointnet2/pointnet2_test.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. ''' Testing customized ops. 
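Run as a script (python pointnet2_test.py) on a CUDA-enabled machine after the
extension has been built; gradcheck is invoked with loose tolerances
(atol=rtol=1e-1) because the kernels operate in fp32.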
''' import torch from torch.autograd import gradcheck import numpy as np import os import sys BASE_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(BASE_DIR) import pointnet2_utils def test_interpolation_grad(): batch_size = 1 feat_dim = 2 m = 4 feats = torch.randn(batch_size, feat_dim, m, requires_grad=True).float().cuda() def interpolate_func(inputs): idx = torch.from_numpy(np.array([[[0,1,2],[1,2,3]]])).int().cuda() weight = torch.from_numpy(np.array([[[1,1,1],[2,2,2]]])).float().cuda() interpolated_feats = pointnet2_utils.three_interpolate(inputs, idx, weight) return interpolated_feats assert (gradcheck(interpolate_func, feats, atol=1e-1, rtol=1e-1)) if __name__=='__main__': test_interpolation_grad() ================================================ FILE: modules/third_party/pointnet2/pointnet2_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. ''' Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch ''' from __future__ import ( division, absolute_import, with_statement, print_function, unicode_literals, ) import torch from torch.autograd import Function import torch.nn as nn import modules.third_party.pointnet2.pytorch_utils as pt_utils import builtins try: import pointnet2._ext as _ext except ImportError: if not getattr(builtins, "__POINTNET2_SETUP__", False): raise ImportError( "Could not import _ext module.\n" "Please see the setup instructions in the README: " "https://github.com/erikwijmans/Pointnet2_PyTorch/blob/master/README.rst" ) if False: # Workaround for type hints without depending on the `typing` module from typing import * class RandomDropout(nn.Module): def __init__(self, p=0.5, inplace=False): super(RandomDropout, self).__init__() self.p = p self.inplace = inplace def forward(self, X): theta = torch.Tensor(1).uniform_(0, self.p)[0] return pt_utils.feature_dropout_no_scaling(X, theta, self.train, self.inplace) class FurthestPointSampling(Function): @staticmethod def forward(ctx, xyz, npoint): # type: (Any, torch.Tensor, int) -> torch.Tensor r""" Uses iterative furthest point sampling to select a set of npoint features that have the largest minimum distance Parameters ---------- xyz : torch.Tensor (B, N, 3) tensor where N > npoint npoint : int32 number of features in the sampled set Returns ------- torch.Tensor (B, npoint) tensor containing the set """ fps_inds = _ext.furthest_point_sampling(xyz, npoint) ctx.mark_non_differentiable(fps_inds) return fps_inds @staticmethod def backward(xyz, a=None): return None, None furthest_point_sample = FurthestPointSampling.apply class GatherOperation(Function): @staticmethod def forward(ctx, features, idx): # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor r""" Parameters ---------- features : torch.Tensor (B, C, N) tensor idx : torch.Tensor (B, npoint) tensor of the features to gather Returns ------- torch.Tensor (B, C, npoint) tensor """ _, C, N = features.size() ctx.for_backwards = (idx, C, N) return _ext.gather_points(features, idx) @staticmethod def backward(ctx, grad_out): idx, C, N = ctx.for_backwards grad_features = _ext.gather_points_grad(grad_out.contiguous(), idx, N) return grad_features, None gather_operation = GatherOperation.apply class ThreeNN(Function): @staticmethod def forward(ctx, unknown, known): # type: (Any, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] r""" Find the three 
nearest neighbors of unknown in known Parameters ---------- unknown : torch.Tensor (B, n, 3) tensor of unknown features known : torch.Tensor (B, m, 3) tensor of known features Returns ------- dist : torch.Tensor (B, n, 3) l2 distance to the three nearest neighbors idx : torch.Tensor (B, n, 3) index of 3 nearest neighbors """ dist2, idx = _ext.three_nn(unknown, known) return torch.sqrt(dist2), idx @staticmethod def backward(ctx, a=None, b=None): return None, None three_nn = ThreeNN.apply class ThreeInterpolate(Function): @staticmethod def forward(ctx, features, idx, weight): # type: (Any, torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor r""" Performs weighted linear interpolation on 3 features Parameters ---------- features : torch.Tensor (B, c, m) Features descriptors to be interpolated from idx : torch.Tensor (B, n, 3) three nearest neighbors of the target features in features weight : torch.Tensor (B, n, 3) weights Returns ------- torch.Tensor (B, c, n) tensor of the interpolated features """ B, c, m = features.size() n = idx.size(1) ctx.three_interpolate_for_backward = (idx, weight, m) return _ext.three_interpolate(features, idx, weight) @staticmethod def backward(ctx, grad_out): # type: (Any, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor] r""" Parameters ---------- grad_out : torch.Tensor (B, c, n) tensor with gradients of outputs Returns ------- grad_features : torch.Tensor (B, c, m) tensor with gradients of features None None """ idx, weight, m = ctx.three_interpolate_for_backward grad_features = _ext.three_interpolate_grad( grad_out.contiguous(), idx, weight, m ) return grad_features, None, None three_interpolate = ThreeInterpolate.apply class GroupingOperation(Function): @staticmethod def forward(ctx, features, idx): # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor r""" Parameters ---------- features : torch.Tensor (B, C, N) tensor of features to group idx : torch.Tensor (B, npoint, nsample) tensor containing the indices of features to group with Returns ------- torch.Tensor (B, C, npoint, nsample) tensor """ B, nfeatures, nsample = idx.size() _, C, N = features.size() ctx.for_backwards = (idx, N) return _ext.group_points(features, idx) @staticmethod def backward(ctx, grad_out): # type: (Any, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] r""" Parameters ---------- grad_out : torch.Tensor (B, C, npoint, nsample) tensor of the gradients of the output from forward Returns ------- torch.Tensor (B, C, N) gradient of the features None """ idx, N = ctx.for_backwards grad_features = _ext.group_points_grad(grad_out.contiguous(), idx, N) return grad_features, None grouping_operation = GroupingOperation.apply class BallQuery(Function): @staticmethod def forward(ctx, radius, nsample, xyz, new_xyz): # type: (Any, float, int, torch.Tensor, torch.Tensor) -> torch.Tensor r""" Parameters ---------- radius : float radius of the balls nsample : int maximum number of features in the balls xyz : torch.Tensor (B, N, 3) xyz coordinates of the features new_xyz : torch.Tensor (B, npoint, 3) centers of the ball query Returns ------- torch.Tensor (B, npoint, nsample) tensor with the indices of the features that form the query balls """ inds = _ext.ball_query(new_xyz, xyz, radius, nsample) ctx.mark_non_differentiable(inds) return inds @staticmethod def backward(ctx, a=None): return None, None, None, None ball_query = BallQuery.apply class QueryAndGroup(nn.Module): r""" Groups with a ball query of radius Parameters --------- radius : float32 Radius of ball nsample : int32
Maximum number of features to gather in the ball """ def __init__(self, radius, nsample, use_xyz=True, ret_grouped_xyz=False, normalize_xyz=False, sample_uniformly=False, ret_unique_cnt=False): # type: (QueryAndGroup, float, int, bool) -> None super(QueryAndGroup, self).__init__() self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz self.ret_grouped_xyz = ret_grouped_xyz self.normalize_xyz = normalize_xyz self.sample_uniformly = sample_uniformly self.ret_unique_cnt = ret_unique_cnt if self.ret_unique_cnt: assert(self.sample_uniformly) def forward(self, xyz, new_xyz, features=None): # type: (QueryAndGroup, torch.Tensor. torch.Tensor, torch.Tensor) -> Tuple[Torch.Tensor] r""" Parameters ---------- xyz : torch.Tensor xyz coordinates of the features (B, N, 3) new_xyz : torch.Tensor centriods (B, npoint, 3) features : torch.Tensor Descriptors of the features (B, C, N) Returns ------- new_features : torch.Tensor (B, 3 + C, npoint, nsample) tensor """ idx = ball_query(self.radius, self.nsample, xyz, new_xyz) if self.sample_uniformly: unique_cnt = torch.zeros((idx.shape[0], idx.shape[1])) for i_batch in range(idx.shape[0]): for i_region in range(idx.shape[1]): unique_ind = torch.unique(idx[i_batch, i_region, :]) num_unique = unique_ind.shape[0] unique_cnt[i_batch, i_region] = num_unique sample_ind = torch.randint(0, num_unique, (self.nsample - num_unique,), dtype=torch.long) all_ind = torch.cat((unique_ind, unique_ind[sample_ind])) idx[i_batch, i_region, :] = all_ind xyz_trans = xyz.transpose(1, 2).contiguous() grouped_xyz = grouping_operation(xyz_trans, idx) # (B, 3, npoint, nsample) grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1) if self.normalize_xyz: grouped_xyz /= self.radius if features is not None: grouped_features = grouping_operation(features, idx) if self.use_xyz: new_features = torch.cat( [grouped_xyz, grouped_features], dim=1 ) # (B, C + 3, npoint, nsample) else: new_features = grouped_features else: assert ( self.use_xyz ), "Cannot have not features and not use xyz as a feature!" new_features = grouped_xyz ret = [new_features] if self.ret_grouped_xyz: ret.append(grouped_xyz) if self.ret_unique_cnt: ret.append(unique_cnt) if len(ret) == 1: return ret[0] else: return tuple(ret) class GroupAll(nn.Module): r""" Groups all features Parameters --------- """ def __init__(self, use_xyz=True, ret_grouped_xyz=False): # type: (GroupAll, bool) -> None super(GroupAll, self).__init__() self.use_xyz = use_xyz def forward(self, xyz, new_xyz, features=None): # type: (GroupAll, torch.Tensor, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor] r""" Parameters ---------- xyz : torch.Tensor xyz coordinates of the features (B, N, 3) new_xyz : torch.Tensor Ignored features : torch.Tensor Descriptors of the features (B, C, N) Returns ------- new_features : torch.Tensor (B, C + 3, 1, N) tensor """ grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) if features is not None: grouped_features = features.unsqueeze(2) if self.use_xyz: new_features = torch.cat( [grouped_xyz, grouped_features], dim=1 ) # (B, 3 + C, 1, N) else: new_features = grouped_features else: new_features = grouped_xyz return new_features ================================================ FILE: modules/third_party/pointnet2/pytorch_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
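# Thin wrappers around torch.nn building blocks (SharedMLP, Conv1d/2d/3d, FC,
# BatchNorm helpers, BNMomentumScheduler) used by the PointNet++ modules above;
# SharedMLP stacks 1x1 Conv2d (+ optional BN and activation) over the grouped
# (B, C, npoint, nsample) tensors produced by QueryAndGroup.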
''' Modified based on Ref: https://github.com/erikwijmans/Pointnet2_PyTorch ''' import torch import torch.nn as nn from typing import List, Tuple class SharedMLP(nn.Sequential): def __init__( self, args: List[int], *, bn: bool = False, activation=nn.ReLU(inplace=True), preact: bool = False, first: bool = False, name: str = "" ): super().__init__() for i in range(len(args) - 1): self.add_module( name + 'layer{}'.format(i), Conv2d( args[i], args[i + 1], bn=(not first or not preact or (i != 0)) and bn, activation=activation if (not first or not preact or (i != 0)) else None, preact=preact ) ) class _BNBase(nn.Sequential): def __init__(self, in_size, batch_norm=None, name=""): super().__init__() self.add_module(name + "bn", batch_norm(in_size)) nn.init.constant_(self[0].weight, 1.0) nn.init.constant_(self[0].bias, 0) class BatchNorm1d(_BNBase): def __init__(self, in_size: int, *, name: str = ""): super().__init__(in_size, batch_norm=nn.BatchNorm1d, name=name) class BatchNorm2d(_BNBase): def __init__(self, in_size: int, name: str = ""): super().__init__(in_size, batch_norm=nn.BatchNorm2d, name=name) class BatchNorm3d(_BNBase): def __init__(self, in_size: int, name: str = ""): super().__init__(in_size, batch_norm=nn.BatchNorm3d, name=name) class _ConvBase(nn.Sequential): def __init__( self, in_size, out_size, kernel_size, stride, padding, activation, bn, init, conv=None, batch_norm=None, bias=True, preact=False, name="" ): super().__init__() bias = bias and (not bn) conv_unit = conv( in_size, out_size, kernel_size=kernel_size, stride=stride, padding=padding, bias=bias ) init(conv_unit.weight) if bias: nn.init.constant_(conv_unit.bias, 0) if bn: if not preact: bn_unit = batch_norm(out_size) else: bn_unit = batch_norm(in_size) if preact: if bn: self.add_module(name + 'bn', bn_unit) if activation is not None: self.add_module(name + 'activation', activation) self.add_module(name + 'conv', conv_unit) if not preact: if bn: self.add_module(name + 'bn', bn_unit) if activation is not None: self.add_module(name + 'activation', activation) class Conv1d(_ConvBase): def __init__( self, in_size: int, out_size: int, *, kernel_size: int = 1, stride: int = 1, padding: int = 0, activation=nn.ReLU(inplace=True), bn: bool = False, init=nn.init.kaiming_normal_, bias: bool = True, preact: bool = False, name: str = "" ): super().__init__( in_size, out_size, kernel_size, stride, padding, activation, bn, init, conv=nn.Conv1d, batch_norm=BatchNorm1d, bias=bias, preact=preact, name=name ) class Conv2d(_ConvBase): def __init__( self, in_size: int, out_size: int, *, kernel_size: Tuple[int, int] = (1, 1), stride: Tuple[int, int] = (1, 1), padding: Tuple[int, int] = (0, 0), activation=nn.ReLU(inplace=True), bn: bool = False, init=nn.init.kaiming_normal_, bias: bool = True, preact: bool = False, name: str = "" ): super().__init__( in_size, out_size, kernel_size, stride, padding, activation, bn, init, conv=nn.Conv2d, batch_norm=BatchNorm2d, bias=bias, preact=preact, name=name ) class Conv3d(_ConvBase): def __init__( self, in_size: int, out_size: int, *, kernel_size: Tuple[int, int, int] = (1, 1, 1), stride: Tuple[int, int, int] = (1, 1, 1), padding: Tuple[int, int, int] = (0, 0, 0), activation=nn.ReLU(inplace=True), bn: bool = False, init=nn.init.kaiming_normal_, bias: bool = True, preact: bool = False, name: str = "" ): super().__init__( in_size, out_size, kernel_size, stride, padding, activation, bn, init, conv=nn.Conv3d, batch_norm=BatchNorm3d, bias=bias, preact=preact, name=name ) class FC(nn.Sequential): def __init__( 
self, in_size: int, out_size: int, *, activation=nn.ReLU(inplace=True), bn: bool = False, init=None, preact: bool = False, name: str = "" ): super().__init__() fc = nn.Linear(in_size, out_size, bias=not bn) if init is not None: init(fc.weight) if not bn: nn.init.constant_(fc.bias, 0) if preact: if bn: self.add_module(name + 'bn', BatchNorm1d(in_size)) if activation is not None: self.add_module(name + 'activation', activation) self.add_module(name + 'fc', fc) if not preact: if bn: self.add_module(name + 'bn', BatchNorm1d(out_size)) if activation is not None: self.add_module(name + 'activation', activation) def set_bn_momentum_default(bn_momentum): def fn(m): if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)): m.momentum = bn_momentum return fn class BNMomentumScheduler(object): def __init__( self, model, bn_lambda, last_epoch=-1, setter=set_bn_momentum_default ): if not isinstance(model, nn.Module): raise RuntimeError( "Class '{}' is not a PyTorch nn Module".format( type(model).__name__ ) ) self.model = model self.setter = setter self.lmbd = bn_lambda self.step(last_epoch + 1) self.last_epoch = last_epoch def step(self, epoch=None): if epoch is None: epoch = self.last_epoch + 1 self.last_epoch = epoch self.model.apply(self.setter(self.lmbd(epoch))) ================================================ FILE: modules/third_party/pointnet2/requirements_new.txt ================================================ accelerate==0.28.0 addict==2.4.0 antlr4-python3-runtime==4.9.3 appdirs==1.4.4 asttokens==2.4.1 attrs==23.2.0 blinker==1.7.0 certifi==2024.2.2 charset-normalizer==3.3.2 click==8.1.7 clip @ git+https://github.com/openai/CLIP.git@a1d071733d7111c9c014f024669f959182114e33 comm==0.2.2 ConfigArgParse==1.7 contourpy==1.2.0 cycler==0.12.1 dash==2.16.1 dash-core-components==2.0.0 dash-html-components==2.0.0 dash-table==5.0.0 decorator==5.1.1 docker-pycreds==0.4.0 einops==0.7.0 exceptiongroup==1.2.0 executing==2.0.1 fastjsonschema==2.19.1 filelock==3.13.3 Flask==3.0.2 fonttools==4.50.0 fsspec==2024.3.1 ftfy==6.2.0 fvcore==0.1.5.post20221221 gitdb==4.0.11 GitPython==3.1.42 huggingface-hub==0.22.1 hydra-core==1.3.2 idna==3.6 importlib_metadata==7.1.0 importlib_resources==6.4.0 iopath==0.1.10 ipython==8.18.1 ipywidgets==8.1.2 itsdangerous==2.1.2 jedi==0.19.1 Jinja2==3.1.3 joblib==1.3.2 jsonlines==4.0.0 jsonschema==4.21.1 jsonschema-specifications==2023.12.1 jupyter_core==5.7.2 jupyterlab_widgets==3.0.10 kiwisolver==1.4.5 MarkupSafe==2.1.5 matplotlib==3.8.3 matplotlib-inline==0.1.6 mpmath==1.3.0 nbformat==5.10.3 nest-asyncio==1.6.0 networkx==3.2.1 numpy==1.26.4 nvidia-cublas-cu11==11.11.3.6 nvidia-cublas-cu12==12.1.3.1 nvidia-cuda-cupti-cu11==11.8.87 nvidia-cuda-cupti-cu12==12.1.105 nvidia-cuda-nvrtc-cu11==11.8.89 nvidia-cuda-nvrtc-cu12==12.1.105 nvidia-cuda-runtime-cu11==11.8.89 nvidia-cuda-runtime-cu12==12.1.105 nvidia-cudnn-cu11==8.7.0.84 nvidia-cudnn-cu12==8.9.2.26 nvidia-cufft-cu11==10.9.0.58 nvidia-cufft-cu12==11.0.2.54 nvidia-curand-cu11==10.3.0.86 nvidia-curand-cu12==10.3.2.106 nvidia-cusolver-cu11==11.4.1.48 nvidia-cusolver-cu12==11.4.5.107 nvidia-cusparse-cu11==11.7.5.86 nvidia-cusparse-cu12==12.1.0.106 nvidia-nccl-cu11==2.19.3 nvidia-nccl-cu12==2.19.3 nvidia-nvjitlink-cu12==12.4.99 nvidia-nvtx-cu11==11.8.86 nvidia-nvtx-cu12==12.1.105 omegaconf==2.3.0 open3d==0.18.0 opencv-python==4.9.0.80 packaging==24.0 pandas==2.2.1 parso==0.8.3 pexpect==4.9.0 pillow==10.2.0 platformdirs==4.2.0 plotly==5.20.0 plyfile==1.0.3 pointnet2==3.0.0 portalocker==2.8.2 prompt-toolkit==3.0.43 
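# NOTE (assumption, not part of the original pin list): the "+cu118" torch/torchvision
# pins in this file are not hosted on PyPI, so installing it typically requires
# PyTorch's CUDA 11.8 wheel index, e.g.:
#   pip install -r requirements_new.txt --extra-index-url https://download.pytorch.org/whl/cu118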
protobuf==4.25.3 psutil==5.9.8 ptyprocess==0.7.0 pure-eval==0.2.2 Pygments==2.17.2 pyparsing==3.1.2 pyquaternion==0.9.9 python-dateutil==2.9.0.post0 pytz==2024.1 PyYAML==6.0.1 referencing==0.34.0 regex==2023.12.25 requests==2.31.0 retrying==1.3.4 rpds-py==0.18.0 safetensors==0.4.2 scikit-learn==1.4.1.post1 scipy==1.12.0 sentry-sdk==1.42.0 setproctitle==1.3.3 six==1.16.0 smmap==5.0.1 stack-data==0.6.3 sympy==1.12 tabulate==0.9.0 tenacity==8.2.3 termcolor==2.4.0 threadpoolctl==3.4.0 tokenizers==0.15.2 torch==2.2.0+cu118 torchvision==0.17.0+cu118 tqdm==4.66.2 traitlets==5.14.2 transformers==4.39.1 triton==2.2.0 typing_extensions==4.10.0 tzdata==2024.1 urllib3==2.2.1 wandb==0.16.4 wcwidth==0.2.13 Werkzeug==3.0.1 widgetsnbextension==4.0.10 yacs==0.1.8 zipp==3.18.1 ================================================ FILE: modules/third_party/pointnet2/setup.py ================================================ import glob import os import os.path as osp from setuptools import find_packages, setup from torch.utils.cpp_extension import BuildExtension, CUDAExtension _this_dir = osp.dirname(osp.abspath(__file__)) _ext_src_root = "_ext_src" _ext_sources = glob.glob("{}/src/*.cpp".format(_ext_src_root)) + glob.glob( "{}/src/*.cu".format(_ext_src_root) ) _ext_headers = glob.glob("{}/include/*".format(_ext_src_root)) requirements = ["torch>=1.4"] os.environ["TORCH_CUDA_ARCH_LIST"] = "3.7+PTX;5.0;6.0;6.1;6.2;7.0;7.5" exec(open("_version.py").read()) setup( name='pointnet2', version=__version__, packages=find_packages(), install_requires=requirements, ext_modules=[ CUDAExtension( name='pointnet2._ext', sources=_ext_sources, extra_compile_args={ "cxx": ["-O3"], "nvcc": ["-O3", "-Xfatbin", "-compress-all"], }, include_dirs=[osp.join(_this_dir, _ext_src_root, "include")], ) ], cmdclass={"build_ext": BuildExtension}, include_package_data=True, ) ================================================ FILE: modules/utils.py ================================================ import copy import einops import torch import torch.nn as nn import torch.nn.functional as F ######################################################### # General modules helpers ######################################################### def get_activation_fn(activation_type): if activation_type not in ["relu", "gelu", "glu"]: raise RuntimeError(f"activation function currently support relu/gelu, not {activation_type}") return getattr(F, activation_type) def get_mlp_head(input_size, hidden_size, output_size, dropout=0): return nn.Sequential(*[ nn.Linear(input_size, hidden_size), nn.ReLU(), nn.LayerNorm(hidden_size, eps=1e-12), nn.Dropout(dropout), nn.Linear(hidden_size, output_size) ]) def layer_repeat(module, N, share_layer=False): if share_layer: return nn.ModuleList([module] * N) else: return nn.ModuleList([copy.deepcopy(module) for _ in range(N - 1)] + [module]) ######################################################### # Specific modules helpers ######################################################### def calc_pairwise_locs(obj_centers, obj_whls, eps=1e-10, pairwise_rel_type='center', spatial_dist_norm=True, spatial_dim=5): if pairwise_rel_type == 'mlp': obj_locs = torch.cat([obj_centers, obj_whls], 2) pairwise_locs = torch.cat( [einops.repeat(obj_locs, 'b l d -> b l x d', x=obj_locs.size(1)), einops.repeat(obj_locs, 'b l d -> b x l d', x=obj_locs.size(1))], dim=3 ) return pairwise_locs pairwise_locs = einops.repeat(obj_centers, 'b l d -> b l 1 d') \ - einops.repeat(obj_centers, 'b l d -> b 1 l d') pairwise_dists = torch.sqrt(torch.sum(pairwise_locs ** 
2, 3) + eps) # (b, l, l) if spatial_dist_norm: max_dists = torch.max(pairwise_dists.view(pairwise_dists.size(0), -1), dim=1)[0] norm_pairwise_dists = pairwise_dists / einops.repeat(max_dists, 'b -> b 1 1') else: norm_pairwise_dists = pairwise_dists if spatial_dim == 1: return norm_pairwise_dists.unsqueeze(3) pairwise_dists_2d = torch.sqrt(torch.sum(pairwise_locs[..., :2] ** 2, 3) + eps) if pairwise_rel_type == 'center': pairwise_locs = torch.stack( [norm_pairwise_dists, pairwise_locs[..., 2] / pairwise_dists, pairwise_dists_2d / pairwise_dists, pairwise_locs[..., 1] / pairwise_dists_2d, pairwise_locs[..., 0] / pairwise_dists_2d], dim=3 ) elif pairwise_rel_type == 'vertical_bottom': bottom_centers = torch.clone(obj_centers) bottom_centers[:, :, 2] -= obj_whls[:, :, 2] bottom_pairwise_locs = einops.repeat(bottom_centers, 'b l d -> b l 1 d') \ - einops.repeat(bottom_centers, 'b l d -> b 1 l d') bottom_pairwise_dists = torch.sqrt(torch.sum(bottom_pairwise_locs ** 2, 3) + eps) # (b, l, l) bottom_pairwise_dists_2d = torch.sqrt(torch.sum(bottom_pairwise_locs[..., :2] ** 2, 3) + eps) pairwise_locs = torch.stack( [norm_pairwise_dists, bottom_pairwise_locs[..., 2] / bottom_pairwise_dists, bottom_pairwise_dists_2d / bottom_pairwise_dists, pairwise_locs[..., 1] / pairwise_dists_2d, pairwise_locs[..., 0] / pairwise_dists_2d], dim=3 ) if spatial_dim == 4: pairwise_locs = pairwise_locs[..., 1:] return pairwise_locs def calc_pairwise_locs_mv(obj_centers, pairwise_rel_type='center', spatial_dist_norm=True, spatial_dim=5): eps=1e-10 pairwise_locs = einops.repeat(obj_centers, 'b l d -> b l 1 d') \ - einops.repeat(obj_centers, 'b l d -> b 1 l d') pairwise_dists = torch.sqrt(torch.sum(pairwise_locs ** 2, 3) + eps) # (b, l, l) if spatial_dist_norm: max_dists = torch.max(pairwise_dists.view(pairwise_dists.size(0), -1), dim=1)[0] norm_pairwise_dists = pairwise_dists / einops.repeat(max_dists, 'b -> b 1 1') else: norm_pairwise_dists = pairwise_dists if spatial_dim == 1: return norm_pairwise_dists.unsqueeze(3) pairwise_dists_2d = torch.sqrt(torch.sum(pairwise_locs[..., :2] ** 2, 3) + eps) if pairwise_rel_type == 'center': pairwise_locs = torch.stack( [norm_pairwise_dists, pairwise_locs[..., 2] / pairwise_dists, pairwise_dists_2d / pairwise_dists, pairwise_locs[..., 1] / pairwise_dists_2d, pairwise_locs[..., 0] / pairwise_dists_2d], dim=3 ) if spatial_dim == 4: pairwise_locs = pairwise_locs[..., 1:] return pairwise_locs # TODO: need to generalize this function to more use cases to be in modules/utils.py def get_mixup_function(mixup_strategy, mixup_stage1, mixup_stage2): if mixup_strategy is None: return None assert mixup_strategy in ['linear_decay', 'all_mixup'] if mixup_strategy == 'linear_decay': return LinearDecayMixup(mixup_stage1, mixup_stage2) elif mixup_strategy == 'all_mixup': return AllMixup() class AllMixup(nn.Module): def __init__(self) -> None: super().__init__() def forward(self, obj_sem_cls_pred, obj_labels, cur_step, total_steps): mixup_sem_cls_pred = torch.zeros_like(obj_sem_cls_pred) for i in range(mixup_sem_cls_pred.shape[0]): for j in range(mixup_sem_cls_pred.shape[1]): if obj_labels[i, j] >= 0: mixup_sem_cls_pred[i, j, obj_labels[i, j]] = 1.0 return mixup_sem_cls_pred class LinearDecayMixup(nn.Module): def __init__(self, mixup_stage1, mixup_stage2) -> None: super().__init__() self.stage1_rate = mixup_stage1 self.stage2_rate = mixup_stage2 assert self.stage2_rate > self.stage1_rate def forward(self, obj_sem_cls_pred, obj_labels, cur_step, total_steps): if cur_step < total_steps * self.stage1_rate: 
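# Worked example of the schedule computed below (illustrative numbers, not from any
# shipped config): with mixup_stage1=0.3, mixup_stage2=0.6 and total_steps=1000,
#   mixup_ratio = 1.0                 for step < 300,
#   mixup_ratio = (600 - step) / 300  for 300 <= step < 600 (linear decay 1.0 -> 0.0),
#   mixup_ratio = 0.0                 for step >= 600.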
mixup_ratio = 1.0 elif cur_step < total_steps * self.stage2_rate: mixup_ratio = (total_steps * self.stage2_rate - cur_step) / ( (self.stage2_rate - self.stage1_rate) * total_steps) else: mixup_ratio = 0.0 # mixup mixup_sem_cls_pred = obj_sem_cls_pred.clone() # B, O, 607 random_number = torch.rand(mixup_sem_cls_pred.shape[0:2]) # B, O mixup_mask = random_number < mixup_ratio for i in range(mixup_sem_cls_pred.shape[0]): for j in range(mixup_sem_cls_pred.shape[1]): if mixup_mask[i, j] and obj_labels[i, j] >= 0: mixup_sem_cls_pred[i, j, :] = 0.0 mixup_sem_cls_pred[i, j, obj_labels[i, j]] = 1.0 return mixup_sem_cls_pred ================================================ FILE: modules/vision/__init__.py ================================================ from .pcd_openvocab_encoder import * from .obj_cls_encoder import * ================================================ FILE: modules/vision/obj_cls_encoder.py ================================================ import torch.nn as nn from modules.build import VISION_REGISTRY from modules.utils import get_mlp_head @VISION_REGISTRY.register() class ObjClsEncoder(nn.Module): def __init__(self, cfg, input_feat_size=768, hidden_size=768, tgt_cls_num=607): super().__init__() self.cfg = cfg self.vis_cls_head = get_mlp_head(input_feat_size, hidden_size // 2, tgt_cls_num, dropout=0.3) def forward(self, obj_feats, **kwargs): obj_logits = self.vis_cls_head(obj_feats) return obj_logits ================================================ FILE: modules/vision/pcd_openvocab_encoder.py ================================================ import os import glob import einops import torch import torch.nn as nn import torch.nn.functional as F from modules.build import VISION_REGISTRY from modules.layers.pointnet import PointNetPP from modules.layers.transformers import TransformerSpatialEncoderLayer from modules.utils import get_mlp_head, layer_repeat, calc_pairwise_locs, get_mixup_function from modules.weights import _init_weights_bert @VISION_REGISTRY.register() class PointOpenVocabEncoder(nn.Module): def __init__(self, cfg, backbone='pointnet++', hidden_size=768, path=None, freeze=False, dim_feedforward=2048, num_attention_heads=12, spatial_dim=5, num_layers=4, dim_loc=6, pairwise_rel_type='center', use_matmul_label=False, mixup_strategy=None, mixup_stage1=None, mixup_stage2=None, lang_type='bert', lang_path=None, attn_type='spatial'): super().__init__() assert backbone in ['pointnet++'] # build backbone if backbone == 'pointnet++': self.point_feature_extractor = PointNetPP( sa_n_points=[32, 16, None], sa_n_samples=[32, 32, None], sa_radii=[0.2, 0.4, None], sa_mlps=[[3, 64, 64, 128], [128, 128, 128, 256], [256, 256, 512, 768]], ) elif backbone == 'pointnext': # NOTE: unreachable under the assert above; requires a PointNextEncoder implementation that is not bundled with this repo self.point_feature_extractor = PointNextEncoder( blocks=[1, 1, 1, 1, 1, 1], strides=[1, 2, 2, 2, 2, 1], sa_layers=2, sa_use_res=True, width=32, radius=0.15, radius_scaling=1.5, mlp_head=[1024, 768] if lang_type == 'bert' else [] ) # Open vocab grounding head vocab_file_name = f"scannet_607_{'bert-base-uncased' if lang_type == 'bert' else 'clip-ViT-B16'}_id.pth" self.register_buffer("text_features", torch.load(os.path.join(lang_path, vocab_file_name))) self.point_cls_head = lambda x: x @ self.text_features.t() self.dropout = nn.Dropout(0.1) self.attn_type = attn_type # freeze feature self.freeze = freeze if freeze: for p in self.parameters(): p.requires_grad = False # build semantic cls embeds self.sem_cls_embed_layer = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.LayerNorm(hidden_size), nn.Dropout(0.1)) # 
self.int2cat = json.load( # open(os.path.join(glove_path, "annotations/meta_data/scannetv2_raw_categories.json"), 'r')) # self.cat2int = {w: i for i, w in enumerate(self.int2cat)} # self.cat2vec = json.load(open(os.path.join(glove_path, "annotations/meta_data/cat2glove42b.json"), 'r')) # self.register_buffer("int2mat", torch.ones(607, 300)) # for i in range(607): # self.int2mat[i, :] = torch.Tensor(self.cat2vec[self.int2cat[i]]) self.use_matmul_label = use_matmul_label # build mask embedes self.sem_mask_embeddings = nn.Embedding(1, 768) # build spatial encoder layer if self.attn_type == 'spatial': pc_encoder_layer = TransformerSpatialEncoderLayer(hidden_size, num_attention_heads, dim_feedforward=dim_feedforward, dropout=0.1, activation='gelu', spatial_dim=spatial_dim, spatial_multihead=True, spatial_attn_fusion='cond') self.spatial_encoder = layer_repeat(pc_encoder_layer, num_layers) loc_layer = nn.Sequential( nn.Linear(dim_loc, hidden_size), nn.LayerNorm(hidden_size), ) self.loc_layers = layer_repeat(loc_layer, 1) self.pairwise_rel_type = pairwise_rel_type self.spatial_dim = spatial_dim else: pass # # build mixup strategy # self.mixup_strategy = mixup_strategy # self.mixup_function = get_mixup_function(mixup_strategy, mixup_stage1, mixup_stage2) # load weights self.apply(_init_weights_bert) if path is not None: # pre_dict = {} # for name, p in self.named_parameters(): # pre_dict[name] = p # TODO: change this to accelerator loading multiple model files print("loaded") ckpts = glob.glob(os.path.join(path, '*.bin')) if len(ckpts) != 0: for ckpt in ckpts: state_dict = torch.load(ckpt, map_location='cpu') self.load_state_dict(state_dict, strict=False) print("loaded checkpoint files") elif path.endswith('.pth'): print("loaded checkpoint file") state_dict = torch.load(path) self.load_state_dict(state_dict, strict=False) # for name, p in self.named_parameters(): # if name in state_dict.keys(): # print(name, pre_dict[name] - layer_repeat(p, 1)) # exit() def freeze_bn(self, m): for layer in m.modules(): if isinstance(layer, nn.BatchNorm2d): layer.eval() def forward(self, obj_pcds, obj_locs, obj_masks, obj_sem_masks, obj_labels=None, cur_step=None, max_steps=None, **kwargs): if self.freeze: self.freeze_bn(self.point_feature_extractor) # get obj_embdes batch_size, num_objs, _, _ = obj_pcds.size() obj_embeds = self.point_feature_extractor(einops.rearrange(obj_pcds, 'b o p d -> (b o) p d')) # obj_sem_embeds = self.sem_cls_embed_layer(obj_embeds) # obj_sem_embeds = einops.rearrange(obj_sem_embeds, '(b o) d -> b o d', b=batch_size) obj_embeds = einops.rearrange(obj_embeds, '(b o) d -> b o d', b=batch_size) obj_embeds = self.dropout(obj_embeds) if self.freeze: obj_embeds = obj_embeds.detach() # get semantic cls embeds obj_sem_cls = F.softmax(self.point_cls_head(obj_embeds), dim=2).detach() # TODO: check if this sem_cls is still needed, switch this to cross attention # if self.mixup_strategy != None: # obj_sem_cls_mix = self.mixup_function(obj_sem_cls, obj_labels, cur_step, max_steps) # else: # obj_sem_cls_mix = obj_sem_cls # if self.use_matmul_label: # obj_sem_cls_embeds = torch.matmul(obj_sem_cls_mix, self.int2mat) # N, O, 607 matmul ,607, 300 # else: # obj_sem_cls_mix = torch.argmax(obj_sem_cls_mix, dim=2) # obj_sem_cls_embeds = torch.Tensor( # [self.cat2vec[self.int2cat[int(i)]] for i in obj_sem_cls_mix.view(batch_size * num_objs)]) # obj_sem_cls_embeds = obj_sem_cls_embeds.view(batch_size, num_objs, 300).cuda() # obj_sem_cls_embeds = self.sem_cls_embed_layer(obj_sem_cls_embeds) # obj_embeds = 
obj_embeds + obj_sem_embeds # # get semantic mask embeds # obj_embeds = obj_embeds.masked_fill(obj_sem_masks.unsqueeze(2).logical_not(), 0.0) # obj_sem_mask_embeds = self.sem_mask_embeddings( # torch.zeros((batch_size, num_objs)).long().cuda() # ) * obj_sem_masks.logical_not().unsqueeze(2) # obj_embeds = obj_embeds + obj_sem_mask_embeds # record pre embedes # note: in our implementation, there are three types of embds, raw embeds from PointNet, # pre embeds after tokenization, post embeds after transformers obj_embeds_pre = obj_embeds # spatial reasoning, spatial attention transformer if self.attn_type == 'spatial': pairwise_locs = calc_pairwise_locs(obj_locs[:, :, :3], obj_locs[:, :, 3:], pairwise_rel_type=self.pairwise_rel_type, spatial_dist_norm=True, spatial_dim=self.spatial_dim) for i, pc_layer in enumerate(self.spatial_encoder): query_pos = self.loc_layers[0](obj_locs) obj_embeds = obj_embeds + query_pos obj_embeds, self_attn_matrices = pc_layer(obj_embeds, pairwise_locs, tgt_key_padding_mask=obj_masks.logical_not()) else: pass return obj_embeds, obj_embeds_pre, obj_sem_cls ================================================ FILE: modules/weights.py ================================================ import torch.nn as nn def _init_weights_bert(module, std=0.02): """ Huggingface transformer weight initialization, most commonly for bert initialization """ if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) ================================================ FILE: optim/__init__.py ================================================ from .loss import * ================================================ FILE: optim/build.py ================================================ from optim.loss.loss import Loss from optim.optimizer.optim import get_optimizer from optim.scheduler import get_scheduler def build_optim(cfg, params, total_steps): loss = Loss(cfg) optimizer = get_optimizer(cfg, params) scheduler = get_scheduler(cfg, optimizer, total_steps) return loss, optimizer, scheduler ================================================ FILE: optim/loss/__init__.py ================================================ from .contra_loss import * ================================================ FILE: optim/loss/contra_loss.py ================================================ import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from optim.loss.loss import LOSS_REGISTRY from einops import rearrange, einsum from common.dist_utils import all_gather @LOSS_REGISTRY.register() class TextObjWithinBatch(nn.Module): def __init__(self, cfg): super().__init__() self.distributed = cfg.num_gpu > 1 self.bce = cfg.task in ["ScanQA"] def forward(self, data_dict): obj_feats = data_dict["intra_obj_embeds"] # B, O, D text_feats = data_dict["intra_text_embed"] # B, D, feature of CLS token labels = data_dict["tgt_object_id"] # B, 1 masks = data_dict["obj_masks"] # B*L vs. 
B in per-scene scenario if obj_feats.shape[0] != masks.shape[0]: masks = masks.unsqueeze(1).repeat(1, int(obj_feats.shape[0] / masks.shape[0]), 1).view(-1, masks.shape[1]) labels = labels.view(-1, 1) obj_feats = F.normalize(obj_feats, dim=-1, p=2) text_feats = F.normalize(text_feats, dim=-1, p=2) obj2text_logits = einsum(obj_feats, text_feats, "b o d, b d -> b o") labels = labels.squeeze(-1) if self.bce: loss = F.binary_cross_entropy_with_logits(obj2text_logits, labels.float(), reduction="sum", weight=masks) / float(labels.shape[0]) else: obj2text_logits.masked_fill_(masks.logical_not(), -float('inf')) loss = F.cross_entropy(obj2text_logits, labels) return loss @LOSS_REGISTRY.register() class TextObjBetweenBatch(nn.Module): def __init__(self, cfg): super().__init__() self.distributed = cfg.num_gpu > 1 self.logit_scale = nn.Parameter((torch.ones([]) * np.log(1 / 0.07)).exp()) def forward(self, data_dict): logit_scale = torch.clamp(self.logit_scale, max=100) obj_feats = data_dict["inter_obj_embeds"] # B, O, D text_feats = data_dict["inter_text_embed"] # B, D, feature of CLS token labels = data_dict["tgt_object_id"] # B, 1 if obj_feats.shape[0] != labels.shape[0]: labels = labels.view(-1, 1) tgt_obj_feats = obj_feats[torch.arange(labels.size(0)), labels[:, 0], :] # B, D tgt_obj_feats = F.normalize(tgt_obj_feats, dim=-1, p=2) text_feats = F.normalize(text_feats, dim=-1, p=2) if self.distributed: tgt_obj_feats, text_feats = all_gather([ tgt_obj_feats, text_feats ]) pseudo_labels = torch.arange(text_feats.shape[0]).to(text_feats.device) # B, text2obj_logits = logit_scale * text_feats @ tgt_obj_feats.t() # B, B obj2text_logits = logit_scale * tgt_obj_feats @ text_feats.t() # B, B t2o = F.cross_entropy(text2obj_logits, pseudo_labels) o2t = F.cross_entropy(obj2text_logits, pseudo_labels) loss = (t2o + o2t) / 2 return loss @LOSS_REGISTRY.register() class TextSceneBetweenBatch(nn.Module): def __init__(self, cfg): super().__init__() self.distributed = cfg.num_gpu > 1 self.logit_scale = nn.Parameter((torch.ones([]) * np.log(1 / 0.07)).exp()) def forward(self, data_dict): logit_scale = torch.clamp(self.logit_scale, max=100) scene_feats = data_dict["scene_embed"] # B, O, D text_feats = data_dict["scene_text_embed"] # B, D, feature of CLS token scene_feats = F.normalize(scene_feats, dim=-1, p=2) text_feats = F.normalize(text_feats, dim=-1, p=2) if self.distributed: scene_feats, text_feats = all_gather([ scene_feats, text_feats ]) pseudo_labels = torch.arange(text_feats.shape[0]).to(text_feats.device) # B, text2scene_logits = logit_scale * text_feats @ scene_feats.t() # B, B scene2text_logits = logit_scale * scene_feats @ text_feats.t() # B, B t2s = F.cross_entropy(text2scene_logits, pseudo_labels) s2t = F.cross_entropy(scene2text_logits, pseudo_labels) loss = (t2s + s2t) / 2 return loss if __name__ == "__main__": # smoke test; the keys must match the ones read in the losses above B, O, D = 32, 10, 512 data_dict = { "intra_obj_embeds": torch.randn(B, O, D), "intra_text_embed": torch.randn(B, D), "inter_obj_embeds": torch.randn(B, O, D), "inter_text_embed": torch.randn(B, D), "tgt_object_id": torch.randint(0, O, (B, 1)), "obj_masks": torch.ones(B, O).bool(), } from omegaconf import OmegaConf cfg = OmegaConf.create({"num_gpu": 1}) text2obj_loss = TextObjWithinBatch(cfg)(data_dict) obj2text_loss = TextObjBetweenBatch(cfg)(data_dict) ================================================ FILE: optim/loss/loss.py ================================================ import torch.nn as nn import torch.nn.functional as F from fvcore.common.registry import Registry LOSS_REGISTRY = Registry("loss") def og3d_loss(data_dict): return 
F.cross_entropy(data_dict["og3d_logits"], data_dict["tgt_object_id"].squeeze(1)) def og3d_multi_loss(data_dict): return F.binary_cross_entropy_with_logits( data_dict["og3d_logits"], data_dict["tgt_object_id"].float(), reduction="sum") / float(data_dict["tgt_object_id"].shape[0]) def txt_cls_multi_loss(data_dict): return F.binary_cross_entropy_with_logits( data_dict["txt_cls_logits"], data_dict["tgt_object_label"].float(), reduction='sum') / float(data_dict["tgt_object_label"].shape[0]) def obj_cls_raw_loss(data_dict): return ( F.cross_entropy( data_dict["obj_cls_raw_logits"].permute(0, 2, 1), data_dict["obj_labels"], reduction='none' ) * data_dict["obj_masks"] ).sum() / data_dict["obj_masks"].sum() def obj_cls_pre_loss(data_dict): return ( F.cross_entropy( data_dict["obj_cls_pre_logits"].permute(0, 2, 1), data_dict["obj_labels"], reduction='none' ) * data_dict["obj_masks"] ).sum() / data_dict["obj_masks"].sum() def obj_cls_post_loss(data_dict): return ( F.cross_entropy( data_dict["obj_cls_post_logits"].permute(0, 2, 1), data_dict["obj_labels"], reduction='none' ) * data_dict["obj_masks"] ).sum() / data_dict["obj_masks"].sum() def answer_loss(data_dict): return F.binary_cross_entropy_with_logits( data_dict["answer_scores"], data_dict["answer_label"].float(), reduction='sum' ) / data_dict["answer_scores"].shape[0] def lm_cls_loss(data_dict): target_labels = data_dict["masked_lm_labels"] target_labels = target_labels.view(-1, target_labels.size(-1)) if len(target_labels.size()) == 3 else target_labels return F.cross_entropy( data_dict["txt_lm_cls_logits"].permute(0, 2, 1), target_labels, ignore_index=-1 ) def obj_cls_pre_loss_mask(data_dict): return ( F.cross_entropy( data_dict["obj_cls_pre_logits"].permute(0, 2, 1), data_dict["obj_labels"], reduction='none' ) * data_dict["obj_masks"] * data_dict["obj_sem_masks"].logical_not() ).sum() / (data_dict["obj_masks"] * data_dict["obj_sem_masks"].logical_not()).sum() def obj_cls_pre_loss_unmask(data_dict): return ( F.cross_entropy( data_dict["obj_cls_pre_logits"].permute(0, 2, 1), data_dict["obj_labels"], reduction='none' ) * data_dict["obj_masks"] * data_dict["obj_sem_masks"] ).sum() / (data_dict["obj_masks"] * data_dict["obj_sem_masks"]).sum() def obj_cls_post_loss_mask(data_dict): return ( F.cross_entropy( data_dict["obj_cls_post_logits"].permute(0, 2, 1), data_dict["obj_labels"], reduction='none' ) * data_dict["obj_masks"] * data_dict["obj_sem_masks"].logical_not() ).sum() / (data_dict["obj_masks"] * data_dict["obj_sem_masks"].logical_not()).sum() def obj_cls_post_loss_unmask(data_dict): return ( F.cross_entropy( data_dict["obj_cls_post_logits"].permute(0, 2, 1), data_dict["obj_labels"], reduction='none' ) * data_dict["obj_masks"] * data_dict["obj_sem_masks"] ).sum() / (data_dict["obj_masks"] * data_dict["obj_sem_masks"]).sum() def obj_cls_loss(data_dict, smoothing=0.3): return ( F.cross_entropy( data_dict["obj_logits"].permute(0, 2, 1), data_dict["obj_labels"], reduction='none', label_smoothing=smoothing ) * data_dict["obj_masks"] ).sum() / data_dict["obj_masks"].sum() def mse_loss(data_dict): return ( ((data_dict["pred_images"] - data_dict["target_images"]) ** 2).mean() ) class Loss(nn.Module): def __init__(self, cfg): # e.g. 
refer_loss_v1: ["og3d_loss", "txt_cls_loss", "obj_cls_raw_loss", "obj_cls_pre_loss", "obj_cls_post_loss"] # qa_loss_v1: ["og3d_loss", "txt_cls_loss", "obj_cls_raw_loss", "obj_cls_pre_loss", "obj_cls_post_loss", "answer_loss"] # pretrain_loss_v1: ["lm_cls_loss", "obj_cls_raw_loss", "obj_cls_pre_loss", "obj_cls_post_loss", "obj_cls_pre_loss_mask", # "obj_cls_pre_loss_unmask", "obj_cls_post_loss_mask", "obj_cls_post_loss_unmask"] super().__init__() self.all_keys = list(set(cfg.model.vis_loss_list + cfg.model.loss_list)) self.selected_keys = cfg.model.loss_list self.loss_fn = {} for k in self.all_keys: if k in globals().keys(): self.loss_fn[k] = globals()[k] print(f"Using {k} from loss.globals()") else: self.loss_fn[k] = LOSS_REGISTRY.get(k)(cfg) setattr(self, k, self.loss_fn[k]) # register the loss module so its parameters are moved to the same device as the model print(f"Using {k} from Registry {LOSS_REGISTRY._name}") def forward(self, data_dict): all_losses = {} for k, fn in self.loss_fn.items(): if k == 'txt_cls_loss' and 'txt_cls_label' not in data_dict: # compatible with old version of txt_cls_loss data_dict['txt_cls_label'] = data_dict["tgt_object_label"].squeeze(1) cur_loss = fn(data_dict) if not isinstance(cur_loss, list): cur_dict_loss = {k: cur_loss} else: cur_dict_loss = {k: cur_loss[0]} for ck, cv in cur_loss[1].items(): cur_dict_loss[k + "_" + ck] = cv all_losses.update(cur_dict_loss) selected_losses = {k: all_losses[k] for k in self.selected_keys} total_loss = sum(selected_losses.values()) all_losses["total_loss"] = total_loss return total_loss, all_losses ================================================ FILE: optim/optimizer/__init__.py ================================================ ================================================ FILE: optim/optimizer/optim.py ================================================ import torch.optim as optim from fvcore.common.registry import Registry OPTIM_REGISTRY = Registry("optimizer") from common.type_utils import cfg2dict def get_optimizer(cfg, params): if getattr(optim, cfg.solver.optim.name, None) is not None: optimizer = getattr(optim, cfg.solver.optim.name)(params, **cfg2dict(cfg.solver.optim.args)) else: optimizer = OPTIM_REGISTRY.get(cfg.solver.optim.name)(params, **cfg2dict(cfg.solver.optim.args)) return optimizer ================================================ FILE: optim/scheduler.py ================================================ import math from torch.optim.lr_scheduler import LambdaLR def warmup_cosine(step, warmup_step, total_step, minimum_ratio=1e-5): if step <= warmup_step and warmup_step > 0: return step / warmup_step return max( 0.5 * (1 + math.cos((step - warmup_step) / (total_step - warmup_step) * math.pi)), minimum_ratio ) def warmup_exp(step, warmup_step, total_step, **kwargs): if step <= warmup_step and warmup_step > 0: return step / warmup_step return kwargs["gamma"] ** (step * 1. 
/ (total_step - warmup_step)) def get_scheduler(cfg, optimizer, total_steps): warmup_steps = cfg.solver.sched.args.warmup_steps * cfg.num_gpu minimum_ratio = cfg.solver.sched.args.get("minimum_ratio", 1e-5) lambda_func = lambda step: globals()[cfg.solver.sched.name]( step, warmup_steps, total_steps, minimum_ratio=minimum_ratio ) return LambdaLR(optimizer=optimizer, lr_lambda=lambda_func) ================================================ FILE: optim/utils.py ================================================ def no_decay_param_group(parameters, lr): no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] decay_params = [] no_decay_params = [] for n, p in parameters: if not p.requires_grad: continue if not any(nd in n for nd in no_decay): decay_params.append(p) else: no_decay_params.append(p) optimizer_grouped_parameters = [ {'params': decay_params, 'weight_decay': 0.01, 'lr': lr}, {'params': no_decay_params, 'weight_decay': 0.0, 'lr': lr} ] return optimizer_grouped_parameters ================================================ FILE: preprocess/README.md ================================================ ## Data Processing We have released the preprocessing scripts for 3RScan, MultiScan, ARKitScenes and Structured3D. They are designed to provide a comprehensive framework for data preparation. Taking 3RScan as an example, the process involves the following steps: - Import raw meshes and annotations from each dataset. - Extract vertices from the mesh and assign both instance and semantic labels to them. - Map the dataset-specific semantic labels to the ScanNet 607 label set. This is optional for SceneVerse training but may be required for closed-vocabulary training ([example](https://github.com/scene-verse/SceneVerse/blob/b936f96b61614bec32282e5eed7de844d1a7a330/preprocess/rscan.py#L58)). - Axis Alignment: Rotate the 3D point clouds so that most 3D object bounding boxes are axis-aligned. This follows ScanRefer and is currently implemented as a heuristic search ([example](https://github.com/scene-verse/SceneVerse/blob/b936f96b61614bec32282e5eed7de844d1a7a330/preprocess/rscan.py#L95)). - Translation Alignment: Translate the 3D point clouds so that the origin is at the center of the floor ([example](https://github.com/scene-verse/SceneVerse/blob/b936f96b61614bec32282e5eed7de844d1a7a330/preprocess/rscan.py#L102)). - Color Alignment: Color values should be within the [0, 255] range ([example](https://github.com/scene-verse/SceneVerse/blob/b936f96b61614bec32282e5eed7de844d1a7a330/preprocess/rscan.py#L98)). - Point subsampling: subsample the point clouds if the number of points exceeds 240K. ```python PTS_LIMIT = 240000 if out_points.shape[0] > PTS_LIMIT: pcd_idxs = np.random.choice(out_points.shape[0], size=PTS_LIMIT, replace=False) out_points = out_points[pcd_idxs] out_colors = out_colors[pcd_idxs] instance_labels = instance_labels[pcd_idxs] ``` The detailed steps may vary between datasets. Please note that translation and color alignment are critical, as they can significantly impact performance. Axis alignment requires 3D bounding box annotations; omitting it may result in slight but not severe degradation. ### 3RScan To reproduce the data preprocessing, download [3RScan](https://waldjohannau.github.io/RIO/) and run: ```shell # Preprocess 3RScan $ python rscan.py ``` Adjust the `data_root`, `save_root` and `num_workers` accordingly.
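To sanity-check a processed scan, the saved tensors can be loaded back directly. A minimal sketch (assuming the `save_root` layout produced by the processors above; `scene_id` is a placeholder for a real scan id):

```python
import torch
import numpy as np

scan_data = '/output/path/to/3RScan/scan_data'  # save_root from the config above (assumption)
scene_id = 'scene_id'  # placeholder: replace with an actual scan id

# (vertices, colors, instance labels) tuple as saved by the processors
pts, colors, instance_labels = torch.load(f'{scan_data}/pcd_with_global_alignment/{scene_id}.pth')
inst_to_label = torch.load(f'{scan_data}/instance_id_to_label/{scene_id}.pth')

assert colors.min() >= 0 and colors.max() <= 255   # color alignment
print('min z (should be ~0 after translation alignment):', np.min(pts[:, 2]))
print('some instances:', {i: inst_to_label[i] for i in sorted(inst_to_label)[:5]})
```

For a full visual check, `visualize_data.py` at the repository root can render the same files as colored point clouds.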
### HM3D Some users have requested the mapping from the HM3D object ids used in SceneVerse to HM3D-semantics, so we have added an additional file ([HM3D_tgtID2objID.zip](assets/HM3D_tgtID2objID.zip)) that provides it. The json file for each scene contains a dictionary of ```{tgt_id: [hm3d_objid, hm3d_label]}```. * Note: The script ```sceneverse2hmsemantic.py``` has been deprecated, as it cannot reproduce the mappings above. It is kept only to show how we read the semantics from the annotations in HM3D-semantics. ## Prepare for your custom datasets To prepare your custom data for inference, follow the previous steps and the example script for 3RScan. A convenient way to verify the result is `visualize_data.py`. If everything is correct, the colored point clouds should be displayed similarly to those in the released version of SceneVerse. ## Scene Graph Generation We also release the [scripts](preprocess/ssg/README.md) for 3D scene graph generation. ================================================ FILE: preprocess/__init__.py ================================================ from .build import * from .utils import * from .rscan import * from .multiscan import * from .arkitscenes import * ================================================ FILE: preprocess/arkitscenes.py ================================================ import json from glob import glob from omegaconf import OmegaConf from joblib import Parallel, delayed, parallel_backend import torch import numpy as np import trimesh from tqdm import tqdm from scipy.spatial.transform import Rotation from preprocess.build import ProcessorBase from preprocess.utils.label_convert import ARKITSCENE_SCANNET as label_convert from preprocess.utils.align_utils import compute_box_3d, calc_align_matrix, rotate_z_axis_by_degrees from preprocess.utils.constant import * class ARKitScenesProcessor(ProcessorBase): def record_splits(self, scan_ids): split_dir = self.save_root / 'split' split_dir.mkdir(exist_ok=True) if (split_dir / 'train_split.txt').exists() and (split_dir / 'val_split.txt').exists(): return split = { 'train': [], 'val': []} split['train'] = [scan_id[1] for scan_id in scan_ids if scan_id[0] == 'Training'] split['val'] = [scan_id[1] for scan_id in scan_ids if scan_id[0] == 'Validation'] for _s, _c in split.items(): with open(split_dir / f'{_s}_split.txt', 'w', encoding='utf-8') as fp: fp.write('\n'.join(_c)) def read_all_scans(self): scan_ids = [] for split in ['Training', 'Validation']: scan_paths = glob(str(self.data_root) + f'/{split}/*') scan_ids.extend([(split, path.split('/')[-1]) for path in scan_paths]) return scan_ids def process_point_cloud(self, scan_id, plydata, annotations): vertices = plydata.vertices vertex_colors = plydata.visual.vertex_colors vertex_colors = vertex_colors[:, :3] vertex_instance = np.zeros((vertices.shape[0])) inst_to_label = {} bbox_list = [] for _i, label_info in enumerate(annotations["data"]): obj_label = label_info["label"] object_id = _i + 1 rotation = np.array(label_info["segments"]["obbAligned"]["normalizedAxes"]).reshape(3, 3) r = Rotation.from_matrix(rotation) transform = np.array(label_info["segments"]["obbAligned"]["centroid"]).reshape(-1, 3) scale = np.array(label_info["segments"]["obbAligned"]["axesLengths"]).reshape(-1, 3) trns = np.eye(4) trns[0:3, 3] = transform trns[0:3, 0:3] = rotation.T box_trimesh_fmt = trimesh.creation.box(scale.reshape(3,), trns) obj_containment = np.argwhere(box_trimesh_fmt.contains(vertices)) vertex_instance[obj_containment] = object_id inst_to_label[object_id] = 
label_convert[obj_label] box3d = compute_box_3d(scale.reshape(3).tolist(), transform, rotation) bbox_list.append(box3d) if len(bbox_list) == 0: return align_angle = calc_align_matrix(bbox_list) vertices = rotate_z_axis_by_degrees(np.array(vertices), align_angle) if np.max(vertex_colors) <= 1: vertex_colors = vertex_colors * 255.0 center_points = np.mean(vertices, axis=0) center_points[2] = np.min(vertices[:, 2]) vertices = vertices - center_points assert vertex_colors.shape == vertices.shape assert vertex_colors.shape[0] == vertex_instance.shape[0] if self.check_key(self.output.pcd): torch.save(inst_to_label, self.inst2label_path / f"{scan_id}.pth") torch.save((vertices, vertex_colors, vertex_instance), self.pcd_path / f"{scan_id}.pth") np.save(self.pcd_path / f"{scan_id}_align_angle.npy", align_angle) def scene_proc(self, scan_id): split = scan_id[0] scan_id = scan_id[1] data_root = self.data_root / split / scan_id if not (data_root / f'{scan_id}_3dod_mesh.ply').exists(): return if not (data_root / f'{scan_id}_3dod_annotation.json').exists(): return plydata = trimesh.load(data_root / f'{scan_id}_3dod_mesh.ply', process=False) with open((data_root / f'{scan_id}_3dod_annotation.json'), "r", encoding='utf-8') as f: annotations = json.load(f) # process point cloud self.process_point_cloud(scan_id, plydata, annotations) def process_scans(self): scan_ids = self.read_all_scans() self.log_starting_info(len(scan_ids)) if self.num_workers > 1: with parallel_backend('multiprocessing', n_jobs=self.num_workers): Parallel()(delayed(self.scene_proc)(scan_id) for scan_id in tqdm(scan_ids)) else: for scan_id in tqdm(scan_ids): self.scene_proc(scan_id) if __name__ == '__main__': cfg = OmegaConf.create({ 'data_root': '/path/to/ARKitScenes', 'save_root': '/output/path/to/ARKitScenes', 'num_workers': 1, 'output': { 'pcd': True, } }) processor = ARKitScenesProcessor(cfg) processor.process_scans() ================================================ FILE: preprocess/build.py ================================================ from pathlib import Path from fvcore.common.registry import Registry PROCESSOR_REGISTRY = Registry("Processor") class ProcessorBase: def __init__(self, cfg): self.data_root = Path(cfg.data_root) self.save_root = Path(cfg.save_root) if cfg.get('save_root', None) else self.data_root.parent / 'scan_data' self.num_workers = cfg.num_workers self.inst2label_path = self.save_root / 'scan_data' / 'instance_id_to_label' self.pcd_path = self.save_root / 'scan_data' / 'pcd_with_global_alignment' self.segm_path = self.save_root / 'scan_data' / 'segm' self.obj_path = self.save_root / 'scan_data' / 'obj' self.sp_path = self.save_root / 'scan_data' / 'super_points' self.output = cfg.output self.setup_directories() def setup_directories(self): if self.check_key(self.output.pcd): self.inst2label_path.mkdir(parents=True, exist_ok=True) self.pcd_path.mkdir(parents=True, exist_ok=True) def log_starting_info(self, scan_len, e=''): print('='*50) print(f'Preprocessing in {self.__class__.__name__} with {scan_len} scans') o = [str(i) for i in self.output if i] assert len(o) > 0, 'Please specify at least one output type' print(f"Output: {', '.join(o)}") if len(e) > 0: print(e) print('='*50) @staticmethod def check_key(key): exist = key is not None if not exist: return False if isinstance(key, bool): enabled = key elif isinstance(key, dict): enabled = key.get('enabled', True) elif hasattr(key, 'enabled'): enabled = key.get('enabled') else: enabled = True return enabled ================================================ FILE: 
preprocess/multiscan.py ================================================ import re import json from glob import glob from omegaconf import OmegaConf from joblib import Parallel, delayed, parallel_backend import torch from plyfile import PlyData import numpy as np import pandas as pd from tqdm import tqdm from preprocess.build import ProcessorBase from preprocess.utils.label_convert import MULTISCAN_SCANNET as label_convert from preprocess.utils.constant import * class MultiScanProcessor(ProcessorBase): def record_splits(self, scan_ids, ratio=0.8): split_dir = self.save_root / 'split' split_dir.mkdir(exist_ok=True) if (split_dir / 'train_split.txt').exists() and (split_dir / 'val_split.txt').exists(): return scan_len = len(scan_ids) split = { 'train': [], 'val': []} cur_split = 'train' for scan_id in tqdm(sorted(scan_ids)): split[cur_split].append(scan_id) if len(split['train']) > ratio*scan_len: cur_split = 'val' for _s, _c in split.items(): with open(split_dir / f'{_s}_split.txt', 'w', encoding='utf-8') as fp: fp.write('\n'.join(_c)) def read_all_scans(self): scan_paths = glob(str(self.data_root) + '/*') scans_df = [] for scan_path in scan_paths: scan_id = re.findall(r"scene\_[0-9]{5}\_[0-9]{2}", scan_path)[0] scene_id = '_'.join(scan_id.split('_')[:-1]) row = pd.DataFrame([[scene_id, scan_id, scan_path]], columns=['sceneId', 'scanId', 'scanPath']) scans_df.append(row) scans_df = pd.concat(scans_df) return scans_df def process_point_cloud(self, scan_id, plydata, annotations): inst_to_label = {} _x = np.asarray(plydata['vertex']['x']) _y = np.asarray(plydata['vertex']['y']) _z = np.asarray(plydata['vertex']['z']) _nx = np.asarray(plydata['vertex']['nx']) _ny = np.asarray(plydata['vertex']['ny']) _nz = np.asarray(plydata['vertex']['nz']) _red = plydata['vertex']['red'].astype('float64') _green = plydata['vertex']['green'].astype('float64') _blue = plydata['vertex']['blue'].astype('float64') vertices = np.column_stack((_x, _y, _z)) vertex_colors = np.column_stack((_red, _green, _blue)) vertex_instance = np.zeros((vertices.shape[0])) triangles = np.vstack(plydata['face'].data['vertex_indices']) object_ids = plydata['face'].data['objectId'] part_ids = plydata['face'].data['partId'] semseg_df = pd.DataFrame({'objectId': object_ids, 'partId': part_ids}) df = self.annotations_to_dataframe_obj(annotations) for _, row in df.iterrows(): object_id = row['objectId'] assert object_id > 0, f"object id should be greater than 0, but got {object_id}" object_label = row['objectLabel'].split('.')[0] object_label_sn607 = label_convert[object_label] condition1 = semseg_df['objectId'] == object_id tri_indices = semseg_df[condition1].index.values object_vertices = np.unique(triangles[tri_indices]) vertex_instance[object_vertices] = object_id inst_to_label[object_id] = object_label_sn607 if np.max(vertex_colors) <= 1: vertex_colors = vertex_colors * 255.0 center_points = np.mean(vertices, axis=0) center_points[2] = np.min(vertices[:, 2]) vertices = vertices - center_points assert vertex_colors.shape == vertices.shape assert vertex_colors.shape[0] == vertex_instance.shape[0] if self.check_key(self.output.pcd): torch.save(inst_to_label, self.inst2label_path / f"{scan_id}.pth") torch.save((vertices, vertex_colors, vertex_instance), self.pcd_path / f"{scan_id}.pth") @staticmethod def annotations_to_dataframe_obj(annotations): objects = annotations['objects'] df_list = [] for obj in objects: object_id = obj['objectId'] object_label = obj['label'] df_row = pd.DataFrame( [[object_id, object_label]], columns=['objectId', 
'objectLabel'] ) df_list.append(df_row) df = pd.concat(df_list) return df def scene_proc(self, scan_id): data_root = self.data_root / scan_id plydata = PlyData.read(data_root / f'{scan_id}.ply') with open((data_root / f'{scan_id}.annotations.json'), "r", encoding='utf-8') as f: annotations = json.load(f) # process point cloud self.process_point_cloud(scan_id, plydata, annotations) def process_scans(self): scans_df = self.read_all_scans() scan_ids = scans_df['scanId'].unique() self.log_starting_info(len(scan_ids)) if self.num_workers > 1: with parallel_backend('multiprocessing', n_jobs=self.num_workers): Parallel()(delayed(self.scene_proc)(scan_id) for scan_id in tqdm(scan_ids)) else: for scan_id in tqdm(scan_ids): print(scan_id) self.scene_proc(scan_id) if __name__ == '__main__': cfg = OmegaConf.create({ 'data_root': '/path/to/MultiScan', 'save_root': '/output/path/to/MultiScan', 'num_workers': 1, 'output': { 'pcd': True, } }) processor = MultiScanProcessor(cfg) processor.process_scans() ================================================ FILE: preprocess/rscan.py ================================================ import json from glob import glob from omegaconf import OmegaConf from joblib import Parallel, delayed, parallel_backend import torch import numpy as np import trimesh import open3d as o3d from tqdm import tqdm from preprocess.build import ProcessorBase from preprocess.utils.label_convert import RSCAN_SCANNET as label_convert from preprocess.utils.align_utils import compute_box_3d, calc_align_matrix, rotate_z_axis_by_degrees from preprocess.utils.constant import * class RScanProcessor(ProcessorBase): def record_splits(self, scan_ids, ratio=0.8): split_dir = self.save_root / 'split' split_dir.mkdir(exist_ok=True) if (split_dir / 'train_split.txt').exists() and (split_dir / 'val_split.txt').exists(): return scan_len = len(scan_ids) split = { 'train': [], 'val': []} cur_split = 'train' for scan_id in tqdm(sorted(scan_ids)): split[cur_split].append(scan_id) if len(split['train']) > ratio*scan_len: cur_split = 'val' for _s, _c in split.items(): with open(split_dir / f'{_s}_split.txt', 'w', encoding='utf-8') as fp: fp.write('\n'.join(_c)) def read_all_scans(self): scan_paths = glob(str(self.data_root) + '/*') scan_ids = [path.split('/')[-1] for path in scan_paths] return scan_ids def process_point_cloud(self, scan_id, plydata, annotations): plylabel, segments, aggregation = annotations vertices = plydata.vertices vertex_colors = trimesh.visual.uv_to_color(plydata.visual.uv, plydata.visual.material.image) vertex_colors = vertex_colors[:, :3] / 255.0 none_list = list() seg_to_inst = {} # segment id to object id inst_to_label = {} # object id to label name seg_indices = segments['segIndices'] seg_group = aggregation['segGroups'] bbox_list = [] for i, _ in enumerate(seg_group): if seg_group[i]['label'] not in label_convert: none_list.append(seg_group[i]['label']) continue inst_to_label[seg_group[i]['id']] = label_convert[seg_group[i]['label']] rotation = np.array(seg_group[i]["obb"]["normalizedAxes"]).reshape(3, 3) transform = np.array(seg_group[i]["obb"]["centroid"]).reshape(-1, 3) scale = np.array(seg_group[i]["obb"]["axesLengths"]).reshape(-1, 3) trns = np.eye(4) trns[0:3, 3] = transform trns[0:3, 0:3] = rotation.T box3d = compute_box_3d(scale.reshape(3).tolist(), transform, rotation) bbox_list.append(box3d) for j in seg_group[i]['segments']: seg_to_inst[j] = seg_group[i]['id'] assert seg_group[i]['id'] == seg_group[i]['objectId'] assert seg_group[i]['id'] > 0 query_points = vertices pcd = 
o3d.geometry.PointCloud() pcd.points = o3d.utility.Vector3dVector(np.array(plylabel.vertices, dtype=np.float64)) tree = o3d.geometry.KDTreeFlann(pcd) out_instance = [] for i, _ in enumerate(query_points): point = query_points[i] [k, idx, distance] = tree.search_radius_vector_3d(point,0.1) if k == 0: out_instance.append(-1) else: nn_idx = idx[0] if seg_indices[nn_idx] not in seg_to_inst.keys(): out_instance.append(-1) else: out_instance.append(seg_to_inst[seg_indices[nn_idx]]) # alignment: axis-aligned rotation align_angle = calc_align_matrix(bbox_list) vertices = rotate_z_axis_by_degrees(np.array(vertices), align_angle) # alignment: color range if np.max(vertex_colors) <= 1: vertex_colors = vertex_colors * 255.0 # alignment: translation center_points = np.mean(vertices, axis=0) center_points[2] = np.min(vertices[:, 2]) vertices= vertices - center_points vertex_instance = np.array(out_instance) assert vertex_colors.shape == vertices.shape assert vertex_colors.shape[0] == vertex_instance.shape[0] if self.check_key(self.output.pcd): torch.save(inst_to_label, self.inst2label_path / f"{scan_id}.pth") torch.save((vertices, vertex_colors, vertex_instance), self.pcd_path / f"{scan_id}.pth") np.save(self.pcd_path / f"{scan_id}_align_angle.npy", align_angle) def scene_proc(self, scan_id): data_root = self.data_root / scan_id plydata = trimesh.load(data_root / 'mesh.refined.v2.obj', process=False) if not (data_root / 'labels.instances.annotated.v2.ply').exists(): return plylabel = trimesh.load(data_root / 'labels.instances.annotated.v2.ply', process=False) with open((data_root / 'mesh.refined.0.010000.segs.v2.json'), "r", encoding='utf-8') as f: segments = json.load(f) with open((data_root / 'semseg.v2.json'), "r", encoding='utf-8') as f: aggregation = json.load(f) # process point cloud self.process_point_cloud(scan_id, plydata, (plylabel, segments, aggregation)) def process_scans(self): scan_ids = self.read_all_scans() self.log_starting_info(len(scan_ids)) if self.num_workers > 1: with parallel_backend('multiprocessing', n_jobs=self.num_workers): Parallel()(delayed(self.scene_proc)(scan_id) for scan_id in tqdm(scan_ids)) else: for scan_id in tqdm(scan_ids): self.scene_proc(scan_id) if __name__ == '__main__': cfg = OmegaConf.create({ 'data_root': '/path/to/3RScan', 'save_root': '/output/path/to/3RScan', 'num_workers': 1, 'output': { 'pcd': True, } }) processor = RScanProcessor(cfg) processor.process_scans() ================================================ FILE: preprocess/sceneverse2hmsemantic.py ================================================ import os import json from joblib import Parallel, delayed, parallel_backend from glob import glob from tqdm import tqdm import numpy as np import argparse def load_semantic_anno(semantic_txt): semantic_color = [] obj_name_list = [] color_2_name = {} color_2_id = {} with open(semantic_txt) as f: lines = f.readlines()[1:] for line in lines: obj_id = int(line.split(',')[0]) color_str = line.split(',')[1] if len(color_str) != 6: color_str = '0' * (6 - len(color_str)) + color_str r = int(color_str[0:2], 16) g = int(color_str[2:4], 16) b = int(color_str[4:6], 16) obj_name = line.split(',')[2][1:-1] obj_name_list.append(obj_name) rgb_value = np.array([r, g, b], dtype=np.uint8).reshape(1, 3) semantic_color.append(rgb_value) color_2_name[(r, g, b)] = obj_name color_2_id[(r, g, b)] = obj_id return np.concatenate(semantic_color, axis=0), obj_name_list, color_2_name, color_2_id def scene_proc(scene_input): scene_name = scene_input.split('/')[-1] scene_uid = 
scene_name.split('-')[1] sem_dir = scene_input + '/' + scene_uid + '.semantic' print(scene_name) # load obj semantics anno semantic_anno_color, obj_name_list, color_2_name, color_2_id = load_semantic_anno(sem_dir+'.txt') tgt_id2obj_id = {} # obj assignment and export semantic_anno_set = set(list(zip(*(semantic_anno_color.T)))) for _i, sem in enumerate(tqdm(semantic_anno_set)): obj_name = color_2_name[(sem[0], sem[1], sem[2])] obj_id = color_2_id[(sem[0], sem[1], sem[2])] tgt_id2obj_id[_i+1] = (obj_id, obj_name) json.dump(tgt_id2obj_id, open(os.path.join(scene_input, 'tgt_id2obj_id.json'), 'w'), indent=4) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--data_root', type=str, default='./hm3d-train-annots', help='data root for hm-semantics data') args = parser.parse_args() scene_list = glob(args.data_root + '/*') with parallel_backend('multiprocessing', n_jobs=1): Parallel()(delayed(scene_proc)(scene) for scene in scene_list) ================================================ FILE: preprocess/ssg/README.md ================================================ ## Scene Graph Generation We have released the scripts to generate 3D scene graphs for the datasets released in SceneVerse. ### Example Usage Construct the following OmegaConf config and run ```python ssg_main.py``` ```python cfg = OmegaConf.create({ 'dataset': 'MultiScan', 'scene_path': 'path/to/SceneVerse', 'rels_save_path': './tmp', 'visualize': True, 'num_workers': 1, }) ``` Note that the current implementation of scene graph generation assumes a default viewing direction of "+y" from outside the 3D scan. Therefore, it can be adapted for situated understanding by manually presetting the position and viewing direction. ================================================ FILE: preprocess/ssg/relationships/camera.py ================================================ import numpy as np import ssg_utils as utils def getLinearEquation(p1x, p1y, p2x, p2y): sign = 1 a = p2y - p1y if a < 0: sign = -1 a = sign * a b = sign * (p1x - p2x) c = sign * (p1y * p2x - p1x * p2y) return [a, b, c] def cal_global_position(obj, floor, distance_rate=1.6): tgt_pos = obj.position room_pos = floor.position room_rect = floor.bottom_rect # center center_dis = utils.euclideanDistance(tgt_pos, room_pos, 2) if center_dis < distance_rate: return 'in the center' # corner for point in room_rect: if utils.euclideanDistance(tgt_pos, point, 2) < distance_rate: return 'in the corner' return None def cal_camera_relations(ObjNode_dict, camera_position, camera_view, inst_dict, floor_idx, fov=60): relationships = [] for obj_id in ObjNode_dict: if ObjNode_dict[obj_id].label == 'floor': continue # camera relation obj_position = ObjNode_dict[obj_id].position vector = obj_position - camera_position vector = vector / np.linalg.norm(vector) angle = utils.get_theta(vector, camera_view) a, b, c = getLinearEquation(camera_view[0]+camera_position[0], camera_view[1]+camera_position[1], camera_position[0], camera_position[1]) if abs(angle) < fov/2: rela = 'in front of' elif abs(angle) > 180 - fov/2: rela = 'behind' elif a*obj_position[0] + b*obj_position[1] + c > 0: rela = 'right' if camera_view[1] > 0 else 'left' else: rela = 'left' if camera_view[1] > 0 else 'right' relationships.append(['-1', obj_id, rela]) # global relation if inst_dict[ObjNode_dict[obj_id].label] > 1: rela = cal_global_position(ObjNode_dict[obj_id], ObjNode_dict[floor_idx]) if rela is not None: # print(ObjNode_dict[obj_id].label, rela) # ObjNode_dict[obj_id].display_obb_box() 
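# The (obj_id, obj_id) self-pair appended below marks a global description
# ("in the center" / "in the corner", relative to the floor node) rather than
# a pairwise relation; it is only emitted when the label is ambiguous, i.e.
# the scene contains more than one instance of it (the inst_dict check above).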
relationships.append([obj_id, obj_id, rela]) return relationships ================================================ FILE: preprocess/ssg/relationships/hanging.py ================================================ import ssg_utils as utils def cal_above_below_relationships(ObjNode_dict, src, scene_high): above_below_relationships = [] rect = src.bottom_rect src_max = src.z_max src_min = src.z_min src_pos = src.position for tgt_id in ObjNode_dict: tgt = ObjNode_dict[tgt_id] if tgt.label == 'floor': continue tgt_max = tgt.z_max tgt_min = tgt.z_min tgt_pos = tgt.position tgt_rect = tgt.bottom_rect if utils.euclideanDistance(tgt.position, src.position, 2) < scene_high * 0.85: # make sure in same room # above if src_min > tgt_max and ( utils.if_inPoly(rect, tgt_pos) or utils.if_inPoly(tgt_rect, src_pos) ): above_below_relationships.extend(utils.generate_relation(src.id, tgt_id, 'high')) return above_below_relationships def filter_labels(obj_label): no_hanging_labels = ['floor', 'table', 'chair', 'desk', 'bottle'] for l in no_hanging_labels: if l in obj_label: return False return True def cal_hanging_relationships(ObjNode_dict, no_supported_objs, camera_angle, scene_high, dataset='scannet'): hanging_relationships = [] for obj_id in ObjNode_dict: if obj_id not in no_supported_objs: obj = ObjNode_dict[obj_id] if not filter_labels(obj.label): continue desp = utils.generate_relation(obj.id, -2, 'hang') if 'tv' in obj.label: desp[2] = 'mounted on' if 'mirror' in obj.label: desp[2] = 'affixed to' hanging_relationships.append(desp) hanging_relationships.extend(cal_above_below_relationships(ObjNode_dict, obj, scene_high)) return hanging_relationships ================================================ FILE: preprocess/ssg/relationships/multi_objs.py ================================================ import numpy as np import networkx as nx import itertools import ssg_utils as utils def are_furniture_aligned(furniture1, furniture2, offset_threshold): x1, y1, z1 = furniture1['center'] x2, y2, z2 = furniture2['center'] h1 = furniture1['size'] h2 = furniture2['size'] rect1 = furniture1['rect'] rect2 = furniture2['rect'] # x_offset x_offset = abs(x1 - x2) # y_offset y_offset = abs(y1 - y2) # z_offset z_offset = abs(z1 - z2) # volume volume_diff = abs(utils.get_Poly_Area(rect1) - utils.get_Poly_Area(rect2)) if volume_diff > offset_threshold: return False if z_offset > offset_threshold: return False if x_offset > offset_threshold and y_offset > offset_threshold: return False if x_offset < offset_threshold: return 'x' if y_offset < offset_threshold: return 'y' def find_aligned_furniture(furniture_list, ObjNode_dict, offset_threshold): aligned_furniture = [] for i, object_id1 in enumerate(furniture_list): obj1 = ObjNode_dict[object_id1] furniture1 = {'center': np.array(obj1.position), 'size': obj1.z_max - obj1.z_min, 'rect': obj1.bottom_rect} for j, object_id2 in enumerate(furniture_list[i+1:]): obj2 = ObjNode_dict[object_id2] furniture2 = {'center': np.array(obj2.position), 'size': obj2.z_max - obj2.z_min, 'rect': obj2.bottom_rect} is_aligned = are_furniture_aligned(furniture1, furniture2, offset_threshold) if is_aligned: aligned_group = [obj1.id, obj2.id, is_aligned] aligned_furniture.append(aligned_group) aligned_furniture_merge = furniture_merge_lists(aligned_furniture) return aligned_furniture_merge def furniture_merge_lists(lists): merged_lists = [] x_list = [lst[:2] for lst 
in lists if 'x' in lst] y_list = [lst[:2] for lst in lists if 'y' in lst] merged_x_list = merge_sublists(x_list) merged_y_list = merge_sublists(y_list) merged_lists.extend(merged_x_list) merged_lists.extend(merged_y_list) return merged_lists def merge_sublists(L): length = len(L) for i in range(1, length): for j in range(i): if 0 in L[i] or 0 in L[j]: continue x = set(L[i]).union(set(L[j])) y = len(L[i]) + len(L[j]) if len(x) < y: L[i] = list(x) L[j] = [0] return [i for i in L if 0 not in i] def find_middle_furniture (proximity_relations, ObjNode_dict): # in the middle of middle_relationships = [] G = nx.DiGraph() for (src, tgt, rel) in proximity_relations: G.add_edge(src, tgt, label=rel) edage_dict = G.edges.data()._adjdict for src_id in ObjNode_dict: if src_id not in edage_dict: continue if ObjNode_dict[src_id].label == 'floor' :continue neighbors = edage_dict[src_id] tgt_ids = list(neighbors.keys()) combinations = list(itertools.combinations(tgt_ids, 2)) for group in combinations: idx1, idx2 = group if 'near' in neighbors[idx1]['label'] and 'near' in neighbors[idx2]['label']: direction1 = int(neighbors[idx1]['label'].split(' ')[0]) direction2 = int(neighbors[idx2]['label'].split(' ')[0]) if abs(direction1 - direction2) == 6: middle_relationships.append([[src_id,idx1,idx2], 'in the middle of']) return middle_relationships if __name__ == '__main__': # UnitTest lists = [['26', '36', 'x'], ['26', '30', 'x'], ['29', '28', 'y'], ['29', '30', 'y'], ['28', '30', 'y'], ['28', '33', 'x'], ['35', '36', 'y'], ['35', '32', 'y'], ['35', '33', 'y'], ['31', '37', 'x'], ['2', '4', 'y'], ['2', '3', 'y'], ['34', '32', 'y'], ['34', '33', 'y'], ['37', '3', 'x'], ['36', '30', 'x'], ['4', '3', 'y'], ['32', '33', 'y']] output = furniture_merge_lists(lists) print(output) ================================================ FILE: preprocess/ssg/relationships/proximity.py ================================================ import numpy as np import itertools import ssg_utils as utils def get_direction(src_obj, tgt_obj): sx, sy = src_obj tx, ty = tgt_obj y = np.array((tx - sx, ty - sy)) y = y / np.linalg.norm(y) angle_d = utils.get_theta(y, [1, 0]) direction = round(angle_d / 30) if ty > sy : # tgt is up if direction == 0: return "3" elif direction == 1: return "2" elif direction == 2: return "1" elif direction == 3: return "12" elif direction == 4: return "11" elif direction == 5: return "10" elif direction == 6: return "9" else: if direction == 0: return "3" elif direction == 1: return "4" elif direction == 2: return "5" elif direction == 3: return "6" elif direction == 4: return "7" elif direction == 5: return "8" elif direction == 6: return "9" def get_oppo_direction(direction): if direction in ['2', '3', '4']: return 'to the left of' elif direction in ['8', '9', '10']: return 'to the right of' elif direction in ['11','12','1']: return 'behind' else: return 'in front of' def get_space_relations(src, tgt): overlap_point = 0 tgt_rect = tgt.bottom_rect for point in tgt_rect: if utils.if_inPoly(src.bottom_rect, point): # have overlap overlap_point += 1 return overlap_point def get_distance(src, tgt): dis_of_center = utils.euclideanDistance(src.position[:2], tgt.position[:2], 2) src_w = utils.euclideanDistance(src.position[:2], src.bottom_rect[0][:2], 2) tgt_w = utils.euclideanDistance(tgt.position[:2], tgt.bottom_rect[0][:2], 2) return dis_of_center > 1.5 * (src_w + tgt_w) def cal_proximity_relationships(neighbor_objs_id, camera_angle, ObjNode_dict, scene_high): proximity_relations = [] relations = '' 
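# --- Illustrative sketch (not part of the original sources) ---
# get_direction above buckets the planar offset between two object centers into
# twelve 30-degree sectors named after clock hours: "3" is the +x direction,
# "12" is +y, and the two ty-vs-sy branches mirror the upper and lower
# half-planes. A minimal standalone equivalent under the same convention
# (clock_direction is a hypothetical name; pairs with identical centers are
# assumed to be skipped by the caller, as cal_proximity_relationships does):
import math

def clock_direction(src_xy, tgt_xy):
    """Return the clock-hour string ('1'..'12') of tgt as seen from src."""
    dx = tgt_xy[0] - src_xy[0]
    dy = tgt_xy[1] - src_xy[1]
    theta = math.degrees(math.acos(dx / math.hypot(dx, dy)))  # unsigned angle to +x, in [0, 180]
    sector = round(theta / 30)                                # one sector per 30 degrees, 0..6
    hour = (3 - sector) % 12 if dy > 0 else 3 + sector        # mirror for the lower half-plane
    return str(hour) if hour != 0 else '12'

assert clock_direction((0, 0), (1, 0)) == '3'    # due +x -> 3 o'clock
assert clock_direction((0, 0), (0, 1)) == '12'   # due +y -> 12 o'clock
assert clock_direction((0, 0), (0, -1)) == '6'   # due -y -> 6 o'clock
# --- End of sketch ---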
neighbor_objs_id_list = [i for i in range(len(neighbor_objs_id))] combinations = list(itertools.combinations(neighbor_objs_id_list, 2)) for combination in combinations: src_idx, tgt_idx = combination src = neighbor_objs_id[src_idx] tgt = neighbor_objs_id[tgt_idx] if ObjNode_dict[src].room_id != ObjNode_dict[tgt].room_id: continue # is overlap overlap_points = get_space_relations(src=ObjNode_dict[src], tgt=ObjNode_dict[tgt]) if overlap_points > 0: # built in if overlap_points >= 3: relations = 'under' # close to else: relations = 'close to' proximity_relations.append(utils.generate_relation(ObjNode_dict[src].id, ObjNode_dict[tgt].id, relations)) proximity_relations.append(utils.generate_relation(ObjNode_dict[tgt].id, ObjNode_dict[src].id, relations)) else: # direction src_obj_center = ObjNode_dict[src].position tgt_obj_center = ObjNode_dict[tgt].position src_obj_center_new = utils.cw_rotate(src_obj_center, camera_angle) tgt_obj_center_new = utils.cw_rotate(tgt_obj_center, camera_angle) if src_obj_center_new == tgt_obj_center_new: print('src_obj_center_new == tgt_obj_center_new ', ObjNode_dict[src].id, ObjNode_dict[tgt].id) continue # skip the degenerate pair instead of aborting all remaining pairs direction = get_direction(src_obj_center_new, tgt_obj_center_new) oppo_direction = get_oppo_direction(direction) if get_distance(src=ObjNode_dict[src], tgt=ObjNode_dict[tgt]): relations = direction + " o'clock direction far from" else: relations = direction + " o'clock direction near" proximity_relations.append([ObjNode_dict[tgt].id, ObjNode_dict[src].id, relations]) if oppo_direction is not None: proximity_relations.append([ObjNode_dict[src].id, ObjNode_dict[tgt].id, oppo_direction]) return proximity_relations ================================================ FILE: preprocess/ssg/relationships/support.py ================================================ from ssg_data.dictionary import always_supported, hanging import ssg_utils as utils def is_supported(target_obj, obj, camera_angle, radius_range=0.1, threshold_of_z_rate=0.8): z_min = obj.z_min z_max = obj.z_max tz_max = target_obj.z_max tz_min = target_obj.z_min # overlap of z diff_z = z_min - tz_max height = z_max - z_min z_rate = abs(diff_z) / height # the supporter's footprint must be larger if not utils.get_Poly_Area(target_obj.bottom_rect[:, 0:2]) > utils.get_Poly_Area(obj.bottom_rect[:, 0:2]): return False if target_obj.label == 'floor': if not z_min < tz_max: return False else: # must be higher # if tz_max > z_max: # return False if z_min > (tz_max*0.05 if tz_max > 0 else tz_max*0.95): # floating return False if z_min < tz_min: return False if not diff_z < height*0.2: return False # must be centered center = obj.position if not utils.if_inPoly(target_obj.bottom_rect, center): return False if target_obj.label == 'floor': return 'support_express' else: if z_rate < threshold_of_z_rate: return 'support_express' elif z_rate >= threshold_of_z_rate and z_rate < 0.95: return 'embed_express' else: return 'inside_express' def optimaze_support_loops(support_relations_dict): relationships = [] for obj_id, tgts in support_relations_dict.items(): if len(tgts) > 1: positions = [tgt.position[2] for tgt in tgts] highest_tgt_index = positions.index(max(positions)) highest_tgt = tgts[highest_tgt_index] relationships.append(utils.generate_relation(highest_tgt.id, obj_id, 'support')) else: relationships.append(utils.generate_relation(tgts[0].id, obj_id, 'support')) return relationships def cal_support_relations(ObjNode_list, camera_angle): support_relations_dict = {} embedded_relationships = [] hanging_objs = {} for target_obj_id in ObjNode_list:
target_obj = ObjNode_list[target_obj_id] for obj_id in ObjNode_list: obj = ObjNode_list[obj_id] if target_obj.id == obj.id: continue if target_obj.label in always_supported or obj.label in always_supported: continue if target_obj.label in hanging or obj.label in hanging: continue is_support = is_supported(target_obj, obj, camera_angle) if is_support: if is_support in ['embed_express', 'inside_express']: embedded_relationships.append(utils.generate_relation(target_obj.id, obj.id, is_support)) else: if obj.id not in support_relations_dict: support_relations_dict[obj.id] = [target_obj] else: support_relations_dict[obj.id].append(target_obj) hanging_objs[obj.id] = 1 return optimaze_support_loops(support_relations_dict), embedded_relationships, hanging_objs ================================================ FILE: preprocess/ssg/ssg_data/dictionary.py ================================================ hanging = ['window', 'curtain', 'curtains', 'shower curtain', 'curtain rod', 'shower curtain rod'] always_supported = ['wall', 'wall hanging', 'bath walls', 'closet wall', 'closet walls', 'closet wall', 'closet walls', 'door wall', 'pantry wall', 'pantry walls', 'shower wall', 'shower walls', 'door','sliding door', 'sliding wood door', 'bathroom stall door', 'doors', 'door frame'] component = { 'closet' : ["closet ceiling" ,"closet door","closet doorframe","closet doors" , "closet rod" ,"closet shelf" ], 'cabinet': ['cabinet door', 'cabinet doors'], } added_hanging = { 'curtain rod': ['curtain', ], 'shower curtain rod': ['shower curtain'], } # word diversity support_express = ['support'] opp_support_express = ['resting on', 'placed on', 'on', 'supported by', 'on the top of'] embed_express = [''] opp_embed_express = ['embedded into', 'placed within the area of'] inside_express = [''] opp_inside_express = ['inside', 'placed within the area of'] hanging_express = ['hanging on', 'hung on'] close_express = ['close to', 'adjacent to', 'beside', 'next to'] under_express = ['above'] above_express = ['above', 'higher than'] below_express = ['below', 'lower than'] must_support_scannetpp = ['chair', 'sofa', 'table', 'bookshelf', 'standing lamp', 'shoe', 'backpack', 'bag', 'mat', 'barbell','dumbbell', 'trash bin', 'basket', 'tv stand', 'tablet', 'mop', 'vacum cleaner'] ================================================ FILE: preprocess/ssg/ssg_data/script/ObjNode.py ================================================ import networkx as nx import trimesh import matplotlib.pyplot as plt import numpy as np import pyvista as pv class ObjNode(object): def __init__(self, id=None, label=None, mesh=None, position=None, size=None, children=[], room_id = None,dataset='scannet'): self.id = id self.label = label self.obj_mesh = mesh self.size = size self.position = position self.children = children self.room_id = room_id self.align_matrix, self.position, self.z_min, self.z_max, self.bottom_rect, self.top_rect = self.get_object_information(dataset) def __str__(self): return "[{}:{},{},{}]".format(self.id, self.label, self.position, self.angle) def get_object_information(self, dataset): position = self.position # - bias axis_align_matrix = None x_min = position[0] - self.size[0] / 2 x_max = position[0] + self.size[0] / 2 y_min = position[1] - self.size[1] / 2 y_max = position[1] + self.size[1] / 2 z_min = position[2] - self.size[2] / 2 z_max = position[2] + self.size[2] / 2 top_vertics = np.array([[x_min, y_min, z_min], [x_max, y_min, z_min], [x_max, y_max, z_min], [x_min, y_max, z_min]]) bottom_vertics = np.array([[x_min, y_min, 
z_max], [x_max, y_min, z_max], [x_max, y_max, z_max], [x_min, y_max, z_max]]) return axis_align_matrix, position, z_min, z_max, bottom_vertics, top_vertics def display_obb_box(self, scene_visible = True): axis_align_matrix = self.align_matrix obj_mesh = trimesh.load(self.obj_mesh) scene_ply = pv.read(self.scan_ply) # rotate to axis align obj_mesh.apply_transform(axis_align_matrix) if self.label == 'floor': scene_mesh = trimesh.load(self.scan_mesh) scene_mesh.apply_transform(axis_align_matrix) # draw aabb tgt_points = np.array(scene_mesh.bounding_box.as_outline().vertices) tgt_edges = np.array(scene_mesh.bounding_box.as_outline().vertex_nodes) tgt_points_new = [] for edge in tgt_edges: tgt_points_new.append(tgt_points[edge[0]]) tgt_points_new.append(tgt_points[edge[1]]) # show results plotter = pv.Plotter(off_screen=False) light = pv.Light(light_type='headlight', intensity=0.2) plotter.add_light(light) plotter.add_mesh(scene_ply.transform(axis_align_matrix), rgb=True) plotter.add_lines(np.array(tgt_points_new), color='red', width=3) else: # draw bbox tgt_points = np.array(obj_mesh.bounding_box_oriented.as_outline().vertices) tgt_edges = np.array(obj_mesh.bounding_box_oriented.as_outline().vertex_nodes) tgt_points_new = [] for edge in tgt_edges: tgt_points_new.append(tgt_points[edge[0]]) tgt_points_new.append(tgt_points[edge[1]]) # draw aabb aa_tgt_points = np.array(obj_mesh.bounding_box.as_outline().vertices) aa_tgt_edges = np.array(obj_mesh.bounding_box.as_outline().vertex_nodes) aa_tgt_points_new = [] for edge in aa_tgt_edges: aa_tgt_points_new.append(aa_tgt_points[edge[0]]) aa_tgt_points_new.append(aa_tgt_points[edge[1]]) # show results plotter = pv.Plotter(off_screen=False) light = pv.Light(light_type='headlight', intensity=0.2) plotter.add_light(light) plotter.add_mesh(scene_ply.transform(axis_align_matrix), rgb=True) plotter.add_lines(np.array(tgt_points_new), color='red', width=3) plotter.add_lines(np.array(aa_tgt_points_new), color='yellow', width=3) plotter.camera.zoom(1.2) plotter.show() if __name__ == '__main__': obj_sample = ObjNode(id=1, label='', size='', mesh='../../DataAnnotation/data/scannet_objs/scene0000_00/45/mesh.obj') G = nx.DiGraph() G.add_node(obj_sample.id, desc = 'here1') G.add_node(obj_sample.id+1, desc = 'here2') G.add_node(obj_sample.id +3, desc = 'here3') G.add_edge(1, 2, name ='support') G.add_edge(2, 1, name='support2') pos = nx.spring_layout(G) nx.draw(G,pos) node_labels = nx.get_node_attributes(G, 'desc') nx.draw_networkx_labels(G, pos, labels=node_labels) edge_labels = nx.get_edge_attributes(G, 'name') nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels) plt.show() ================================================ FILE: preprocess/ssg/ssg_data/ssg_visualize.py ================================================ import open3d as o3d import numpy as np import torch def vis_dataset(ObjNode_dict, relation, scene_path, scan_id, scene_center): coordinate = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.6, origin=[-0, -0, -0]) pcd_data = torch.load(scene_path / 'pcd_with_global_alignment' / f'{scan_id}.pth') points, colors, _ = pcd_data[0], pcd_data[1], pcd_data[-1] o3d_pcd = o3d.geometry.PointCloud() o3d_pcd.points = o3d.utility.Vector3dVector(points) o3d_pcd.colors = o3d.utility.Vector3dVector(colors/255.0) scene_show = [o3d_pcd] np.random.shuffle(relation) for rel in relation: if len(rel) == 3: if rel[1] == -2: src = ObjNode_dict[rel[0]] gt_o3d_box_src = o3d.geometry.OrientedBoundingBox(src.position + scene_center, np.eye(3, 3), src.size) 
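# --- Illustrative sketch (not part of the original sources) ---
# get_object_information above expands an axis-aligned box (center, size) into
# four-corner rectangles, and the relationship code (e.g. is_supported) then
# runs 2-D containment tests on such footprints through ssg_utils.if_inPoly,
# which wraps shapely. A minimal self-contained version of that pattern;
# footprint_corners and center_inside are hypothetical names:
import numpy as np
from shapely import geometry

def footprint_corners(center, size):
    """Four x-y corners of an axis-aligned box; z does not matter for 2-D tests."""
    cx, cy = center[0], center[1]
    hx, hy = size[0] / 2.0, size[1] / 2.0
    return np.array([[cx - hx, cy - hy], [cx + hx, cy - hy],
                     [cx + hx, cy + hy], [cx - hx, cy + hy]])

def center_inside(outer_corners, point_xy):
    """True if point_xy lies strictly inside the polygon (mirrors if_inPoly)."""
    polygon = geometry.Polygon([tuple(p) for p in outer_corners])
    return polygon.contains(geometry.Point(point_xy))

# A cup centered over a table footprint passes the "must be centered" check:
table = footprint_corners(center=(0.0, 0.0, 0.4), size=(1.2, 0.8, 0.8))
assert center_inside(table, (0.1, 0.2))
# --- End of sketch ---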
gt_o3d_box_src.color = [0, 1, 0] obj_label = f'''{src.label} {rel[2]}''' bbox_show_list = [gt_o3d_box_src] o3d.visualization.draw_geometries(scene_show + [coordinate] + bbox_show_list, window_name=obj_label) else: src = ObjNode_dict[rel[0]] tgt = ObjNode_dict[rel[1]] gt_o3d_box_src = o3d.geometry.OrientedBoundingBox(src.position + scene_center, np.eye(3, 3), src.size) gt_o3d_box_src.color = [0, 1, 0] gt_o3d_box_tgt = o3d.geometry.OrientedBoundingBox(tgt.position + scene_center, np.eye(3, 3), tgt.size) gt_o3d_box_tgt.color = [1, 0, 0] obj_label = f'''{tgt.label} - {rel[2]} - {src.label}''' bbox_show_list = [gt_o3d_box_src, gt_o3d_box_tgt] o3d.visualization.draw_geometries(scene_show + [coordinate] + bbox_show_list, window_name=obj_label) else: tgts = [ObjNode_dict[tgt] for tgt in rel[0]] gt_o3d_box_tgts = [o3d.geometry.OrientedBoundingBox(tgt.position + scene_center, np.eye(3, 3), tgt.size) for tgt in tgts] # gt_o3d_box_tgt.color = [1, 0, 0] obj_label = f'''{tgts[0].label} {rel[1]}''' bbox_show_list = gt_o3d_box_tgts o3d.visualization.draw_geometries(scene_show + [coordinate] + bbox_show_list, window_name=obj_label) ================================================ FILE: preprocess/ssg/ssg_main.py ================================================ import json import pickle from tqdm import tqdm from pathlib import Path from omegaconf import OmegaConf import torch import networkx as nx import numpy as np import ssg_utils as utils from ssg_data import dictionary from ssg_data.ssg_visualize import vis_dataset from ssg_data.script.ObjNode import ObjNode from relationships.support import cal_support_relations from relationships.proximity import cal_proximity_relationships from relationships.hanging import cal_hanging_relationships from relationships.multi_objs import find_aligned_furniture, find_middle_furniture def default_dump(obj): """Convert numpy classes to JSON serializable objects.""" if isinstance(obj, (np.integer, np.floating, np.bool_)): return obj.item() elif isinstance(obj, np.ndarray): return obj.tolist() else: return obj def convert_pc_to_box(obj_pc): xmin = np.min(obj_pc[:,0]) ymin = np.min(obj_pc[:,1]) zmin = np.min(obj_pc[:,2]) xmax = np.max(obj_pc[:,0]) ymax = np.max(obj_pc[:,1]) zmax = np.max(obj_pc[:,2]) center = [(xmin+xmax)/2, (ymin+ymax)/2, (zmin+zmax)/2] box_size = [xmax-xmin, ymax-ymin, zmax-zmin] return center, box_size def init_camera_view(): camera_view = [0, -1, 0] camera_pos = [0, 0, 0] camera_view = camera_view / np.linalg.norm(camera_view) if camera_view[0] < 0: camera_angle = -utils.get_theta(camera_view, [0, 1, 0]) else: camera_angle = utils.get_theta(camera_view, [0, 1, 0]) return camera_view, camera_pos, camera_angle def filter_bad_label(input_label): bad_label_list = ['ceiling', 'wall', 'door', 'doorframe', 'object'] for o in bad_label_list: if o in input_label: return False return True def get_obj_room_id(org_id): infos = org_id.split('|') if infos[1] == 'surface': return int(infos[2]) else: return int(infos[1]) def generate_object_info(save_root, scene_name): object_json_list = [] inst2label_path = save_root / 'instance_id_to_label' pcd_path = save_root / 'pcd_with_global_alignment' inst_to_label = torch.load(inst2label_path / f"{scene_name}.pth") pcd_data = torch.load(pcd_path / f'{scene_name}.pth') points, colors, instance_labels = pcd_data[0], pcd_data[1], pcd_data[-1] pcds = np.concatenate([points, colors], 1) x_max, y_max, z_max = points.max(axis=0) x_min, y_min, z_min = points.min(axis=0) obj_pcds = [] for i in
np.unique(instance_labels): if i < 0: continue mask = instance_labels == i # time consuming obj_pcds.append((pcds[mask], inst_to_label[int(i)], i)) for _, (obj, obj_label, i) in enumerate(obj_pcds): gt_center, gt_size = convert_pc_to_box(obj) object_json = { 'id': int(i), 'label': obj_label, 'position': gt_center, 'size': gt_size, 'mesh': None } object_json_list.append(object_json) # add scan_id object_json_string = { 'scan': scene_name, 'point_max': [x_max, y_max, z_max], 'point_min': [x_min, y_min, z_min], 'object_json_string': object_json_list, 'inst_to_label': inst_to_label, } return object_json_string def generate_ssg_data(dataset, scene_path, pre_load_path): ssg_data = {} pre_load_file_save_path = pre_load_path / (dataset + '.pkl') if pre_load_file_save_path.exists(): print('Using preprocessed scene data') with open(pre_load_file_save_path, 'rb') as f: ssg_data = pickle.load(f) else: print('Preprocessing scene data') scans = [s.stem for s in (scene_path / 'pcd_with_global_alignment').glob('*.pth')] scans.sort() for scan_id in tqdm(scans): object_json_string = generate_object_info(scene_path, scan_id) if object_json_string is not None: ssg_data[scan_id] = object_json_string with open(pre_load_file_save_path, 'wb') as f: pickle.dump(ssg_data, f) return ssg_data def main(cfg): cfg.rels_save_path.mkdir(parents=True, exist_ok=True) ssg_data = generate_ssg_data(cfg.dataset, cfg.scene_path, cfg.rels_save_path) scans_all = list(ssg_data.keys()) ### init camera ### camera_view, camera_pos, camera_angle = init_camera_view() for scan_id in scans_all: objects_save = {} relationship_save = {} inst_dict = {} print('Processing ', scan_id) objects_info = ssg_data[scan_id]['object_json_string'] inst_labels = ssg_data[scan_id]['inst_to_label'] # bad case if len(objects_info) == 0: continue # construct object graph G = nx.DiGraph() # create nodes ObjNode_dict = {} # log objects of the same category for inst in inst_labels: if inst_labels[inst] not in inst_dict: inst_dict[inst_labels[inst]] = 1 else: inst_dict[inst_labels[inst]] += 1 x_max, y_max, z_max = ssg_data[scan_id]['point_max'] x_min, y_min, z_min = ssg_data[scan_id]['point_min'] scene_center = np.array([(x_max + x_min) / 2, (y_max + y_min) / 2, (z_max + z_min) / 2]) # floor bad if z_max == z_min: z_max = z_min + 5 scene_high = z_max - z_min # generate object node for graph obj_z_min = 1000 floor_idx = -100 for obj in objects_info: if np.array(obj['size']).sum() == 0: continue if not filter_bad_label(obj['label']): continue if obj['label'] == 'floor': floor_idx = int(obj['id']) node = ObjNode(id=int(obj['id']), position=obj['position']-scene_center, label=obj['label'], mesh=obj['mesh'] if 'mesh' in obj else None, size=np.array(obj['size']), children=obj['children'] if 'children' in obj else None, room_id=get_obj_room_id (obj['id_org']) if 'id_org' in obj else None, dataset=cfg.dataset) if obj['position'][2] - obj['size'][2]/2 < obj_z_min: obj_z_min = obj['position'][2]-obj['size'][2]/2 obj['count'] = inst_dict[node.label] obj['caption'] = '' ObjNode_dict[int(obj['id'])] = node G.add_node(node.id, label=node.label) # added special nodes (wall camera) G.add_node(-1, label='CAMERA') G.add_node(-2, label='wall') # special node for floor if floor_idx == -100: G.add_node(-3, label='floor') fx, fy, fz = scene_center[0], scene_center[1], obj_z_min node = ObjNode(id=-3, position=np.array([fx, fy, fz]) - scene_center, label='floor', size=[(x_max-x_min)*1.2, (y_max-y_min)*1.2, (z_max-z_min)*0.1], dataset=cfg.dataset) ObjNode_dict[-3] = node floor_idx = -3 
else: fx, fy, fz = scene_center[0], scene_center[1], obj_z_min node_ = ObjNode_dict[floor_idx] if node_.size[2] > 0: node = ObjNode(id=floor_idx, position=np.array([fx, fy, fz]) - scene_center, label='floor', size=[max((x_max-x_min)*1.2, node_.size[0]), max((y_max-y_min)*1.2, node_.size[1]), node_.size[2]], dataset=cfg.dataset) else: node = ObjNode(id=floor_idx, position=np.array([fx, fy, fz]) - scene_center, label='floor', size=[max((x_max-x_min)*1.2, node_.size[0]), max((y_max-y_min)*1.2, node_.size[1]), (z_max-z_min)*0.1], dataset=cfg.dataset) ObjNode_dict[floor_idx] = node # support / embedded relationships if cfg.dataset.lower() in ['procthor']: support_relations = [] embedded_relations = [] hanging_objs_dict = {} for src_id, _ in ObjNode_dict.items(): src_obj = ObjNode_dict[src_id] if src_obj.z_min <= ObjNode_dict[floor_idx].z_max and src_obj.id != floor_idx: support_relations.append(utils.generate_relation(floor_idx, src_id, 'support')) hanging_objs_dict[src_id] = 1 if src_obj.children: for child in src_obj.children: hanging_objs_dict[child] = 1 if child not in ObjNode_dict: continue if ObjNode_dict[child].z_max < src_obj.z_max: embedded_relations.append(utils.generate_relation(src_id, child, 'inside_express')) else: support_relations.append(utils.generate_relation(src_id, child, 'support')) else: support_relations, embedded_relations, hanging_objs_dict = cal_support_relations(ObjNode_dict, camera_angle) for rela in support_relations: target_obj_id, obj_id, _ = rela G.add_edge(target_obj_id, obj_id, label='support') # hanging relationships hanging_relationships = cal_hanging_relationships(ObjNode_dict, hanging_objs_dict, camera_angle, scene_high, dataset=cfg.dataset) # iterate over the graph to compute spatial relationships proximity_relations = [] for node in G: neighbor = dict(nx.bfs_successors(G, source=node, depth_limit=1)) if len(neighbor[node]) > 1: neighbor_objs = neighbor[node] proximity = cal_proximity_relationships(neighbor_objs, camera_angle, ObjNode_dict, scene_high) proximity_relations += proximity # add some special relations and opposite support relationships oppo_support_relations = [] objects_rels = support_relations + embedded_relations + hanging_relationships for idx, rels in enumerate(objects_rels): src, tgt, rela = rels if rela == 'support': oppo_support_relations.append(utils.generate_relation(src, tgt, 'oppo_support')) if src == -2 or tgt == -2: continue src_label = ObjNode_dict[src].label tgt_label = ObjNode_dict[tgt].label if src_label in dictionary.added_hanging and tgt_label in dictionary.added_hanging[src_label]: objects_rels[idx][2] = 'hanging' if tgt_label in dictionary.added_hanging and src_label in dictionary.added_hanging[tgt_label]: objects_rels[idx][2] = 'hanging' # multi objects multi_objs_relationships = [] # add aligned relationships furniture_list = list(ObjNode_dict.keys()) aligned_furniture = find_aligned_furniture(furniture_list, ObjNode_dict, 0.065) for _, aligned_furni in enumerate(aligned_furniture): multi_objs_relationships.append([aligned_furni, 'Aligned']) # add "in the middle of" relationships middle_relationships = find_middle_furniture(proximity_relations, ObjNode_dict) # output json relationships_json_string = { 'scan': scan_id, 'camera_view': camera_view, 'camera_position': camera_pos, 'relationships': objects_rels + proximity_relations + oppo_support_relations, 'multi_objs_relationships': multi_objs_relationships + middle_relationships, } np.random.shuffle(objects_rels) # visualize scene if cfg.visualize:
vis_dataset(ObjNode_dict=ObjNode_dict, relation=proximity_relations, scene_path=cfg.scene_path, scan_id=scan_id, scene_center=scene_center) relationship_save[scan_id] = relationships_json_string objects_save[scan_id] = {"objects_info": objects_info, "inst_to_label" : inst_labels} print ('==> DONE') print('SCENE ', scan_id) print('OBJECTS ', len(ObjNode_dict)) scan_path = cfg.rels_save_path / scan_id scan_path.mkdir(parents=True, exist_ok=True) print('SAVE', scan_path) with (scan_path / 'relationships.json').open('w') as file: json.dump(relationship_save, file, default=default_dump) with (scan_path / 'objects.json').open('w') as file: json.dump(objects_save, file, default=default_dump) print ('=====================\n') if __name__ == '__main__': cfg = OmegaConf.create({ 'dataset': 'dataset', 'scene_path': '/path/to/dir', 'rels_save_path': '/output/path/to/dir', 'visualize': True, 'num_workers': 1, }) cfg.scene_path = Path(cfg.scene_path) / cfg.dataset / 'scan_data' cfg.rels_save_path = Path(cfg.rels_save_path) / cfg.dataset main(cfg) ================================================ FILE: preprocess/ssg/ssg_utils.py ================================================ import trimesh import math from shapely import geometry import os import numpy as np import pyvista as pv from ssg_data.dictionary import * import random import open3d as o3d def cw_rotate(point, ang): x,y,_ = point ang = math.radians(ang) new_x = round(x * math.cos(ang) - y * math.sin(ang), 5) new_y = round(x * math.sin(ang) + y * math.cos(ang), 5) return new_x, new_y def euclideanDistance(instance1, instance2, dimension): distance = 0 for i in range(dimension): distance += (instance1[i] - instance2[i])**2 return math.sqrt(distance) def if_inPoly(polygon, Points): line = geometry.LineString(polygon) point = geometry.Point(Points) polygon = geometry.Polygon(line) return polygon.contains(point) def get_Poly_Area(polygon): line = geometry.LineString(polygon) polygon = geometry.Polygon(line) return polygon.area def get_theta (x, y): x = np.array(x) y = np.array(y) l_x = np.sqrt(x.dot(x)) l_y = np.sqrt(y.dot(y)) dian = x.dot(y) cos_ = dian / (l_x * l_y) angle_hu = np.arccos(cos_) angle_d = angle_hu * 180 / np.pi return angle_d def generate_relation(src, tgt, express): if 'oppo_support' in express: oppo_rels = [tgt, src, random.choice(opp_support_express)] return oppo_rels elif 'support' in express: rels = [src, tgt, random.choice(support_express)] return rels elif 'embed' in express: oppo_rels = [tgt, src, random.choice(opp_embed_express)] return oppo_rels elif 'inside' in express: oppo_rels = [tgt, src, random.choice(opp_inside_express)] return oppo_rels elif 'hang' in express: oppo_rels = [src, tgt, random.choice(hanging_express)] return oppo_rels elif 'under' in express: oppo_rels = [src, tgt, random.choice(under_express)] return oppo_rels elif 'close' in express: oppo_rels = [src, tgt, random.choice(close_express)] return oppo_rels elif 'high' in express: rels = [src, tgt, random.choice(above_express)] oppo_rels = [tgt, src, random.choice(below_express)] return [rels,oppo_rels] def visualize_relations(target_obj, obj, relationship, camera_angle, camera_position = np.array([0,0,0]), save = False): if save: render_bbox_pyvista(obj, target_obj, relationship, camera_angle, camera_position) else: axis_align_matrix = target_obj.align_matrix tgt_mesh = trimesh.load(target_obj.obj_mesh) src_mesh = trimesh.load(obj.obj_mesh) tgt_mesh.apply_transform(axis_align_matrix) src_mesh.apply_transform(axis_align_matrix) tgt_p = 
tgt_mesh.bounding_box.as_outline() tgt_p.entities[0].color = (255, 0, 0, 255) src_p = src_mesh.bounding_box.as_outline() src_p.entities[0].color = (255, 255, 0, 255) scene_mesh = trimesh.load_mesh(target_obj.scan_mesh) scene_mesh.apply_transform(axis_align_matrix) # draw line of two objects lines_of_center = [[np.array(target_obj.position), np.array(obj.position)]] p = trimesh.load_path(lines_of_center) # rotate from camera view camera_rotate = trimesh.transformations.rotation_matrix( np.deg2rad(camera_angle), [0,0,1], point=(0,0,0) ) scene_mesh.apply_transform(camera_rotate) tgt_p.apply_transform(camera_rotate) src_p.apply_transform(camera_rotate) p.apply_transform(camera_rotate) # draw camera center camera = trimesh.primitives.Sphere(radius=0.2, center=camera_position) camera.apply_transform(camera_rotate) Scene = trimesh.Scene() camera_rotate = trimesh.transformations.rotation_matrix( -20, [1,0,0], point=(0,0,0) ) Scene.add_geometry([scene_mesh, src_p, tgt_p, p]) Scene.apply_transform(camera_rotate) Scene.show() def visualize_relations_multi_objs(objs, relationship, item, camera_angle, camera_position = np.array([0,0,0]), save = False): # img save name save_img_name = '_'.join([relationship, objs[0].label]) + str(item) # load mesh scene_mesh = pv.read(objs[0].scan_ply) axis_align_matrix = objs[0].align_matrix tgt_meshs = [trimesh.load(obj.obj_mesh) for obj in objs] # show results plotter = pv.Plotter(off_screen=True) light = pv.Light(light_type='headlight', intensity=0.3) plotter.add_light(light) # draw camera camera_look_at = cw_rotate(camera_position+np.array([0,1,0]), -camera_angle) camera_look_at = np.array([camera_look_at[0], camera_look_at[1], 0]) # plotter.add_lines(np.array([camera_position, camera_look_at]), color='blue', width=3) mesh = pv.Arrow(start=camera_position, direction=camera_look_at) plotter.add_mesh(mesh) # added scene mesh plotter.add_mesh(scene_mesh.transform(axis_align_matrix), rgb=True) # rotate to axis align and added in to scene for tgt_mesh in tgt_meshs: tgt_mesh.apply_transform(axis_align_matrix) # draw bbox tgt_points = np.array(tgt_mesh.bounding_box.as_outline().vertices) tgt_edges = np.array(tgt_mesh.bounding_box.as_outline().vertex_nodes) tgt_points_new = [] for edge in tgt_edges: tgt_points_new.append(tgt_points[edge[0]]) tgt_points_new.append(tgt_points[edge[1]]) plotter.add_lines(np.array(tgt_points_new), color='yellow', width=3) plotter.add_point_labels( [np.array(obj.position) for obj in objs], [obj.label for obj in objs], margin=0, fill_shape=True, font_size=18, shape_color="black", point_color="red", text_color="white", always_visible=True, ) plotter.add_text( save_img_name, position='upper_right', color='Blue', shadow=True, font_size=19, ) plotter.camera_position = 'yz' plotter.camera.azimuth = 90 - camera_angle + 180 plotter.camera.elevation = 65 plotter.camera.zoom(1.2) plotter.show() def render_bbox_pyvista(tgt, src, relationship, camera_angle, camera_position): # img save name save_img_name = '_'.join([relationship, src.label, src.id, tgt.label, tgt.id]) # load mesh tgt_mesh = trimesh.load(tgt.obj_mesh) src_mesh = trimesh.load(src.obj_mesh) scene_mesh = pv.read(tgt.scan_ply) axis_align_matrix = tgt.align_matrix # rotate to axis align tgt_mesh.apply_transform(axis_align_matrix) src_mesh.apply_transform(axis_align_matrix) # draw bbox tgt_points = np.array(tgt_mesh.bounding_box.as_outline().vertices) tgt_edges = np.array(tgt_mesh.bounding_box.as_outline().vertex_nodes) tgt_points_new = [] for edge in tgt_edges: 
tgt_points_new.append(tgt_points[edge[0]]) tgt_points_new.append(tgt_points[edge[1]]) src_points = np.array(src_mesh.bounding_box.as_outline().vertices) src_edges = np.array(src_mesh.bounding_box.as_outline().vertex_nodes) src_points_new = [] for edge in src_edges: src_points_new.append(src_points[edge[0]]) src_points_new.append(src_points[edge[1]]) # show results plotter = pv.Plotter(off_screen=True) light = pv.Light(light_type='headlight', intensity=0.3) plotter.add_light(light) # draw camera camera_look_at = cw_rotate(camera_position+np.array([0,1,0]), -camera_angle) camera_look_at = np.array([camera_look_at[0], camera_look_at[1], 0]) # plotter.add_lines(np.array([camera_position, camera_look_at]), color='blue', width=3) mesh = pv.Arrow(start=camera_position, direction=camera_look_at) plotter.add_mesh(mesh) plotter.add_mesh(scene_mesh.transform(axis_align_matrix), rgb=True) plotter.add_lines(np.array([src.position, tgt.position]), color='red', width=3) plotter.add_lines(np.array(src_points_new), color='red', width=3) plotter.add_lines(np.array(tgt_points_new), color='yellow', width=3) # plotter.add_axes_at_origin() plotter.add_point_labels( [ src.position, tgt.position, camera_position ], [src.label, tgt.label, 'Camera View'], margin=0, fill_shape=True, font_size=18, shape_color="black", point_color="red", text_color="white", always_visible=True, ) plotter.add_text( save_img_name, position='upper_right', color='Blue', shadow=True, font_size=19, ) plotter.camera_position = 'yz' plotter.camera.azimuth = 90 - camera_angle + 180 plotter.camera.elevation = 65 plotter.camera.zoom(1.2) plotter.show() def visualize_camera_relations(ObjNode_dict, camera_relations, camera_position, camera_view, save = False): tgt = ObjNode_dict[camera_relations[0][1]] scene_mesh = trimesh.load(tgt.scan_mesh) axis_align_matrix = tgt.align_matrix objs_mesh = [] for rela in camera_relations: _, obj, desc = rela obj = ObjNode_dict[obj] src_mesh = trimesh.load(obj.obj_mesh) src_mesh.apply_transform(axis_align_matrix) src_p = src_mesh.bounding_box.as_outline() if desc == 'behind': src_p.entities[0].color = (0, 255, 0, 255) elif desc == 'in front of': src_p.entities[0].color = (255, 0, 0, 255) elif desc == 'left': src_p.entities[0].color = (0, 0, 255, 255) else: src_p.entities[0].color = (0, 255, 255, 255) objs_mesh.append (src_p) end_point = np.array(camera_position) + np.array(camera_view) # draw line of two objects lines_of_center = [[end_point, np.array(camera_position)],] p = trimesh.load_path(lines_of_center) scene_mesh.apply_transform(axis_align_matrix) # camera position camera_pos = trimesh.primitives.Sphere(radius=0.2, center=np.array(camera_position)) Scene = trimesh.Scene() Scene.add_geometry([scene_mesh, p, camera_pos]) Scene.add_geometry(objs_mesh) if not save: Scene.show() else: data = Scene.save_image(resolution=(640, 640)) save_img_name = tgt.scan_id + 'camera_view.png' save_path = os.path.join('../SSGResults/cameras', save_img_name) with open(save_path, 'wb') as f: f.write(data) #Scene.show() def read_one_obj(bbox_points, scene_file): scene_mesh = pv.read(scene_file) scene_points = scene_mesh.points # visualize scene o3d_pcd = o3d.geometry.PointCloud() o3d_pcd.points = o3d.utility.Vector3dVector(scene_points) bbox_center = np.mean(bbox_points, axis=0) bbox_size = np.max(bbox_points, axis=0) - np.min(bbox_points, axis=0) gt_o3d_box = o3d.geometry.OrientedBoundingBox(bbox_center, np.eye(3, 3), bbox_size) gt_o3d_box.color = [0, 1, 0] mesh_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.6, 
origin=[-0, -0, -0]) o3d.visualization.draw_geometries([o3d_pcd, gt_o3d_box, mesh_frame]) ================================================ FILE: preprocess/structured3d.py ================================================ import pickle from glob import glob from omegaconf import OmegaConf from joblib import Parallel, delayed, parallel_backend import torch import numpy as np from tqdm import tqdm from preprocess.build import ProcessorBase from preprocess.utils.label_convert import S3D_SCANNET as label_convert from preprocess.utils.constant import * PTS_LIMIT = 480000 class S3DProcessor(ProcessorBase): def record_splits(self, scan_ids): split_dir = self.save_root / 'split' split_dir.mkdir(exist_ok=True) split = {'train': [], 'val': [], 'test': []} split['train'] = [scan_id[1] for scan_id in scan_ids if scan_id[0] == 'train'] split['val'] = [scan_id[1] for scan_id in scan_ids if scan_id[0] == 'val'] split['test'] = [scan_id[1] for scan_id in scan_ids if scan_id[0] == 'test'] for _s, _c in split.items(): with open(split_dir / f'{_s}_split.txt', 'w', encoding='utf-8') as fp: fp.write('\n'.join(_c)) def read_all_scans(self): scan_ids = [] for split in ['train', 'val', 'test']: scan_paths = glob(str(self.data_root) + f'/{split}/*') scan_ids.extend([(split, '_'.join(path.split('/')[-1].split('_')[:-2])) for path in scan_paths]) return scan_ids def process_point_cloud(self, scan_id, plydata, annotations): vertices = plydata[0] vertex_colors = (plydata[1][:,:3] + 1) / 2.0 * 255.0 vertex_instance = -np.ones(vertices.shape[0]) inst_to_label = {} for _id, _box in enumerate(annotations['gt_boxes_upright_depth']): if annotations['class'][_id] in [38, 39, 40]: continue centroid = _box[:3] dimension = _box[3:6] box_max = centroid + dimension/2 box_min = centroid - dimension/2 point_max_mask = np.all(vertices < box_max, axis=1) point_min_mask = np.all(vertices > box_min, axis=1) point_mask = np.logical_and(point_max_mask, point_min_mask) vertex_instance[point_mask] = _id inst_to_label[_id] = label_convert[annotations['class'][_id]] center_points = np.mean(vertices, axis=0) center_points[2] = np.min(vertices[:, 2]) vertices = vertices - center_points assert vertex_colors.shape == vertices.shape assert vertex_colors.shape[0] == vertex_instance.shape[0] if vertices.shape[0] > PTS_LIMIT: pcd_idxs = np.random.choice(vertices.shape[0], size=PTS_LIMIT, replace=False) vertices = vertices[pcd_idxs] vertex_colors = vertex_colors[pcd_idxs] vertex_instance = vertex_instance[pcd_idxs] if self.check_key(self.output.pcd): torch.save(inst_to_label, self.inst2label_path / f"{scan_id}.pth") torch.save((vertices, vertex_colors, vertex_instance), self.pcd_path / f"{scan_id}.pth") def scene_proc(self, scan_id): split = scan_id[0] scan_id = scan_id[1] data_root = self.data_root / split if not (data_root / f'{scan_id}_1cm_seg.pth').exists(): return if not (self.data_root.parent / 'anno_mask' / f'{scan_id}_1cm.bin').exists(): return plydata = torch.load(data_root / f'{scan_id}_1cm_seg.pth') with open(self.data_root.parent / 'anno_mask' / f'{scan_id}_1cm.bin', 'rb') as f: annotations = pickle.load(f) # process point cloud self.process_point_cloud(scan_id, plydata, annotations) def process_scans(self): scan_ids = self.read_all_scans() self.log_starting_info(len(scan_ids)) if self.num_workers > 1: with parallel_backend('multiprocessing', n_jobs=self.num_workers): Parallel()(delayed(self.scene_proc)(scan_id) for scan_id in tqdm(scan_ids)) else: for scan_id in tqdm(scan_ids): self.scene_proc(scan_id) if __name__ == '__main__': # we use the data
processing for Structured3D from Swin3D, # please refer to https://github.com/yuxiaoguo/Uni3DScenes for more details. cfg = OmegaConf.create({ 'data_root': '/path/to/Structured3D/data_out/swin3d_new', 'save_root': '/output/path/to/Structured3D', 'num_workers': 1, 'output': { 'pcd': True, } }) processor = S3DProcessor(cfg) processor.process_scans() ================================================ FILE: preprocess/utils/__init__.py ================================================ ================================================ FILE: preprocess/utils/align_utils.py ================================================ import numpy as np import math def compute_box_3d(size, center, rotmat): """Compute corners of a single box from rotation matrix Args: size: list of float [dx, dy, dz] center: np.array [x, y, z] rotmat: np.array (3, 3) Returns: corners: (8, 3) """ l, h, w = [i / 2 for i in size] center = np.reshape(center, (-1, 3)) center = center.reshape(3) x_corners = [l, l, -l, -l, l, l, -l, -l] y_corners = [h, -h, -h, h, h, -h, -h, h] z_corners = [w, w, w, w, -w, -w, -w, -w] corners_3d = np.dot( np.transpose(rotmat), np.vstack([x_corners, y_corners, z_corners]) ) corners_3d[0, :] += center[0] corners_3d[1, :] += center[1] corners_3d[2, :] += center[2] return np.transpose(corners_3d) def rotate_z_axis_by_degrees(pointcloud, theta, clockwise=True): theta = np.deg2rad(theta) cos_t = np.cos(theta) sin_t = np.sin(theta) rot_matrix = np.array([[cos_t, -sin_t, 0], [sin_t, cos_t, 0], [0, 0, 1]], pointcloud.dtype) if not clockwise: rot_matrix = rot_matrix.T return pointcloud.dot(rot_matrix) def eulerAnglesToRotationMatrix(theta): """Euler rotation matrix with clockwise logic. Rotation Args: theta: list of float [theta_x, theta_y, theta_z] Returns: R: np.array (3, 3) rotation matrix of Rz*Ry*Rx """ R_x = np.array( [ [1, 0, 0], [0, math.cos(theta[0]), -math.sin(theta[0])], [0, math.sin(theta[0]), math.cos(theta[0])], ] ) R_y = np.array( [ [math.cos(theta[1]), 0, math.sin(theta[1])], [0, 1, 0], [-math.sin(theta[1]), 0, math.cos(theta[1])], ] ) R_z = np.array( [ [math.cos(theta[2]), -math.sin(theta[2]), 0], [math.sin(theta[2]), math.cos(theta[2]), 0], [0, 0, 1], ] ) R = np.dot(R_z, np.dot(R_y, R_x)) return R def is_axis_aligned(rotated_box, thres=0.05): x_diff = abs(rotated_box[0][0] - rotated_box[1][0]) y_diff = abs(rotated_box[0][1] - rotated_box[3][1]) return x_diff < thres and y_diff < thres def calc_align_matrix(bbox_list): RANGE = [-45, 45] NUM_BIN = 90 angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) angle_counts = {} for _a in angles: bucket = round(_a, 3) for box in bbox_list: box_r = rotate_z_axis_by_degrees(box, bucket) bottom = box_r[4:] if is_axis_aligned(bottom): angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 if len(angle_counts) == 0: RANGE = [-90, 90] NUM_BIN = 180 angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) for _a in angles: bucket = round(_a, 3) for box in bbox_list: box_r = rotate_z_axis_by_degrees(box, bucket) bottom = box_r[4:] if is_axis_aligned(bottom, thres=0.15): angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 most_common_angle = max(angle_counts, key=angle_counts.get) return most_common_angle ================================================ FILE: preprocess/utils/constant.py ================================================ from enum import Enum import numpy as np import matplotlib.pyplot as plt ### ScanNet200 Benchmark constants ### VALID_CLASS_IDS_200 = ( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 31, 32, 33, 
34, 35, 36, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 84, 86, 87, 88, 89, 90, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 110, 112, 115, 116, 118, 120, 121, 122, 125, 128, 130, 131, 132, 134, 136, 138, 139, 140, 141, 145, 148, 154, 155, 156, 157, 159, 161, 163, 165, 166, 168, 169, 170, 177, 180, 185, 188, 191, 193, 195, 202, 208, 213, 214, 221, 229, 230, 232, 233, 242, 250, 261, 264, 276, 283, 286, 300, 304, 312, 323, 325, 331, 342, 356, 370, 392, 395, 399, 408, 417, 488, 540, 562, 570, 572, 581, 609, 748, 776, 1156, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, ) CLASS_LABELS_200 = ( "wall", "chair", "floor", "table", "door", "couch", "cabinet", "shelf", "desk", "office chair", "bed", "pillow", "sink", "picture", "window", "toilet", "bookshelf", "monitor", "curtain", "book", "armchair", "coffee table", "box", "refrigerator", "lamp", "kitchen cabinet", "towel", "clothes", "tv", "nightstand", "counter", "dresser", "stool", "cushion", "plant", "ceiling", "bathtub", "end table", "dining table", "keyboard", "bag", "backpack", "toilet paper", "printer", "tv stand", "whiteboard", "blanket", "shower curtain", "trash can", "closet", "stairs", "microwave", "stove", "shoe", "computer tower", "bottle", "bin", "ottoman", "bench", "board", "washing machine", "mirror", "copier", "basket", "sofa chair", "file cabinet", "fan", "laptop", "shower", "paper", "person", "paper towel dispenser", "oven", "blinds", "rack", "plate", "blackboard", "piano", "suitcase", "rail", "radiator", "recycling bin", "container", "wardrobe", "soap dispenser", "telephone", "bucket", "clock", "stand", "light", "laundry basket", "pipe", "clothes dryer", "guitar", "toilet paper holder", "seat", "speaker", "column", "bicycle", "ladder", "bathroom stall", "shower wall", "cup", "jacket", "storage bin", "coffee maker", "dishwasher", "paper towel roll", "machine", "mat", "windowsill", "bar", "toaster", "bulletin board", "ironing board", "fireplace", "soap dish", "kitchen counter", "doorframe", "toilet paper dispenser", "mini fridge", "fire extinguisher", "ball", "hat", "shower curtain rod", "water cooler", "paper cutter", "tray", "shower door", "pillar", "ledge", "toaster oven", "mouse", "toilet seat cover dispenser", "furniture", "cart", "storage container", "scale", "tissue box", "light switch", "crate", "power outlet", "decoration", "sign", "projector", "closet door", "vacuum cleaner", "candle", "plunger", "stuffed animal", "headphones", "dish rack", "broom", "guitar case", "range hood", "dustpan", "hair dryer", "water bottle", "handicap bar", "purse", "vent", "shower floor", "water pitcher", "mailbox", "bowl", "paper bag", "alarm clock", "music stand", "projector screen", "divider", "laundry detergent", "bathroom counter", "object", "bathroom vanity", "closet wall", "laundry hamper", "bathroom stall door", "ceiling light", "trash bin", "dumbbell", "stair rail", "tube", "bathroom cabinet", "cd case", "closet rod", "coffee kettle", "structure", "shower head", "keyboard piano", "case of water bottles", "coat rack", "storage organizer", "folded chair", "fire alarm", "power strip", "calendar", "poster", "potted plant", "luggage", "mattress", ) SCANNET_COLOR_MAP_200 = { 0: (0.0, 0.0, 0.0), 1: (174.0, 199.0, 232.0), 2: (188.0, 189.0, 34.0), 3: (152.0, 223.0, 138.0), 4: (255.0, 152.0, 
150.0), 5: (214.0, 39.0, 40.0), 6: (91.0, 135.0, 229.0), 7: (31.0, 119.0, 180.0), 8: (229.0, 91.0, 104.0), 9: (247.0, 182.0, 210.0), 10: (91.0, 229.0, 110.0), 11: (255.0, 187.0, 120.0), 13: (141.0, 91.0, 229.0), 14: (112.0, 128.0, 144.0), 15: (196.0, 156.0, 148.0), 16: (197.0, 176.0, 213.0), 17: (44.0, 160.0, 44.0), 18: (148.0, 103.0, 189.0), 19: (229.0, 91.0, 223.0), 21: (219.0, 219.0, 141.0), 22: (192.0, 229.0, 91.0), 23: (88.0, 218.0, 137.0), 24: (58.0, 98.0, 137.0), 26: (177.0, 82.0, 239.0), 27: (255.0, 127.0, 14.0), 28: (237.0, 204.0, 37.0), 29: (41.0, 206.0, 32.0), 31: (62.0, 143.0, 148.0), 32: (34.0, 14.0, 130.0), 33: (143.0, 45.0, 115.0), 34: (137.0, 63.0, 14.0), 35: (23.0, 190.0, 207.0), 36: (16.0, 212.0, 139.0), 38: (90.0, 119.0, 201.0), 39: (125.0, 30.0, 141.0), 40: (150.0, 53.0, 56.0), 41: (186.0, 197.0, 62.0), 42: (227.0, 119.0, 194.0), 44: (38.0, 100.0, 128.0), 45: (120.0, 31.0, 243.0), 46: (154.0, 59.0, 103.0), 47: (169.0, 137.0, 78.0), 48: (143.0, 245.0, 111.0), 49: (37.0, 230.0, 205.0), 50: (14.0, 16.0, 155.0), 51: (196.0, 51.0, 182.0), 52: (237.0, 80.0, 38.0), 54: (138.0, 175.0, 62.0), 55: (158.0, 218.0, 229.0), 56: (38.0, 96.0, 167.0), 57: (190.0, 77.0, 246.0), 58: (208.0, 49.0, 84.0), 59: (208.0, 193.0, 72.0), 62: (55.0, 220.0, 57.0), 63: (10.0, 125.0, 140.0), 64: (76.0, 38.0, 202.0), 65: (191.0, 28.0, 135.0), 66: (211.0, 120.0, 42.0), 67: (118.0, 174.0, 76.0), 68: (17.0, 242.0, 171.0), 69: (20.0, 65.0, 247.0), 70: (208.0, 61.0, 222.0), 71: (162.0, 62.0, 60.0), 72: (210.0, 235.0, 62.0), 73: (45.0, 152.0, 72.0), 74: (35.0, 107.0, 149.0), 75: (160.0, 89.0, 237.0), 76: (227.0, 56.0, 125.0), 77: (169.0, 143.0, 81.0), 78: (42.0, 143.0, 20.0), 79: (25.0, 160.0, 151.0), 80: (82.0, 75.0, 227.0), 82: (253.0, 59.0, 222.0), 84: (240.0, 130.0, 89.0), 86: (123.0, 172.0, 47.0), 87: (71.0, 194.0, 133.0), 88: (24.0, 94.0, 205.0), 89: (134.0, 16.0, 179.0), 90: (159.0, 32.0, 52.0), 93: (213.0, 208.0, 88.0), 95: (64.0, 158.0, 70.0), 96: (18.0, 163.0, 194.0), 97: (65.0, 29.0, 153.0), 98: (177.0, 10.0, 109.0), 99: (152.0, 83.0, 7.0), 100: (83.0, 175.0, 30.0), 101: (18.0, 199.0, 153.0), 102: (61.0, 81.0, 208.0), 103: (213.0, 85.0, 216.0), 104: (170.0, 53.0, 42.0), 105: (161.0, 192.0, 38.0), 106: (23.0, 241.0, 91.0), 107: (12.0, 103.0, 170.0), 110: (151.0, 41.0, 245.0), 112: (133.0, 51.0, 80.0), 115: (184.0, 162.0, 91.0), 116: (50.0, 138.0, 38.0), 118: (31.0, 237.0, 236.0), 120: (39.0, 19.0, 208.0), 121: (223.0, 27.0, 180.0), 122: (254.0, 141.0, 85.0), 125: (97.0, 144.0, 39.0), 128: (106.0, 231.0, 176.0), 130: (12.0, 61.0, 162.0), 131: (124.0, 66.0, 140.0), 132: (137.0, 66.0, 73.0), 134: (250.0, 253.0, 26.0), 136: (55.0, 191.0, 73.0), 138: (60.0, 126.0, 146.0), 139: (153.0, 108.0, 234.0), 140: (184.0, 58.0, 125.0), 141: (135.0, 84.0, 14.0), 145: (139.0, 248.0, 91.0), 148: (53.0, 200.0, 172.0), 154: (63.0, 69.0, 134.0), 155: (190.0, 75.0, 186.0), 156: (127.0, 63.0, 52.0), 157: (141.0, 182.0, 25.0), 159: (56.0, 144.0, 89.0), 161: (64.0, 160.0, 250.0), 163: (182.0, 86.0, 245.0), 165: (139.0, 18.0, 53.0), 166: (134.0, 120.0, 54.0), 168: (49.0, 165.0, 42.0), 169: (51.0, 128.0, 133.0), 170: (44.0, 21.0, 163.0), 177: (232.0, 93.0, 193.0), 180: (176.0, 102.0, 54.0), 185: (116.0, 217.0, 17.0), 188: (54.0, 209.0, 150.0), 191: (60.0, 99.0, 204.0), 193: (129.0, 43.0, 144.0), 195: (252.0, 100.0, 106.0), 202: (187.0, 196.0, 73.0), 208: (13.0, 158.0, 40.0), 213: (52.0, 122.0, 152.0), 214: (128.0, 76.0, 202.0), 221: (187.0, 50.0, 115.0), 229: (180.0, 141.0, 71.0), 230: (77.0, 208.0, 35.0), 232: (72.0, 183.0, 
168.0), 233: (97.0, 99.0, 203.0), 242: (172.0, 22.0, 158.0), 250: (155.0, 64.0, 40.0), 261: (118.0, 159.0, 30.0), 264: (69.0, 252.0, 148.0), 276: (45.0, 103.0, 173.0), 283: (111.0, 38.0, 149.0), 286: (184.0, 9.0, 49.0), 300: (188.0, 174.0, 67.0), 304: (53.0, 206.0, 53.0), 312: (97.0, 235.0, 252.0), 323: (66.0, 32.0, 182.0), 325: (236.0, 114.0, 195.0), 331: (241.0, 154.0, 83.0), 342: (133.0, 240.0, 52.0), 356: (16.0, 205.0, 144.0), 370: (75.0, 101.0, 198.0), 392: (237.0, 95.0, 251.0), 395: (191.0, 52.0, 49.0), 399: (227.0, 254.0, 54.0), 408: (49.0, 206.0, 87.0), 417: (48.0, 113.0, 150.0), 488: (125.0, 73.0, 182.0), 540: (229.0, 32.0, 114.0), 562: (158.0, 119.0, 28.0), 570: (60.0, 205.0, 27.0), 572: (18.0, 215.0, 201.0), 581: (79.0, 76.0, 153.0), 609: (134.0, 13.0, 116.0), 748: (192.0, 97.0, 63.0), 776: (108.0, 163.0, 18.0), 1156: (95.0, 220.0, 156.0), 1163: (98.0, 141.0, 208.0), 1164: (144.0, 19.0, 193.0), 1165: (166.0, 36.0, 57.0), 1166: (212.0, 202.0, 34.0), 1167: (23.0, 206.0, 34.0), 1168: (91.0, 211.0, 236.0), 1169: (79.0, 55.0, 137.0), 1170: (182.0, 19.0, 117.0), 1171: (134.0, 76.0, 14.0), 1172: (87.0, 185.0, 28.0), 1173: (82.0, 224.0, 187.0), 1174: (92.0, 110.0, 214.0), 1175: (168.0, 80.0, 171.0), 1176: (197.0, 63.0, 51.0), 1178: (175.0, 199.0, 77.0), 1179: (62.0, 180.0, 98.0), 1180: (8.0, 91.0, 150.0), 1181: (77.0, 15.0, 130.0), 1182: (154.0, 65.0, 96.0), 1183: (197.0, 152.0, 11.0), 1184: (59.0, 155.0, 45.0), 1185: (12.0, 147.0, 145.0), 1186: (54.0, 35.0, 219.0), 1187: (210.0, 73.0, 181.0), 1188: (221.0, 124.0, 77.0), 1189: (149.0, 214.0, 66.0), 1190: (72.0, 185.0, 134.0), 1191: (42.0, 94.0, 198.0), } HEAD_CATS_SCANNET_200 = ['tv stand', 'curtain', 'blinds', 'shower curtain', 'bookshelf', 'tv', 'kitchen cabinet', 'pillow', 'lamp', 'dresser', 'monitor', 'object', 'ceiling', 'board', 'stove', 'closet wall', 'couch', 'office chair', 'kitchen counter', 'shower', 'closet', 'doorframe', 'sofa chair', 'mailbox', 'nightstand', 'washing machine', 'picture', 'book', 'sink', 'recycling bin', 'table', 'backpack', 'shower wall', 'toilet', 'copier', 'counter', 'stool', 'refrigerator', 'window', 'file cabinet', 'chair', 'wall', 'plant', 'coffee table', 'stairs', 'armchair', 'cabinet', 'bathroom vanity', 'bathroom stall', 'mirror', 'blackboard', 'trash can', 'stair rail', 'box', 'towel', 'door', 'clothes', 'whiteboard', 'bed', 'floor', 'bathtub', 'desk', 'wardrobe', 'clothes dryer', 'radiator', 'shelf'] COMMON_CATS_SCANNET_200 = ["cushion", "end table", "dining table", "keyboard", "bag", "toilet paper", "printer", "blanket", "microwave", "shoe", "computer tower", "bottle", "bin", "ottoman", "bench", "basket", "fan", "laptop", "person", "paper towel dispenser", "oven", "rack", "piano", "suitcase", "rail", "container", "telephone", "stand", "light", "laundry basket", "pipe", "seat", "column", "bicycle", "ladder", "jacket", "storage bin", "coffee maker", "dishwasher", "machine", "mat", "windowsill", "bulletin board", "fireplace", "mini fridge", "water cooler", "shower door", "pillar", "ledge", "furniture", "cart", "decoration", "closet door", "vacuum cleaner", "dish rack", "range hood", "projector screen", "divider", "bathroom counter", "laundry hamper", "bathroom stall door", "ceiling light", "trash bin", "bathroom cabinet", "structure", "storage organizer", "potted plant", "mattress"] TAIL_CATS_SCANNET_200 = ["paper", "plate", "soap dispenser", "bucket", "clock", "guitar", "toilet paper holder", "speaker", "cup", "paper towel roll", "bar", "toaster", "ironing board", "soap dish", "toilet paper 
dispenser", "fire extinguisher", "ball", "hat", "shower curtain rod", "paper cutter", "tray", "toaster oven", "mouse", "toilet seat cover dispenser", "storage container", "scale", "tissue box", "light switch", "crate", "power outlet", "sign", "projector", "candle", "plunger", "stuffed animal", "headphones", "broom", "guitar case", "dustpan", "hair dryer", "water bottle", "handicap bar", "purse", "vent", "shower floor", "water pitcher", "bowl", "paper bag", "alarm clock", "music stand", "laundry detergent", "dumbbell", "tube", "cd case", "closet rod", "coffee kettle", "shower head", "keyboard piano", "case of water bottles", "coat rack", "folded chair", "fire alarm", "power strip", "calendar", "poster", "luggage"] VALID_CLASS_IDS_200_VALIDATION = ('wall', 'chair', 'floor', 'table', 'door', 'couch', 'cabinet', 'shelf', 'desk', 'office chair', 'bed', 'pillow', 'sink', 'picture', 'window', 'toilet', 'bookshelf', 'monitor', 'curtain', 'book', 'armchair', 'coffee table', 'box', 'refrigerator', 'lamp', 'kitchen cabinet', 'towel', 'clothes', 'tv', 'nightstand', 'counter', 'dresser', 'stool', 'cushion', 'plant', 'ceiling', 'bathtub', 'end table', 'dining table', 'keyboard', 'bag', 'backpack', 'toilet paper', 'printer', 'tv stand', 'whiteboard', 'blanket', 'shower curtain', 'trash can', 'closet', 'stairs', 'microwave', 'stove', 'shoe', 'computer tower', 'bottle', 'bin', 'ottoman', 'bench', 'board', 'washing machine', 'mirror', 'copier', 'basket', 'sofa chair', 'file cabinet', 'fan', 'laptop', 'shower', 'paper', 'person', 'paper towel dispenser', 'oven', 'blinds', 'rack', 'plate', 'blackboard', 'piano', 'suitcase', 'rail', 'radiator', 'recycling bin', 'container', 'wardrobe', 'soap dispenser', 'telephone', 'bucket', 'clock', 'stand', 'light', 'laundry basket', 'pipe', 'clothes dryer', 'guitar', 'toilet paper holder', 'seat', 'speaker', 'column', 'ladder', 'bathroom stall', 'shower wall', 'cup', 'jacket', 'storage bin', 'coffee maker', 'dishwasher', 'paper towel roll', 'machine', 'mat', 'windowsill', 'bar', 'toaster', 'bulletin board', 'ironing board', 'fireplace', 'soap dish', 'kitchen counter', 'doorframe', 'toilet paper dispenser', 'mini fridge', 'fire extinguisher', 'ball', 'hat', 'shower curtain rod', 'water cooler', 'paper cutter', 'tray', 'shower door', 'pillar', 'ledge', 'toaster oven', 'mouse', 'toilet seat cover dispenser', 'furniture', 'cart', 'scale', 'tissue box', 'light switch', 'crate', 'power outlet', 'decoration', 'sign', 'projector', 'closet door', 'vacuum cleaner', 'plunger', 'stuffed animal', 'headphones', 'dish rack', 'broom', 'range hood', 'dustpan', 'hair dryer', 'water bottle', 'handicap bar', 'vent', 'shower floor', 'water pitcher', 'mailbox', 'bowl', 'paper bag', 'projector screen', 'divider', 'laundry detergent', 'bathroom counter', 'object', 'bathroom vanity', 'closet wall', 'laundry hamper', 'bathroom stall door', 'ceiling light', 'trash bin', 'dumbbell', 'stair rail', 'tube', 'bathroom cabinet', 'closet rod', 'coffee kettle', 'shower head', 'keyboard piano', 'case of water bottles', 'coat rack', 'folded chair', 'fire alarm', 'power strip', 'calendar', 'poster', 'potted plant', 'mattress') BASE_CLASS_1 = ['cabinet', 'bed', 'chair', 'table', 'door', 'window', 'picture', 'counter', 'curtain', 'refrigerator', 'shower curtain', 'sink', 'bathtub'] NOVEL_CLASS_1 = ['sofa chair', 'bookshelf', 'desk', 'toilet'] BASE_CLASS_2 = ['cabinet', 'sofa chair', 'door', 'window', 'counter', 'desk', 'curtain', 'refrigerator', 'shower curtain', 'toilet'] NOVEL_CLASS_2 = ['bed', 'chair', 
'table', 'bookshelf', 'picture', 'sink', 'bathtub'] BASE_CLASS_3 = ['cabinet', 'bed', 'chair', 'sofa chair', 'table', 'door', 'window', 'curtain'] NOVEL_CLASS_3 = ['bookshelf', 'picture', 'counter', 'desk', 'refrigerator', 'shower curtain', 'toilet', 'sink', 'bathtub'] ALL = list(CLASS_LABELS_200) RANDOM_COLOR_MAP = [] for _ in range(1000): for k in range(12): RANDOM_COLOR_MAP.append(plt.cm.Set3(k)) for k in range(9): RANDOM_COLOR_MAP.append(plt.cm.Set1(k)) for k in range(8): RANDOM_COLOR_MAP.append(plt.cm.Set2(k)) RANDOM_COLOR_MAP.append((0, 0, 0, 0)) RANDOM_COLOR_MAP = np.array(RANDOM_COLOR_MAP) * 255 class PromptType(Enum): TEXT = 1 IMAGE = 2 POINT = 3 ================================================ FILE: preprocess/utils/label_convert.py ================================================ ARKITSCENE_SCANNET= { 'bed': 'bed', 'cabinet': 'cabinet', 'refrigerator': 'refrigerator', 'table': 'table', 'chair': 'chair', 'sink': 'sink', 'stove': 'stove', 'oven': 'oven', 'washer': 'washing machine', 'shelf': 'shelf', 'tv_monitor': 'tv', 'bathtub': 'bathtub', 'toilet': 'toilet', 'sofa': 'sofa', 'stool': 'stool', 'fireplace': 'fireplace', 'build_in_cabinet': 'cabinet', 'dishwasher': 'dishwasher', 'stairs': 'stairs' } MULTISCAN_SCANNET = { "wall": "wall", "door": "door", "slippers": "shoe", "mop": "broom", "rug": "rug", "floor": "floor", "basin": "sink", "basin_stand": "sink", "bucket": "bucket", "shower": "shower", "water_tank": "container", "beam": "wood beam", "pillar": "pillar", "ceiling": "ceiling", "sink": "sink", "toilet": "toilet", "cabinet": "cabinet", "remove": "object", "towel": "towel", "pillow": "pillow", "sofa": "sofa", "footstool": "footstool", "picture": "picture", "window": "window", "heater": "heater", "mirror": "mirror", "pipe": "pipe", "scarf": "cloth", "ceiling_light": "ceiling light", "chair": "chair", "table": "table", "vent": "vent", "bag": "bag", "wall_cabinet": "cabinet", "range": "stove", "ricemaker": "rice cooker", "pan": "cooking pan", "coffee_machine": "coffee maker", "rice_bag": "bag", "light": "light", "trashbin": "trash bin", "kettle": "kettle", "refrigerator": "refrigerator", "microwave": "microwave", "light_switch": "light switch", "rice_cooker": "rice cooker", "box": "box", "shoe": "shoe", "range_hood": "range hood", "wok": "cooking pan", "router": "object", "paper_towel": "paper towel roll", "stock_pot": "pot", "cutting_board": "cutting board", "wall_calendar": "calendar", "baseboard": "object", "coke_box": "box", "printer": "printer", "bowl": "bowl", "backpack": "backpack", "baseboard_heater": "heater", "broom": "broom", "dust_pan": "dustpan", "trash_bin": "trash bin", "rigid_duct": "vent", "electric_range": "stove", "spatula": "object", "faucet": "faucet", "bottle": "bottle", "countertop": "counter", "railing": "railing", "suitcase": "suitcase", "trash": "trash can", "pot": "pot", "kitchen_tool": "object", "vegetable": "object", "board": "board", "washing_machine": "washing machine", "jar": "jar", "object": "object", "notebook": "book", "induction_cooker": "stove", "instant_pot_lid": "cooking pot", "oven": "oven", "air_fryer": "object", "lid": "pot", "sponge": "sponge", "blender": "object", "spoon": "object", "dishwasher": "dishwasher", "detergent": "laundry detergent", "watermelon": "bananas", "yard_waste_bag": "garbage bag", "container": "container", "newspapers": "paper", "rag": "cloth", "ladder": "ladder", "gate": "door", "napkin_box": "tissue box", "jacket": "jacket", "windowsill": "windowsill", "water_faucet": "faucet", "steel_ball": "ball", "rice_maker": 
"rice cooker", "watter_bottle": "water bottle", "plastic_bag": "bag", "paper_bag": "paper bag", "cuttting_board": "cutting board", "trash_bin_lid": "trash bin", "hair_dryer": "hair dryer", "electric_socket": "power outlet", "electric_panel": "electric panel", "wash_stand": "sink", "soap": "soap", "curtain": "curtain", "bathtub": "bathtub", "smoke_detector": "smoke detector", "roll_paper": "paper towel roll", "chandelier": "chandelier", "hand_sanitizer": "hand sanitzer dispenser", "plate": "plate", "sticker": "sticker", "power_socket": "power outlet", "stacked_cups": "stack of cups", "stacked_chairs": "stack of chairs", "air_vent": "vent", "cornice": "cabinet", "wine_cabinet": "kitchen cabinet", "crock": "bowl", "liquor_box": "cabinet", "shampoo": "shampoo", "shower_curtain": "shower curtain", "wall_light": "wall lamp", "sink_cabinet": "sink", "toilet_roll": "toilet paper", "shelf": "shelf", "paper_bin": "recycling bin", "toilet_brush": "toilet brush", "shower_head": "shower head", "tv": "tv", "remote_control": "remote", "tv_box": "tv stand", "nightstand": "nightstand", "bed": "bed", "quilt": "blanket", "telephone": "telephone", "monitor": "monitor", "desk": "desk", "radiator_shell": "radiator", "calendar": "calendar", "clock": "clock", "keyboard": "keyboard", "speaker": "speaker", "clothes": "clothes", "door_frame": "doorframe", "sliding_door": "sliding door", "ceiling_lamp": "ceiling lamp", "scale": "scale", "power_strip": "power strip", "switch": "light switch", "basket": "basket", "stool": "stool", "shoes": "shoe", "slipper": "slippers", "bifold_door": "door", "rangehood": "range hood", "books": "books", "toilet_paper": "toilet paper", "mouse_pad": "mouse", "ipad": "ipad", "scissor": "knife block", "radiator": "radiator", "pc": "computer tower", "bicycle": "bicycle", "wardrobe": "wardrobe", "mouse": "mouse", "advertising_board": "poster", "banner": "banner", "ceiling_decoration": "ceiling light", "whiteboard": "whiteboard", "wall_storage_set": "shelf", "traffic_cone": "traffic cone", "wall_decoration": "decoration", "papers": "papers", "hat": "hat", "velvet_hangers": "clothes hanger", "circular_plate": "plate", "cellphone": "telephone", "pen": "keyboard piano", "paper": "paper", "lamp": "lamp", "curtain_box": "curtains", "woodcarving": "wood", "scissors": "knife block", "hand_dryer": "hand dryer", "machine": "machine", "vase": "vase", "plant": "plant", "power_socket_case": "power outlet", "gloves": "clothes", "dishcloth": "cloth", "painting": "painting", "shower_wall": "shower wall", "showerhead": "shower head", "tooth_mug": "cup", "map": "map", "knot_artwork": "decoration", "fan": "fan", "sphygmomanometer": "scale", "electric_kettle": "kettle", "bread_maker": "oven", "knife_set": "knife block", "soup_pot": "cooking pot", "flatware_set": "cutting board", "candle": "candle", "lid_rack": "dish rack", "flower": "flowerpot", "can": "can", "scoop": "bowl", "laptop": "laptop", "glass": "glass doors", "wet_floor_sign": "wet floor sign", "shower_enclosure": "shower doors", "jewelry_box": "jewelry box", "bath_brush": "hair brush", "sofa_cushion": "couch cushions", "tv_cabinet": "tv stand", "wood_fence": "wood beam", "floor_lamp": "lamp", "computer_case": "computer tower", "waste_container": "trash bin", "roadblock": "barricade", "trash_can_lids": "trash can", "hand_sanitizer_stand": "soap dispenser", "air_conditioner": "conditioner bottle", "pattern": "rug", "remote_controller": "remote", "phone": "telephone", "speakers": "speaker", "table_divider": "divider", "table_card": "card", 
"paper_trimmer": "paper cutter", "stapler": "stapler", "cup": "cup", "bathroom_heater": "heater", "wall_shelf": "shelf", "towel_rack": "towel", "sink_drain": "sink", "floor_drain": "floor", "broom_head": "broom", "door_curtain": "curtain", "refill_pouch": "plastic container", "bin": "bin", "stall_wall": "bathroom stall door", "wall_speaker": "speaker", "laundry_basket": "laundry basket", "tissue_box": "tissue box", "document_holder": "file cabinet", "yoga_mat": "yoga mat", "gas_range": "stove", "chopping_board": "cutting board", "book_scanner": "scanner", "payment_terminal": "vending machine", "napkin_roll": "paper towel roll", "faucet_switch": "faucet", "glass_door": "glass doors", "carpet": "carpet", "shower_floor": "shower floor", "toilet_plunger": "plunger", "plug_panel": "power outlet", "stand": "stand", "potted_plant": "potted plant", "poster": "poster", "isolation_board": "divider", "soap_holder": "soap dish", "plug": "power outlet", "brush": "hair brush", "threshold": "doorframe", "air_conditioner_controller": "remote", "iron": "iron", "ironing_board": "ironing board", "safe": "suitcase", "gas_cooker": "stove", "pressure_cooker": "cooking pot", "steamer_pot": "pot", "soy_sauce_bottle": "bottle", "dishwashing_liquid": "dishwashing soap bottle", "water_ladle": "bowl", "power_socket_set": "power strip", "kitchen_tool_holder": "kitchen cabinet", "case": "case", "wall_paper": "wall", "comb": "hair brush", "paper_cutter": "paper cutter", "pencil_sharpener": "pen holder", "sealing_machine": "machine", "poster_board": "poster", "shredder": "shredder", "footstep": "stair", "planter": "plant", "floor_light": "lamp", "paper_cup": "cup", "divider": "divider", "hanger": "clothes hanger", "glove": "clothing", "blanket": "blanket", "remote": "remote", "cloth": "cloth", "clutter": "object", "extinguisher": "fire extinguisher", "dryer": "clothes dryer", "soap_bottle": "soap bottle", "fabric_softener_box": "box", "dryer_sheet_box": "box", "detergent_bottle": "laundry detergent", "toaster": "toaster", "stacked_bowls": "bowl", "pot_lid": "pot", "electric_pressure_cooker": "rice cooker", "bread": "food display", "bagels": "object", "oranges": "bananas", "card_reader": "card", "whiteboard_detergent": "soap dispenser", "power_outlet": "power outlet", "bouquet": "vase", "water_bottle": "water bottle", "wall_mounted_telephone": "telephone", "fridge": "refrigerator", "toy": "toy dinosaur", "shoe_box": "box", "hole_puncher": "paper cutter", "landline_telephone": "telephone", "base": "stand", "handkerchief": "cloth", "cornice_molding": "frame", "bathtub_base": "bathtub", "bidet": "toilet", "pedestal_urinal": "urinal", "pedestal_urinal_covered": "urinal", "pit_toilet": "toilet", "low_wall": "wall", "rail": "rail", "bottles": "bottles", "floor_otherroom": "floor", "wall_otherroom": "wall", "canopy": "canopy", "cable_manager": "cable", "sneakers": "shoes", "purse": "purse", "cushion": "cushion", "napkin": "towel", "plush_toy": "stuffed animal", "adjustable_desk": "desk", "tableware": "plates", "computer_desk": "desk", "cat_kennel": "cat litter box", "back_cushion": "pillow", "ukulele_bag": "guitar case", "litter_box": "trash can", "storage_box": "storage bin", "toy_doll": "doll", "drawer_unit": "drawer", "doll": "stuffed animal", "laptop_bag": "messenger bag", "clothing_rack": "clothing rack", "bookshelf": "bookshelves", "mask": "cloth", "watch": "clock", "book": "books", "ashtray": "tray", "car_key": "car", "wallet": "purse", "tea_pot": "tea kettle", "wire": "cable", "rake": "broom", "dispenser": "soap 
dispenser", "toilet_tank": "toilet", "door_sill": "doorframe", "cleanser": "soap", "armrest": "armchair", "short_wall": "wall", "suspended_ceiling": "ceiling", "fire_extinguisher_cabinet": "fire extinguisher", "plastic_box": "plastic container", "sanitation_station": "soap dispenser", "plant_pot": "flowerpot", "fireplace": "fireplace", "computer_table": "desk", "tissue_bag": "tissue box", "wall_frame": "frame", "map_board": "map", "automated_teller_machine": "vending machine", "ticket": "card", "tablet": "ipad", "blankets": "blanket", "bags": "bag", "flag": "flag", "blackboard": "blackboard", "bar_table": "bar", "cardboard_holder": "cardboard", "potted_planet": "potted plant", "tray": "tray", "utensil_holder": "kitchen counter", "bird_ceramics": "statue", "shirt": "shirt", "clothes_rail": "clothes hanger", "power_strips": "power strip", "card_board": "board", "pile_of_blankets": "blanket", "bed_net": "bed", "umbrella": "umbrella", "dragon_fruit": "bananas", "tissue": "tissue box", "electrical_panel": "electric panel", "panel": "door", "tube": "tube", "pile_of_cloth": "cloth", "surface": "table", "chair_cushion": "cushion", "guide": "book", "parapet": "railing", "camera": "camera", "light_base": "lamp base", "first_aid": "object", "bench": "bench", "potted_plants": "potted plant", "pot_cover": "pot", "yoga_mat_roll": "yoga mat", "panda_doll": "stuffed animal", "window_trim": "window", "shoe_cabinet": "shoe rack", "toilet_paper_holder": "toilet paper dispenser", "shower_faucet": "shower faucet handle", "bath_sponge": "sponge", "ornament": "decoration", "planter_box": "plant", "cooktop": "stove", "knife_block": "knife block", "step_stool": "step stool", "touchpad": "keyboard", "light_box": "light", "sound": "speaker", "exhaust_fan_vent": "vent", "paperbin": "recycling bin", "mop_bucket": "bucket", "sneaker": "shoes", "objects": "object", "cd_tray": "cd case", "wall_board": "board", "room_divider": "divider", "paiting": "painting", "cabinet_otherroom": "cabinet", "electric_switch": "light switch", "sign": "exit sign", "hand_soap": "soap bottle", "window_blinds": "blinds" } RSCAN_SCANNET = { 'pillow': 'pillow', 'box': 'box', 'item': 'object', 'curtain': 'curtain', 'towel': 'towel', 'garbage bin': 'trash bin', 'wall': 'wall', 'floor': 'floor', 'figure': 'statue', 'frame': 'frame', 'shelf': 'shelf', 'clothes': 'clothing', 'picture': 'picture', 'organizer': 'organizer shelf', 'ceiling': 'ceiling', 'object': 'object', 'cabinet': 'cabinet', 'blanket': 'blanket', 'monitor': 'monitor', 'door': 'door', 'roll': 'paper towel roll', 'bed': 'bed', 'desk': 'desk', 'window': 'window', 'nightstand': 'nightstand', 'rack': 'rack stand', 'plant': 'potted plant', 'cushion': 'cushion', 'light': 'light', 'table': 'table', 'windowsill': 'windowsill', 'shades': 'blinds', 'sofa': 'sofa', 'beanbag': 'beanbag chair', 'commode': 'toilet', 'heater': 'heater', 'trash can': 'trash can', 'child chair': 'stool', 'mirror': 'mirror', 'lamp': 'lamp', 'sink': 'sink', 'cupboard': 'kitchen cabinet', 'toilet paper': 'toilet paper rolls', 'toilet': 'toilet', 'handhold': 'hand rail', 'vase': 'vase', 'toilet brush': 'toilet brush', 'armchair': 'armchair', 'doorframe': 'doorframe', 'bathtub': 'bathtub', 'bath cabinet': 'bathroom cabinet', 'basket': 'basket', 'shower curtain': 'shower curtain', 'bin': 'trash bin', 'kitchen hood': 'range hood', 'kitchen cabinet': 'kitchen cabinet', 'kitchen sofa': 'sofa', 'chair': 'chair', 'rag': 'towel', 'kitchen counter': 'kitchen counter', 'oven': 'oven', 'microwave': 'microwave', 'fruit plate': 
'plate', 'player': 'keyboard piano', 'kitchen appliance': 'microwave', 'kettle': 'kettle', 'wardrobe': 'wardrobe closet', 'stool': 'stool', 'stand': 'stand', 'shoes': 'shoes', 'counter': 'counter', 'hand dryer': 'hand dryer', 'suitcase': 'suitcase', 'closet': 'closet', 'tv': 'tv', 'bag': 'bag', 'laptop': 'laptop', 'jalousie': 'blinds', 'whiteboard': 'whiteboard', 'planter': 'flowerpot', 'shower': 'shower', 'hanging cabinet': 'kitchen cabinets', 'flower': 'plant', 'washbasin': 'sink', 'clothes dryer': 'clothes dryers', 'sack': 'bag', 'basin': 'sink', 'radiator': 'radiator', 'refrigerator': 'refrigerator', 'clutter': 'object', 'vacuum cleaner': 'vacuum cleaner', 'shelf unit': 'shelf', 'mop': 'broom', 'ironing board': 'ironing board', 'iron': 'iron', 'bucket': 'bucket', 'toy': 'doll', 'stairs': 'stairs', 'barrel': 'container', 'washing machine': 'washing machine', 'carpet': 'carpet', 'sidecouch': 'couch', 'tv stand': 'tv stand', 'bench': 'bench', 'humidifier': 'humidifier', 'hanger': 'clothes hanger', 'backpack': 'backpack', 'drawer': 'drawer', 'console': 'computer tower', 'hangers': 'clothes hanger', 'blinds': 'blinds', 'balcony door': 'door', 'upholstered wall': 'wall', 'coffee table': 'coffee table', 'blackboard': 'blackboard', 'glass wall': 'window', 'bottles': 'bottle', 'pack': 'bag', 'scale': 'scale', 'ventilation': 'fan', 'paper towel': 'paper towel roll', 'bottle': 'bottle', 'dish dryer': 'dish rack', 'candle': 'candle', 'pc': 'computer tower', 'washing': 'washing machine', 'tube': 'tube', 'snowboard': 'board', 'board': 'board', 'pipe': 'pipe', 'water heater': 'water heater', 'vacuum': 'vacuum cleaner', 'stuffed animal': 'stuffed animal', 'decoration': 'decoration', 'shower wall': 'shower wall', 'telephone': 'telephone', 'plate': 'plate', 'watering can': 'can', 'device': 'object', 'stove': 'stove', 'kitchen towel': 'towel', 'garbage': 'trash can', 'shampoo': 'shampoo bottle', 'statue': 'statue', 'shower door': 'shower door', 'book': 'book', 'fan': 'fan', 'speaker': 'speaker', 'pile of books': 'books', 'side table': 'end table', 'table lamp': 'table lamp', 'couch': 'couch', 'magazine': 'magazine', 'papers': 'papers', 'books': 'books', 'furniture': 'furniture', 'magazine files': 'magazine rack', 'mannequin': 'object', 'boxes': 'boxes', 'clock': 'clock', 'cube': 'object', 'napkins': 'cloth', 'stuffed': 'stuffed animal', 'luggage': 'luggage', 'partition': 'divider', 'trash': 'trash can', 'coffee': 'coffee maker', 'bar': 'bar', 'newspaper': 'paper', 'wood': 'wood', 'fireplace': 'fireplace', 'dining': 'dining table', 'dining table': 'dining table', 'bread': 'object', 'fruits': 'object', 'kitchen': 'kitchen cabinet', 'can': 'can', 'squeezer': 'object', 'bowl': 'bowl', 'recycle': 'recycling bin', 'barstool': 'stool', 'computer': 'computer tower', 'umbrella': 'umbrella', 'bath': 'bathtub', 'hanging': 'hanging', 'rocking': 'object', 'objects': 'object', 'flowers': 'plant', 'plants': 'plant', 'jar': 'jar', 'bedside': 'nightstand', 'buggy': 'object', 'side': 'object', 'socket': 'power outlet', 'showcase': 'display case', 'drying': 'drying rack', 'ottoman': 'ottoman', 'pictures': 'pictures', 'storage': 'storage bin', 'footstool': 'footstool', 'folding': 'folded chair', 'ladder': 'ladder', 'shoe': 'shoes', 'pet': 'object', 'medical': 'object', 'soap': 'soap', 'balcony': 'object', 'foosball': 'foosball table', 'hand': 'object', 'bookshelf': 'bookshelf', 'pile': 'object', 'cleaning': 'object', 'flush': 'toilet flush button', 'towels': 'towels', 'candlestick': 'candle', 'puf': 'object', 'printer': 
'printer', 'shelves': 'shelf', 'stair': 'stair', 'cleanser': 'soap bottle', 'armoire': 'wardrobe closet', 'bidet': 'object', 'exit': 'exit sign', 'toaster': 'toaster', 'laundry': 'laundry basket', 'hood': 'range hood', 'sponge': 'sponge', 'fridge': 'refrigerator', 'breadboard': 'cutting board', 'pan': 'frying pan', 'water': 'water bottle', 'teapot': 'tea kettle', 'projector': 'projector', 'juicer': 'kitchen mixer', 'cutting': 'cutting board', 'windows': 'windowsill', 'food': 'food container', 'cup': 'cup', 'rug': 'rug', 'column': 'column', 'keyboard': 'keyboard', 'office': 'office chair', 'exhaust': 'range hood', 'apron': 'kitchen apron', 'pepper': 'salt', 'knife': 'knife block', 'cooking': 'cooking pot', 'tablet': 'ipad', 'bicycle': 'bicycle', 'pillar': 'pillar', 'machine': 'washing machine', 'meter': 'scale', 'cut': 'paper cutter', 'salt': 'salt', 'candles': 'candle', 'grass': 'plant', 'sidetable': 'end table', 'sewing': 'sewing machine', 'guitar': 'guitar', 'flag': 'flag', 'paper': 'paper', 'sugar': 'bowl', 'cups': 'cups', 'packs': 'boxes', 'plates': 'plates', 'tray': 'tray', 'chandelier': 'chandelier', 'mandarins': 'bananas', 'puppet': 'doll', 'painting': 'painting', 'cradle': 'crib', 'price': 'tag', 'dish': 'dish rack', 'boiler': 'boiler', 'fruit': 'bananas', 'multicooker': 'rice cooker', 'items': 'object', 'extractor': 'juicer', 'air': 'fan', 'dressing': 'mirror', 'round': 'round table', 'screen': 'screen', 'mattress': 'mattress', 'bike': 'bicycle', 'rolled': 'rolled poster', 'locker': 'cabinet', 'tennis': 'tennis racket', 'cap': 'cap', 'ball': 'ball', 'folder': 'folder', 'milk': 'refridgerator', 'dishdrainer': 'dish rack', 'dishwasher': 'dishwasher', 'piano': 'piano', 'stereo': 'speaker', 'upholstered': 'couch', 'folded': 'folded chairs', 'loft': 'loft bed', 'aquarium': 'fish', 'dispenser': 'soap dispenser', 'body': 'person', 'sign': 'sign', 'baby': 'crib', 'chest': 'chest', 'pot': 'pot', 'drawers': 'drawer', 'rail': 'rail', 'platform': 'platform', 'tree': 'plant', 'armor': 'helmet', 'ironing': 'ironing board', 'headboard': 'headboard', 'crib': 'crib', 'beverage': 'bottle', 'plank': 'wood', 'generator': 'machine', 'file': 'file cabinet', 'coat': 'coat rack', 'tool': 'toolbox', 'rolling': 'cart', 'tire': 'tire', 'cable': 'cable', 'fence': 'gate', 'handrail': 'handrail', 't-shirt': 'shirt', 'ramp': 'stairs', 'seat': 'seat', 'sideboard': 'cabinet', 'lounger': 'chair', 'discs': 'cd case', 'drum': 'drum set', 'drinks': 'soda can', 'chairs': 'chair', 'dishes': 'dish rack', 'linen': 'towel', 'glass': 'glass', 'xbox': 'xbox controller', 'ukulele': 'guitar', 'pin': 'needle' } HM3D_SCANNET = { "wall": "wall", "picture": "picture", "floor": "floor", "fireplace": "fireplace", "window": "window", "window frame": "window", "door": "door", "door knob": "door", "door frame": "doorframe", "ceiling": "ceiling", "ceiling fan": "ceiling fan", "fireplace shelf": "fireplace", "hearth": "fireplace", "fireplace floor": "fireplace", "armchair": "armchair", "table": "table", "coffee table": "coffee table", "table lamp": "table lamp", "sofa": "sofa", "pillow": "pillow", "tv stand": "tv stand", "tv": "tv", "device": "object", "chair": "chair", "cutlery": "knife block", "plate": "plate", "napkins": "towel", "ceiling lamp": "ceiling light", "kitchen cabinet": "kitchen cabinet", "shelf": "shelf", "fridge": "refrigerator", "microwave": "microwave", "kitchen cabinet lower": "kitchen cabinet", "coffee machine": "coffee maker", "oven": "oven", "kettle": "kettle", "tray": "tray", "knife set": "kitchen cabinet", "wall 
lamp": "wall lamp", "sink": "sink", "tap": "faucet", "detergent": "soap", "unknown": "object", "toaster": "toaster", "dishwasher": "dishwasher", "cabinet": "cabinet", "bed": "bed", "bedside table": "nightstand", "clock": "clock", "air vent": "vent", "hanger": "clothes hanger", "mirror": "mirror", "wash cabinet": "bathroom vanity", "washbasin": "sink", "faucet": "faucet", "cosmetic": "toiletry", "soap": "soap", "cosmetics": "toiletry", "towel": "towel", "bin": "bin", "toilet paper": "toilet paper", "toilet": "toilet", "bath": "bathtub", "bath curtain": "shower curtain", "curtain bar": "curtain rod", "bath shelf": "shelf", "bath dial": "shower faucet handle", "bath faucet": "faucet", "decoration": "decoration", "tissue box": "tissue box", "hand towel": "hand towel", "storage unit": "storage bin", "shower wall": "shower wall", "shower seat": "seat", "shower floor": "shower floor", "shower cabin": "shower", "shower curtain": "shower curtain", "shower bar": "handrail", "showerhead": "shower head", "shower dial": "shower faucet handle", "shower hanger": "clothes hanger", "rug": "rug", "fire detector": "smoke detector", "banister": "banister", "plunger": "plunger", "rod": "rod", "washer-dryer": "washing machine", "couch": "couch", "lamp": "lamp", "pouffe": "ottoman", "furniture": "furniture", "fan": "fan", "chest of drawers": "dresser", "curtain": "curtain", "curtain rod": "curtain rod", "desk": "desk", "hanging clothes": "clothing", "barrel": "container", "bathtub": "bathtub", "drawer": "drawer", "countertop": "counter", "bathroom shelf": "shelf", "knob": "door", "toilet brush": "toilet", "shower knob": "shower", "stairs": "staircase", "handrail": "banister", "bathroom cabinet": "bathroom vanity", "paper towel": "paper towel dispenser", "towel ring": "towel", "towel bar": "towel", "kitchen counter": "kitchen cabinet", "refrigerator": "mini fridge", "vase": "flowerpot", "kitchen utensil": "cooking pot", "kitchen countertop item": "kitchen counter", "shelving": "shelf", "pitcher": "water pitcher", "bowl": "bowl", "kitchen island": "kitchen counter", "trashcan": "trash can", "stove": "oven", "box": "storage box", "clutter": "object", "painting": "picture", "book": "bookshelf", "toy": "stuffed animal", "heater": "radiator", "ceiling vent": "vent", "floor mat": "mat", "hand soap": "soap dispenser", "flower": "potted plant", "toilet paper dispenser": "toilet paper", "bathrobe": "bathrobe", "bed table": "nightstand", "bedside lamp": "lamp", "folding chair": "folded chair", "patio chair": "chair", "grill": "oven", "balustrade": "banister", "attic door": "door", "sensor": "smoke detector", "wall hanging decoration": "decoration", "doormat": "mat", "clothes hanger": "clothes", "wall cabinet": "cabinet", "wall clock": "clock", "led tv": "tv", "fireplace wall": "fireplace", "firewood holder": "wood", "floor lamp": "lamp", "curtain rail": "curtain rod", "wine rack": "bar", "wine bottle": "bottle", "wall electronics": "power outlet", "washing machine": "washing machines", "kitchen appliance": "kitchen cabinets", "bed light": "lamp", "electric box": "electric panel", "guitar": "guitar case", "media console": "tv stand", "newspaper": "magazine", "wardrobe": "wardrobe closet", "bottle of soap": "soap bottle", "ventilation hood": "range hood", "sauna heater": "heater", "sauna bowl": "bowl", "rail": "handrail", "spa bench": "bench", "bathroom utensil": "toiletry", "recessed wall": "wall", "art frame": "picture", "appliance": "machine", "decorative plant": "potted plant", "flowerpot": "vase", "door/window frame": 
"doorframe", "cardboard box": "box", "shoe": "shoes", "clothes": "clothing", "clothes rack": "clothing rack", "shelf with clutter": "shelf", "case": "suitcase", "backpack": "messenger bag", "hat": "cap", "storage cabinet": "cabinet", "bag": "grocery bag", "basket of something": "basket", "blanket": "blanket", "vacuum cleaner": "roomba", "window shutter": "blinds", "exercise bike": "exercise machine", "door/window": "doorframe", "plant": "potted plant", "bath sink": "sink", "shower hose": "shower head", "ironing board": "iron", "stand": "podium", "rack": "rack stand", "bottle of detergent": "laundry detergent", "storage box": "storage container", "jar": "container", "ceiling dome": "ceiling", "container": "box", "refrigerator cabinet": "refrigerator", "compound wall": "wall", "glass": "cup", "bottle": "bottle", "speaker": "speaker", "telephone": "telephone", "sofa chair": "sofa chair", "small table/stand": "end table", "blinds": "blinds", "stairs railing": "stair rail", "bar": "bar", "air conditioner": "vent", "bed stand": "nightstand", "shower door frame": "shower door", "bathroom accessory": "soap dispenser", "trash can": "trash can", "liquid soap": "soap", "desk chair": "office chair", "desk clutter": "papers", "book rack": "bookshelf", "wall panel": "wall", "closet area for hanging clothes": "wardrobe closet", "laundry basket": "laundry basket", "shower rod": "shower curtain rod", "shower hose/head": "shower head", "box of tissues": "tissue box", "shower ceiling": "ceiling", "plush toy": "stuffed animal", "dresser": "dresser", "recycle bin": "recycling bin", "desk lamp": "desk lamp", "basket": "basket", "drum": "instrument case", "stack of papers": "papers", "bath towel": "towel", "bath cabinet": "bathroom cabinet", "carpet": "carpet", "printer": "printer", "bench": "bench", "flower vase": "vase", "furnace": "heater", "sink cabinet": "bathroom vanity", "paper": "paper", "bath wall": "bath walls", "shower soap shelf": "soap dish", "chandelier": "chandelier", "window curtain": "curtain", "monitor": "monitor", "keyboard": "keyboard", "computer mouse": "mouse", "computer desk": "desk", "board": "board", "snack": "food container", "machine": "machine", "mini fridge": "mini fridge", "arcade game": "machine", "exercise machine": "exercise machine", "shutter": "door", "pad": "pillow", "computer equipment": "computer tower", "computer chair": "office chair", "wall shelf": "shelf", "staircase handrail": "handrail", "bed small": "bed", "dog bed": "pillow", "photo": "picture", "lamp table": "table lamp", "footstool": "footstool", "bath utensil": "bath products", "brush": "hair brush", "chest": "chest", "throw blanket": "blanket", "bowl of fruit": "bowl", "highchair": "seat", "laptop": "laptop", "file cabinet": "file cabinet", "food stand": "food display", "dining table": "dining table", "wall toilet paper": "toilet paper holder", "cabinet table": "cabinet", "ventilation": "vent", "wall indent": "wall", "window glass": "window", "bath tub": "bathtub", "cabinet door": "cabinet door", "support beam": "wood beam", "holder": "soap dish", "side table": "end table", "smoke detector": "smoke detector", "dinnerware": "plates", "dinner table": "dining table", "mirror /otherroom": "mirror", "shower": "shower", "pipe": "pipe", "motion detector": "alarm", "paper towel holder": "paper towel dispenser", "ornament": "decoration", "bedside cabinet": "nightstand", "ceiling door": "door", "stool": "stool", "countertop item": "kitchen counter", "island": "kitchen island", "range hood": "range hood", "door cabinet": 
"cabinet door", "window /outside": "window", "frame": "frame", "rocking chair": "chair", "fruit": "banana holder", "tissue": "tissue box", "plate of food": "plate", "shower curtain rod": "shower curtain rod", "shower tap": "shower faucet handle", "soapbox": "soap bar", "soap dispenser": "soap dispenser", "remote control": "remote", "ceiling duct": "vent", "pool stick": "pool table", "statue": "statue", "mat": "mat", "fire alarm": "fire alarm", "toilet brush holder": "toilet brush", "vent": "vent", "step": "step", "shower pipe": "shower head", "laundry machine": "washing machine", "bucket": "bucket", "broom": "broom", "cutting board": "cutting board", "oven and stove": "oven", "oven vent": "vent", "kitchen countertop items": "kitchen counter", "kitchen lower cabinet": "kitchen cabinet", "grate": "vent", "beam": "wood beam", "pillar": "pillar", "dining chair": "chair", "basket of towels": "towels", "stair handle": "handrail", "stair": "stair", "stair wall": "wall", "exercise ball": "exercise ball", "exercise equipment": "exercise machine", "exercise mat": "yoga mat", "bedframe": "bedframe", "lamp stand": "lamp base", "pool table": "pool table", "dartboard": "dart board", "bar cabinet": "bar", "bar chair": "chair", "display cabinet": "display case", "display table": "table", "screen": "screen", "teapot": "tea kettle", "railing": "railing", "unknown/remove": "object", "alarm clock": "alarm clock", "alarm": "alarm", "hunting trophy": "decoration", "water dispenser": "water cooler", "antique telephone": "telephone", "calendar": "calendar", "knife holder": "rack", "spice rack": "rack", "fireplace tool set": "fireplace", "firewood": "wood", "ceiling under staircase": "ceiling", "can": "can", "bathtub platform": "bathtub", "scale": "scale", "tile": "floor", "belt": "clothing", "parapet": "railing", "three": "object", "computer": "computer tower", "treadmill": "treadmill", "duct": "vent", "electrical box": "electric panel", "stair frame": "stair", "ladder": "ladder", "water heater": "water heater", "heater piping": "pipe", "exhaust pipe": "pipe", "pump": "water cooler", "hose": "hose", "switch": "light switch", "storage bin": "storage bin", "plastic bag": "bag", "wall board": "board", "washing machine and dryer": "washing machine", "flag": "flag", "can of paint": "can", "broomstick": "broom", "plumbing": "pipe", "column": "column", "ceiling pipe": "pipe", "cables": "power strip", "landing": "stairs", "ledge": "ledge", "mop": "broom", "tv remote": "remote", "closet door": "closet door", "door window": "door", "door hinge": "door", "doorway": "doorframe", "alarm control": "alarm", "radiator": "radiator", "ceiling light fixture connection": "ceiling light", "mirror frame": "mirror", "mantel": "fireplace", "firebox": "fireplace", "fireplace sconce": "fireplace", "fume cupboard": "cabinet", "decorative bowl": "bowl", "stonework": "wall", "kitchen shelf": "kitchen cabinet", "window shade": "curtain", "radio": "speaker", "cooker": "stove", "kitchen table": "dining table", "cup": "cup", "tiles": "floor", "washbasin counter": "sink", "doorstep": "door", "stairs wall": "stairs", "stairs trim": "stairs", "stacked chair": "stack of chairs", "carpet roll": "carpet", "safe": "cabinet", "briefcase": "briefcase", "door stopper": "door", "shades": "blinds", "electric cable": "power strip", "nightstand": "nightstand", "clothes hanger rod": "closet rod", "shower stall": "shower", "sink/basin": "sink", "worktop": "kitchen counter", "basin": "sink", "diploma": "frame", "garage door": "garage door", "garage door frame": 
"garage door", "garage door opener": "garage door", "boxes": "boxes", "jacket": "jacket", "tree": "plant", "tree branch": "plant", "shovel": "broom", "tool": "toolbox", "binder": "binders", "paper storage": "paper tray", "folder": "file cabinet", "flower stand": "plant", "coat hanger": "clothes hanger", "wall detail": "wall", "boiler": "boiler", "shower-bath cabinet": "bathroom cabinet", "sofa seat": "sofa", "platform": "platform", "panel": "wall", "coffee maker": "coffee maker", "dish cabinet": "dish rack", "candle holder": "candle", "ceiling molding": "ceiling", "bed comforter": "bed", "cuddly toy": "teddy bear", "decorative plate": "plate", "figure": "statue", "bouquet": "flowerpot", "kitchen extractor": "range hood", "pot": "pot", "vessel": "bowl", "kitchen top": "kitchen counter", "painting frame": "painting", "casket": "chest", "bathroom towel": "towel", "washing stuff": "soap", "shelf cubby": "shelf", "shower rail": "shower curtain rod", "cloth": "cloth", "stick": "rod", "luggage": "luggage", "sconce": "wall lamp", "lounge chair": "armchair", "patio floor": "floor", "roof": "ceiling", "jewelry": "jewelry box", "bedroom ceiling": "ceiling", "cushion": "pillow", "ceiling bedroom": "ceiling", "record player": "speaker", "perfume": "soap", "shower handle": "shower faucet handle", "shampoo": "shampoo bottle", "weight": "dumbbell", "candlestick": "candle", "cabinet kitchen": "kitchen cabinet", "antique clock": "clock", "picture frame": "picture", "candle": "candle", "air conditioning": "fan", "floor /outside": "floor", "fence": "rail", "canopy": "canopy", "end table": "end table", "shelf with art": "shelf", "dishrag": "towel", "staircase trim": "staircase", "bookshelf": "bookshelf", "decorative quilt": "blanket", "shower tub": "bathtub", "clothes dryer": "clothes dryer", "clothes hamper": "laundry hamper", "bathroom counter": "bathroom counter", "cart": "cart", "weight bench": "bench", "rack of weights": "dumbbell", "ceiling under stairs": "ceiling", "storage shelving": "storage shelf", "office chair": "office chair", "doll": "doll", "step stool": "step stool", "pc tower": "computer tower", "control panel": "electric panel", "umbrella": "umbrella", "food": "food container", "closet": "closet", "fireplace utensil": "fireplace", "holy cross": "decoration", "tablet": "ipad", "mug": "mug", "box of tissue": "tissue box", "sled": "luggage stand", "electrical controller": "electric panel", "dressing table": "dresser", "freezer": "refrigerator", "paneling": "wall", "object": "object", "pile of magazines": "magazine", "objects": "object", "counter": "counter", "outlet": "power outlet", "fire extinguisher": "fire extinguisher", "entertainment set": "tv stand", "window shutters": "blinds", "window valence": "curtain", "shower curtain bar": "shower curtain rod", "statue/art": "statue", "brochure": "paper", "trinket": "decoration", "toilet cleaner": "toilet brush", "air duct": "vent", "plank": "wood", "bed cabinet": "nightstand", "ottoman": "ottoman", "air refresher": "spray bottle", "cleaner": "vacuum cleaner", "shower shelf": "shower", "ceiling lower": "ceiling", "cases": "suitcases", "magazines": "magazine", "grab bar": "grab bar", "bath door frame": "doorframe", "shower door knob": "shower door", "document": "paper", "cleaning paper": "paper towel roll", "bath floor": "floor", "shower bench": "bench", "shower step": "step", "slippers": "slippers", "boots": "shoes", "weights": "dumbbell", "cap": "cap", "note": "paper", "golf sticks": "golf bag", "glasses": "mirror", "medal collection": 
"decoration", "decorative dinnerware": "plate", "music player": "speaker", "jug": "pitcher", "sliding door": "door", "kitchen knife set": "kitchen cabinet", "sitting bench": "bench", "bowl with sweets": "bowl", "decorative vase": "vase", "rolled carpet": "carpet", "ball": "ball", "dumbbell": "dumbbell", "folded chair": "folded chair", "screen frame": "screen", "speaker stand": "speaker", "projector": "projector", "bath tap": "faucet", "stuffed animal": "stuffed animal", "ceiling window": "window", "kitchen handle": "kitchen cabinet", "podium": "podium", "office table": "desk", "button": "toilet flush button", "decoder": "computer tower", "food tray": "tray", "elevator": "elevator", "bathtub utensil": "bathtub", "footrest": "footrest", "stair step": "stairs", "ceiling light": "ceiling light", "telescope": "telescope", "ping pong table": "ping pong table", "photo mount": "picture", "bookstand": "bookshelf", "curtain rod cover": "curtain rod", "baby changing table": "changing station", "bulletin board": "bulletin board", "stack of jackets": "jacket", "crib": "crib", "kitchen cabinet door": "cabinet door", "kitchen cabinet drawer": "kitchen cabinet", "potty": "toilet", "bottles of wine": "beer bottles", "table cloth": "table", "iron board": "ironing board", "shoes": "shoes", "bottles of water": "water bottle", "watch": "clock", "cable": "power outlet", "desk door": "desk", "notebook": "book", "file binder": "binders", "bathroom stuff": "bath products", "bath mat": "mat", "handbag": "bag", "robe": "bathrobe", "sculpture": "statue", "kitchen decoration": "decoration", "pan": "cooking pan", "handle": "door", "bicycle": "bicycle", "piano": "piano", "piano stool": "piano bench", "wall post": "wall", "magazine": "magazine", "cover": "cover", "tool box": "toolbox", "wood": "wood", "support": "structure", "hook": "rack", "gravel": "floor", "rafter": "wood beam", "soap bottle": "soap bottle", "dustbin": "trash bin", "light": "light", "sign": "sign", "plates": "plate", "thermostat": "thermostat", "mascot": "stuffed animal", "stack of books": "stack of chairs", "stack of books / papers": "papers", "decorative cloth": "cloth", "locker": "cabinet", "wine storage": "cabinet", "cleaning liquid": "dishwashing soap bottle", "cleaning sponge": "sponge", "stack of pots": "pot", "seat": "seat", "air vent fan": "fan", "shower handrail": "handrail", "ceiling fire detector": "smoke detector", "ceiling fan vent": "vent", "curtain hanger": "curtain rod", "bath bar": "bath walls", "book cabinet": "bookshelf", "desk cabinet": "desk", "papers": "papers", "sheets / clothes": "clothes", "wall hanger": "wall mounted coat rack", "ceiling lamp hanger": "ceiling lamp", "stack of stuff": "stack of chairs", "socket": "power outlet", "shower door": "shower door", "fan air vent": "vent", "crate": "crate", "wire": "power outlet", "installation": "structure", "sideboard": "cabinet", "wall sign": "sign", "camera": "camera", "urinal": "urinal", "stone support structure": "structure", "bottles": "bottles", "bathroom mat": "mat", "shower ceiling lamp": "ceiling lamp", "shower cosmetics": "bath products", "fire sprinkler": "fire extinguisher", "bowl of sweets": "bowl", "sprinkler": "hose", "self-closing mechanism": "door", "utensil": "kitchen cabinet", "control": "light switch", "headphones": "headphones", "bidet": "toilet", "arch": "doorframe", "closet shelving": "closet", "bean bag chair": "beanbag chair", "scarf": "clothes", "shelf clutter": "shelf", "bed sheet": "bed", "bowl of fruits": "bowl", "cabinet clutter": "cabinet", "light 
fixture": "ceiling light", "dvd player": "tv", "surface": "table", "surfboard": "board", "whine shelf": "shelf", "amplifier": "speaker", "stereo": "speaker", "fire screen": "fireplace", "ice maker": "refrigerator", "knife": "kitchen cabinet", "closet mirror wall": "mirror", "table tennis table": "ping pong table", "baby changing station": "changing station", "book display": "bookshelf", "sliding glass door": "door", "l-shaped sofa": "sofa", "soap dish": "soap dish", "mixer": "kitchen mixer", "puppet": "doll", "electric guitar": "guitar", "music album shelf": "shelf", "bathroom floor": "floor", "powder soap": "soap", "decorative lamp": "lamp", "toothbrush": "toothbrush", "shower grab bar": "grab bar", "candle stand": "candle", "electric outlet": "power outlet", "photos": "pictures", "shower caddy": "shower", "mouse": "mouse", "car model": "car", "guitar frame": "guitar case", "ship toy": "boat", "platter": "plate", "decorative bottle": "bottle", "audio player": "speaker", "firewood chest": "chest", "kitchen seating": "seat", "hats": "hat", "cat toilet": "cat litter box", "electric kettle": "kettle", "security camera": "camera", "light switch": "light switch", "schedule": "calendar", "globe": "globe", "magazine rack": "magazine", "ceiling/west wall": "ceiling", "shelf with shoes": "shoe rack", "kitchen lower shelf": "kitchen cabinet", "sewing machine": "sewing machine", "sewing set": "sewing machine", "sewing tools": "sewing machine", "strings": "rope", "decorative lantern": "lamp", "paper towel dispenser": "paper towel dispenser", "stack of cds": "cd case", "stationery": "paper", "shutters": "blinds", "keys": "cabinet", "rope": "rope", "iron": "iron", "trolley": "trolley", "closet shelf": "closet", "ceiling wall": "ceiling", "coach": "sofa", "sink table": "sink", "pool": "pool table", "entrance arch": "doorframe", "ceiling arch": "ceiling", "arcade": "object", "ceiling support": "pillar", "map": "map", "stained glass": "window", "ceiling decorative lamp": "ceiling light", "bust": "statue", "framed text": "picture", "ceiling boarder": "ceiling", "exhibition window frame": "window", "exhibition window": "window", "elevator door": "elevator", "exhibition panel": "display case", "information": "sign", "chassis": "structure", "ceiling ladder": "ladder", "exhibition picture": "picture", "exhibition table": "table", "ship model": "boat", "garage door opener motor": "garage door", "garage door opener bar": "bar", "garage door railing": "railing", "riser": "stairs", "dustpan": "dustpan", "coat": "coat", "doorpost": "doorframe", "backrest": "chair", "fruit bowl": "bowl", "easy chair": "armchair", "stone": "object", "ashtray": "container", "smoke alarm": "smoke detector", "headboard": "headboard", "bedpost": "bedframe", "wall cubby": "shelf", "toiletry": "toiletry", "globe stand": "stand", "swivel chair": "office chair", "bar soap": "soap", "sheet": "bed", "shirt": "shirt", "coffee mug": "mug", "toilet seat": "toilet", "door handle": "door", "exit sign": "exit sign", "kitchen wall": "wall", "washcloth": "washcloth", "table stand": "stand", "wall /outside": "wall", "gym rope": "rope", "gym equipment": "dumbbell", "barbell": "dumbbell", "gym stepper": "stair", "gym mat": "mat", "wall beam": "wood beam", "meshwork": "object", "punchbag": "bag", "boxing ring": "structure", "water tank": "container", "exercise mat roll": "mat", "gate": "door", "trophy": "statue", "payment terminal": "object", "cash register": "object", "power strip": "power strip", "cloth hanger": "clothes hanger", "jewelry box": "jewelry 
box", "extension cord": "power outlet", "gramophone": "music stand", "stereo set": "speaker", "newspaper basket": "basket", "phone": "telephone", "hourglass": "clock", "cat": "cat litter box", "balcony railing": "railing", "bed curtain": "curtain", "swing": "furniture", "skateboard": "board", "cat tree": "furniture", "laundry": "clothes", "clothing stand": "clothing rack", "sink pipe": "pipe", "trash bag": "trash bag", "lid": "cover", "foosball game table": "foosball table", "workout bike": "exercise machine", "canvas": "picture", "cat food": "food container", "chair /w clutter": "chair", "wreath": "decoration", "shade": "curtain", "wall soap shelf": "soap dish", "hair brush": "hair brush", "hair dryer": "hair dryer", "shower mat": "mat", "beanbag chair": "beanbag chair", "yoga mat": "yoga mat", "computer tower": "computer tower", "canister": "container", "motorcycle": "bicycle", "cabinet /w clutter": "cabinet", "tire": "tire", "axe": "toolbox", "partition": "divider", "electric wire casing": "power outlet", "cleaning clutter": "broom", "shower tray": "shower", "fuse box": "fuse box", "mortar": "container", "soap dish cubby": "soap dish", "curtain valence": "curtain", "beanbag": "beanbag chair", "boxes with books": "boxes", "wine": "bottle", "box with books": "box", "electric heater": "heater", "hot water/cold water knob": "faucet", "wall tv": "tv", "box of something": "box", "garage door motor": "garage door", "fuse panel": "electric panel", "closet storage area": "closet", "lampshade": "lamp", "dvd": "object", "curtain box": "curtain", "board game": "object", "easel": "easel", "blackboard": "blackboard", "overhang": "structure", "drainpipe": "pipe", "grass": "floor", "patio": "floor", "wall statue": "statue", "recessed shelving": "shelf", "cornice": "wall", "coat rack": "coat rack", "candelabra": "chandelier", "tea set": "cup", "purse": "purse", "notes": "paper", "wall outside": "wall", "office wall": "wall", "floor stand": "stand", "banner": "banner", "cups": "cups", "drawer sink table": "sink", "liquid cleaner": "spray bottle", "wall panel frame": "frame", "poster": "poster", "brochures": "papers", "calculator": "object", "pavement": "floor", "chest drawer": "dresser", "pen cup": "cup", "pendant": "object", "box pen": "box", "bathtub tap": "bathtub", "sink tap": "sink", "sewing box": "box", "attic hatch": "ceiling", "shredder": "shredder", "decor": "decoration", "board games": "object", "chess": "object", "piano bench": "piano bench", "basket with books": "basket", "cross": "object", "kitchen utensils": "kitchen cabinet", "christmas tree": "decoration", "containers": "container", "cans of paint": "can", "cardboard": "cardboard", "basketballs": "ball", "sleeping bag": "sleeping bag", "cloth hangers": "clothes hanger", "scoop": "object", "electrical device": "electric panel", "chimney": "structure", "bags": "bag", "box of food": "box", "liquid container": "bottle", "staircase wall": "wall", "garage door opener railing": "garage door", "electric wire": "electric panel", "silicone gun": "object", "wheel": "wheel", "drawers": "drawer", "boards": "board", "spray": "spray bottle", "car": "car", "bicycle helmets": "helmet", "baseball bat": "object", "hammock": "object", "camping chair": "chair", "poles": "structure", "saw": "object", "baskets": "basket", "watering can": "can", "football": "ball", "notebooks": "book", "shoes on shelf": "shoes", "clothes on shelf": "clothes", "rack with shoes": "shoe rack", "set of hangers": "clothes hanger", "stack of clothes": "clothes", "figurine": "object", 
"door screen": "door", "subwoofer": "speaker", "dish": "plate", "stack of towels": "towel", "shower wall cubby": "shower wall", "stack of shoes": "shoes", "stack of blankets": "blanket", "duvet": "blanket", "stack of pillows": "pillows", "stack of bags": "bag", "set of cosmetics": "object", "detergent bottle": "bottle", "central heating furnace": "heater", "set of valves": "pipe", "closet rod": "closet rod", "chair stand": "chair", "bathroom art": "painting", "ceiling fixture": "ceiling light", "shower glass": "shower door", "bath curtain bar": "shower curtain rod", "bath grab bar": "grab bar", "mascots": "statue", "attic entrance": "door", "shoe shelf": "shoe rack", "guitar stand": "guitar case", "keyboard stand": "keyboard piano", "air vent installation": "vent", "washing powder": "laundry detergent", "electric installation": "electric panel", "tank": "container", "stack of boxes": "boxes", "title": "sign", "dinner chair": "chair", "cabinet /otherroom": "cabinet", "plug": "power outlet", "countertop /otherroom": "counter", "whiteboard": "whiteboard", "dishes": "plate", "flatware": "kitchen cabinet", "garage light": "ceiling light", "aquarium": "fish", "high shelf": "shelf", "clothes bag": "laundry bag", "copier machine": "copier", "partial": "structure", "stairs skirt": "stairs", "roomba": "roomba", "hutch": "cabinet", "icebox": "refrigerator", "stack": "stack of chairs", "wine cabinet": "cabinet", "plant ornament": "potted plant", "fireplace mirror": "mirror", "stand/small table": "end table", "artwork": "painting", "leg rest": "footrest", "counter door": "cabinet door", "artwork frame": "frame", "oil lamp": "lamp", "round chair": "chair", "violin case": "instrument case", "folding stand": "folded chair", "dress": "clothing", "art/statue": "statue", "cassette": "cd case", "clothes container": "clothing rack", "shelf / cabinet": "shelf", "gap": "divider", "alarm controller": "alarm clock", "wall control": "light switch", "flush button": "toilet flush button", "bread bin": "container", "soundbar": "speaker", "game board": "board", "chaise longue": "sofa chair", "lantern": "lamp", "glass door": "door", "shower battery": "shower control valve", "dispenser": "soap dispenser", "glass pane": "window", "basin faucet": "faucet", "lawn": "plant", "garden chair": "bench", "garden bench": "bench", "sky": "ceiling", "garden deck": "object", "ceiling corridor": "ceiling", "router": "object", "cleaning spray": "spray bottle", "elephant sculpture": "statue", "salt and pepper grinder": "salt", "projector screen": "projector screen", "gift": "box", "typewriter": "keyboard", "violin": "guitar", "sheet music": "music book", "kitchen chair": "chair", "cabinet drawer": "drawer", "pitchfork": "object", "hammer": "toolbox", "salver": "tray", "napkin": "towel", "fork": "knife block", "apron": "kitchen apron", "sledge": "object", "chamber pot": "toilet", "floor /otherroom": "floor", "door /otherroom": "door", "painting /otherroom": "painting", "window frame /otherroom": "window", "mannequin": "clothing", "cloth holder": "clothes hanger", "storage": "storage box", "bread": "food container", "bottom of stairs": "stairs", "fire pit": "fireplace", "balcony": "platform", "night lamp": "lamp", "bathroom window": "window", "skirting board": "wall", "pencil": "pen holder", "sticky notes": "paper", "pen": "pen holder", "spoon": "plate", "desk organizer": "organizer", "stapler": "paper", "flush": "toilet flush button", "document holder": "file cabinet", "conference phone": "telephone", "identifier": "object", "shower 
frame": "shower", "compressor": "machine", "recuperator": "machine", "terrace": "platform", "cook book": "book", "rocking horse": "object", "kitchen counter support": "kitchen counter", "keyboard piano": "piano", "power breaker box": "electric panel", "stack of yarns": "storage box", "den": "object", "sponge": "washcloth", "deck chair": "chair", "torch": "lamp", "trough": "bucket", "gutter": "pipe", "basket of fruits": "basket", "file": "paper", "stack of files": "file cabinet", "shower cockpit": "shower", "shower mirror": "mirror", "shower window frame": "window", "bathroom glass": "mirror", "circular sofa": "sofa", "soap dispenser shelf in shower": "soap dispenser", "recessed cubby": "shelf", "set of armchairs": "armchair", "umbrella stand": "stand", "post": "pillar", "bed cabinet lamp": "lamp", "cross-trainer": "exercise machine", "modem": "machine", "paper holder": "paper organizer", "rock": "object", "bathmat": "mat", "vanity": "bathroom vanity", "window /otherroom": "window", "junk": "trash can", "decorative mask": "decoration", "slab": "platform", "sofa set": "sofa", "pedestal": "stand", "display": "monitor", "lighting fixture": "lamp", "molding": "wall", "model": "object", "niche": "shelf", "billiard balls": "ball", "billiard cues": "stick", "ball pouffe": "seat", "air freshener": "spray bottle", "press": "machine", "pencil holder": "organizer", "hatch": "door", "jacuzzi": "bathtub", "bedding": "bed", "lounger": "chair", "salt and pepper": "dispenser", "kitchen sink cabinet": "sink", "lighter": "object", "fur carpet": "carpet", "bath side table": "end table", "bell": "object", "panel screen": "divider", "fish tank": "container", "skylight": "window", "material": "cloth", "mantle": "shelf", "note board": "bulletin board", "cd": "cd case", "pet bowl": "bowl", "wall top": "wall", "pet bed": "bed", "box with toys": "box", "cage": "container", "birdhouse": "decoration", "box with shoes": "box", "fluorescent light": "light", "medical lamp": "lamp", "paper towels": "towel", "relief": "object", "massage bed": "bed", "hand dryer": "hand dryer", "makeup accessories": "toiletry", "double armchair": "armchair", "bathroom wall": "wall", "art piece": "painting", "foot spa": "bathtub", "spa armchair": "armchair", "spa bathtub": "bathtub", "thermometer": "thermostat", "sauna seat": "seat", "sauna support": "handrail", "sauna oven": "oven", "foam": "sponge", "beverage dispenser": "water cooler", "box with tea": "box", "shelf with cosmetics": "shelf", "product": "object", "advertisement": "poster", "steel plate": "plate", "electric hub": "power outlet", "solarium": "structure", "flush push": "toilet flush button", "cleaner brush": "broom", "solarium door": "door", "bottle dispenser": "dispenser", "floor vent": "vent", "staircase": "stairs", "emergency sign": "exit sign", "bed base": "bed", "dehumidifier": "humidifier", "package": "box", "cyp": "plant", "stack of magazines": "magazine", "stack of binders": "binders", "planner": "calendar", "box of paper": "boxes of paper", "groceries": "food container", "alcohol bottles": "bottles", "cookies": "food display", "kitchenware": "cooking pan", "bag with something": "bag", "set of knives": "knife block", "display of pictures": "pictures", "toothpaste": "toothpaste", "toiletry bag": "toiletry", "sandals": "shoes", "mobile": "telephone", "cd player": "cd case", "breadbox": "storage box", "electric toothbrush": "toothbrush", "eyeglasses": "glass", "art/clutter": "decoration", "photo stand": "picture", "records": "music book", "microphone": "speaker", "shoe 
cabinet": "shoe rack", "dog leash": "object", "stairwell": "staircase", "flashlight": "light", "backsplash": "wall", "rag": "cloth", "chaise": "seat", "shoe rack": "shoe rack", "playpen": "crib", "shower cabinet": "shower", "chain": "chain", "drill": "object", "tool rack": "rack", "cork board": "bulletin board", "dish dryer": "dish rack", "hand cloth": "hand towel", "chest of drawer": "drawer", "secretary": "desk", "air hockey": "object", "medal": "object", "shower valve": "shower control valve", "ceiling border": "ceiling", "toaster oven": "toaster oven", "foot stand": "footrest", "closet floor": "floor", "recliner": "armchair", "condiment": "salt", "security detector": "smoke detector", "porcelain": "vase", "mail": "mail", "ceiling panel": "ceiling", "cleaning fluid": "laundry detergent", "ceiling lamp rail": "ceiling light", "medical object": "object", "electricity box": "electric panel", "meter": "scale", "electrical installation": "power outlet", "pole": "pillar", "extension lead": "power strip", "coaster": "mat", "bonsai tree": "plant", "window seat": "seat", "dish rack": "dish rack", "cradle": "crib", "tent": "structure", "headset": "headphones", "table pad": "table", "archway": "doorframe", "box of fruit": "box", "laundry bag": "laundry bag", "fireplace brush": "broom", "pantry": "kitchen cabinet", "shower case": "shower", "drawer desk": "desk", "soft chair": "armchair", "workstation": "desk", "moose head/sculpture/hunting trophy": "decoration", "plane": "airplane", "storage space": "storage bin", "cat litter box": "cat litter box", "separator": "divider", "bridge": "structure", "ball pool": "ball", "stage": "platform", "curb": "structure", "sunbed": "couch", "mailbox": "mailbox", "exercise ladder": "ladder", "stones": "structure", "sauna heat rocks": "structure", "barbecue": "object", "bath towels": "towel", "boat model": "boat", "copier": "copier", "bureau": "dresser", "kitchen sink": "sink", "bedside cabinet drawer": "drawer", "bedside cabinet door": "cabinet doors", "bath carpet": "rug", "bathroom cabinet drawer": "drawer", "bathroom cabinet door": "cabinet doors", "shower base": "shower", "tennis racket": "tennis racket", "window/door": "door", "bathtub knob": "bathtub", "set of towels": "towels", "glass container": "container", "stack of trays": "tray", "bathroom rug": "rug", "binders": "binders", "hole puncher": "stapler", "stools": "stool", "mousepad": "keyboard", "shoe case": "shoe rack", "screw box": "toolbox", "electric cord": "power outlet", "bricks": "structure", "plenum box": "box", "water meter": "object", "gauge": "object", "gas meter": "object", "water pump": "machine", "water outlet": "faucet", "crutches": "crutches", "beer crate": "crate", "bread box": "box", "electric plug": "power outlet", "air purifier": "fan", "bicycle helmet": "helmet", "extractor hood": "range hood", "brushes": "toilet brush", "vegetables": "object", "waffle iron": "iron", "bag with sheets": "bag", "lamp shade": "lamp", "cat bed": "bed", "row of theater chairs": "seat", "rice cooker": "rice cooker", "blanket basket": "basket", "blankets": "blanket", "scanner": "scanner", "shower utensil": "soap", "cabinet /w cluttered art": "cabinet", "place mat": "mat", "draw": "drawer", "water fountain": "water fountain", "aisle frame": "frame", "box with jewelry": "jewelry box", "decorative tray": "tray", "decorative vessel": "vase", "ceramics": "vase", "charger": "power outlet", "wifi router": "object", "prop": "decoration", "fishing rod": "rod", "fitness ball": "exercise ball", "exercising blocks": 
"dumbbell", "drawing": "picture", "foosball table": "foosball table", "baseboard": "board", "gas box": "box", "stone bench": "bench", "ceiling fan lamp": "ceiling fan", "chest bench": "bench", "fur": "rug", "shoulder bag": "bag", "decorative window": "window", "lace doily": "table", "bedroom table": "end table", "blouse": "clothing", "drawer cabinet": "cabinet", "dried flowers": "plant", "decorative frame": "frame", "box with photos": "box", "electric device": "power outlet", "electric freshener": "object", "fishing pole": "rod", "vinyl records": "object", "apple": "object", "card": "card", "star": "star", "ragdoll": "doll", "doily": "table", "ragdoll cat": "object", "pot lid": "pot", "pouches": "bag", "gun": "nerf gun", "handkerchiefs": "cloth", "training mat": "mat", "painting roll": "painting", "painting rolls": "painting", "painting tray": "painting", "controller": "controller", "electrical switchboard": "electric panel", "glue": "tape", "measuring tape": "tape", "ruler": "object", "wrench": "object", "pliers": "object", "screwdriver": "object", "level": "object", "spatula": "object", "square": "object", "roll": "paper towel roll", "cans": "can", "base": "object", "rods": "rod", "baseball cap": "cap", "shade rail": "rail", "crayon": "object", "drawer cart": "cart", "gas furnace": "furnace", "silicone tube": "tube", "drywall board": "wall", "canal": "object", "cat food bag": "object", "buckets": "bucket", "spirit level": "object", "air heater": "heater", "trampoline": "object", "folded table": "folded table", "hoverboard": "hoverboard", "kitchen towel": "towel", "knife stand": "knife block", "food processor": "kitchen mixer", "bottle of wine": "bottle", "frying pan": "frying pan", "cooker hood": "range hood", "ball chair": "chair", "leaflets": "paper", "strongbox": "chest", "dog toy": "toy dinosaur", "console pad": "controller", "console pad charger": "object", "console": "media center", "baby seat": "carseat", "ceiling floor": "ceiling", "drawers for clothes": "dresser", "drinking fountain": "water fountain", "cone": "traffic cone", "albums": "cd case", "sticker book": "book", "price tag": "object", "stack of chairs": "stack of chairs", "magic marker": "pen holder", "music equipment stand": "music stand", "electric percussion": "drum set", "magic marker box": "pen holder", "ceiling chassis": "ceiling", "ceiling hanger": "ceiling", "guitar pedals": "guitar", "ukulele": "guitar", "guitar cases": "guitar case", "guitar case": "guitar case", "guitar case cover": "guitar case", "keyboard cover": "keyboard", "product box": "box", "stack of product boxes": "boxes", "board with keys": "keyboard", "trombone": "instrument case", "trumpet": "instrument case", "saxophone": "instrument case", "clarinet": "instrument case", "tambourine": "instrument case", "product boxes": "boxes", "electric drum": "drum set", "cabinet counter": "kitchen cabinet", "counter desk": "desk", "album": "book", "stack of albums": "stack of chairs", "shop shelf": "shelf", "microphone accessory": "music stand", "audio cable": "power outlet", "audio cables": "power strip", "guitar straps": "guitar case", "t-shirt": "clothing", "stack of t-shirts": "stacks of cups", "socks": "sock", "products": "container", "keyboard box": "keyboard piano", "stack of music stands": "stack of chairs", "round cushion": "pillow", "tripod": "tripod", "baby chair": "armchair", "shoes rack": "shoe rack", "acoustic panel": "divider", "sauna": "shower", "game": "board", "electronics": "computer tower", "music equipment": "keyboard piano", "bunk bed": 
"bunk bed", "stack of plates": "plates", "antlers": "statue", "menu board": "board", "jars": "jar", "menu": "board", "cleaner bottle": "spray bottle", "receipt printer": "printer", "sombrero": "hat", "shisha": "pipe", "keg": "container", "saturator": "humidifier", "gas container": "container", "spices": "salt", "receipt spike": "paper", "spray can": "spray bottle", "bowls": "bowl", "bottle opener": "bottle", "gloves": "clothing", "stovetop": "stovetop" } S3D_SCANNET = { 1: 'wall', 2: 'floor', 3: 'cabinet', 4: 'bed', 5: 'chair', 6: 'sofa', 7: 'table', 8: 'door', 9: 'window', 10: 'bookshelf', 11: 'picture', 12: 'counter', 13: 'blinds', 14: 'desk', 15: 'shelf', 16: 'curtain', 17: 'dresser', 18: 'pillow', 19: 'mirror', 20: 'mat', 21: 'clothes', 22: 'ceiling', 23: 'books', 24: 'refrigerator', 25: 'tv', 26: 'paper', 27: 'towel', 28: 'shower curtain', 29: 'box', 30: 'whiteboard', 31: 'person', 32: 'nightstand', 33: 'toilet', 34: 'sink', 35: 'lamp', 36: 'bathtub', 37: 'bag', 38: 'otherstructure', 39: 'otherfurniture', 40: 'otherprop'} ================================================ FILE: requirements.txt ================================================ git+https://github.com/openai/CLIP.git git+https://github.com/facebookresearch/fvcore accelerate==0.28.0 addict==2.4.0 antlr4-python3-runtime==4.9.3 appdirs==1.4.4 asttokens==2.4.1 attrs==23.2.0 blinker==1.7.0 certifi==2024.2.2 charset-normalizer==3.3.2 click==8.1.7 comm==0.2.2 ConfigArgParse==1.7 contourpy==1.2.0 cycler==0.12.1 dash==2.16.1 dash-core-components==2.0.0 dash-html-components==2.0.0 dash-table==5.0.0 decorator==5.1.1 docker-pycreds==0.4.0 einops==0.7.0 exceptiongroup==1.2.0 executing==2.0.1 fastjsonschema==2.19.1 filelock==3.13.3 Flask==3.0.2 fonttools==4.50.0 fsspec==2024.3.1 ftfy==6.2.0 fvcore==0.1.5.post20221221 gitdb==4.0.11 GitPython==3.1.42 huggingface-hub==0.22.1 hydra-core==1.3.2 idna==3.6 importlib_metadata==7.1.0 importlib_resources==6.4.0 iopath==0.1.10 ipython==8.18.1 ipywidgets==8.1.2 itsdangerous==2.1.2 jedi==0.19.1 Jinja2==3.1.3 joblib==1.3.2 jsonlines==4.0.0 jsonschema==4.21.1 jsonschema-specifications==2023.12.1 jupyter_core==5.7.2 jupyterlab_widgets==3.0.10 kiwisolver==1.4.5 MarkupSafe==2.1.5 matplotlib==3.8.3 matplotlib-inline==0.1.6 mpmath==1.3.0 nbformat==5.10.3 nest-asyncio==1.6.0 networkx==3.2.1 numpy==1.26.4 nvidia-cublas-cu12==12.1.3.1 nvidia-cuda-cupti-cu12==12.1.105 nvidia-cuda-nvrtc-cu12==12.1.105 nvidia-cuda-runtime-cu12==12.1.105 nvidia-cudnn-cu12==8.9.2.26 nvidia-cufft-cu12==11.0.2.54 nvidia-curand-cu12==10.3.2.106 nvidia-cusolver-cu12==11.4.5.107 nvidia-cusparse-cu12==12.1.0.106 nvidia-nccl-cu12==2.19.3 nvidia-nvjitlink-cu12==12.4.99 nvidia-nvtx-cu12==12.1.105 omegaconf==2.3.0 open3d==0.18.0 opencv-python==4.9.0.80 packaging==24.0 pandas==2.2.1 parso==0.8.3 pexpect==4.9.0 pillow==10.2.0 platformdirs==4.2.0 plotly==5.20.0 plyfile==1.0.3 portalocker==2.8.2 prompt-toolkit==3.0.43 protobuf==4.25.3 psutil==5.9.8 ptyprocess==0.7.0 pure-eval==0.2.2 Pygments==2.17.2 pyparsing==3.1.2 pyquaternion==0.9.9 python-dateutil==2.9.0.post0 pytz==2024.1 PyYAML==6.0.1 referencing==0.34.0 regex==2023.12.25 requests==2.31.0 retrying==1.3.4 rpds-py==0.18.0 safetensors==0.4.2 scikit-learn==1.4.1.post1 scipy==1.12.0 sentry-sdk==1.42.0 setproctitle==1.3.3 six==1.16.0 smmap==5.0.1 stack-data==0.6.3 sympy==1.12 tabulate==0.9.0 tenacity==8.2.3 termcolor==2.4.0 threadpoolctl==3.4.0 tokenizers==0.15.2 torch==2.2.1 torchvision==0.17.1 tqdm==4.66.2 traitlets==5.14.2 transformers==4.39.1 triton==2.2.0 typing_extensions==4.10.0 
================================================
FILE: requirements.txt
================================================
git+https://github.com/openai/CLIP.git
git+https://github.com/facebookresearch/fvcore
accelerate==0.28.0
addict==2.4.0
antlr4-python3-runtime==4.9.3
appdirs==1.4.4
asttokens==2.4.1
attrs==23.2.0
blinker==1.7.0
certifi==2024.2.2
charset-normalizer==3.3.2
click==8.1.7
comm==0.2.2
ConfigArgParse==1.7
contourpy==1.2.0
cycler==0.12.1
dash==2.16.1
dash-core-components==2.0.0
dash-html-components==2.0.0
dash-table==5.0.0
decorator==5.1.1
docker-pycreds==0.4.0
einops==0.7.0
exceptiongroup==1.2.0
executing==2.0.1
fastjsonschema==2.19.1
filelock==3.13.3
Flask==3.0.2
fonttools==4.50.0
fsspec==2024.3.1
ftfy==6.2.0
fvcore==0.1.5.post20221221
gitdb==4.0.11
GitPython==3.1.42
huggingface-hub==0.22.1
hydra-core==1.3.2
idna==3.6
importlib_metadata==7.1.0
importlib_resources==6.4.0
iopath==0.1.10
ipython==8.18.1
ipywidgets==8.1.2
itsdangerous==2.1.2
jedi==0.19.1
Jinja2==3.1.3
joblib==1.3.2
jsonlines==4.0.0
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
jupyter_core==5.7.2
jupyterlab_widgets==3.0.10
kiwisolver==1.4.5
MarkupSafe==2.1.5
matplotlib==3.8.3
matplotlib-inline==0.1.6
mpmath==1.3.0
nbformat==5.10.3
nest-asyncio==1.6.0
networkx==3.2.1
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==8.9.2.26
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.19.3
nvidia-nvjitlink-cu12==12.4.99
nvidia-nvtx-cu12==12.1.105
omegaconf==2.3.0
open3d==0.18.0
opencv-python==4.9.0.80
packaging==24.0
pandas==2.2.1
parso==0.8.3
pexpect==4.9.0
pillow==10.2.0
platformdirs==4.2.0
plotly==5.20.0
plyfile==1.0.3
portalocker==2.8.2
prompt-toolkit==3.0.43
protobuf==4.25.3
psutil==5.9.8
ptyprocess==0.7.0
pure-eval==0.2.2
Pygments==2.17.2
pyparsing==3.1.2
pyquaternion==0.9.9
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
referencing==0.34.0
regex==2023.12.25
requests==2.31.0
retrying==1.3.4
rpds-py==0.18.0
safetensors==0.4.2
scikit-learn==1.4.1.post1
scipy==1.12.0
sentry-sdk==1.42.0
setproctitle==1.3.3
six==1.16.0
smmap==5.0.1
stack-data==0.6.3
sympy==1.12
tabulate==0.9.0
tenacity==8.2.3
termcolor==2.4.0
threadpoolctl==3.4.0
tokenizers==0.15.2
torch==2.2.1
torchvision==0.17.1
tqdm==4.66.2
traitlets==5.14.2
transformers==4.39.1
triton==2.2.0
typing_extensions==4.10.0
tzdata==2024.1
urllib3==2.2.1
wandb==0.16.4
wcwidth==0.2.13
Werkzeug==3.0.1
widgetsnbextension==4.0.10
yacs==0.1.8
zipp==3.18.1


================================================
FILE: run.py
================================================
from pathlib import Path
from datetime import datetime

import hydra
from omegaconf import OmegaConf, open_dict
import wandb

import common.io_utils as iu
from common.misc import rgetattr
from trainer.build import build_trainer


@hydra.main(version_base=None, config_path="./configs", config_name="default")
def main(cfg):
    if cfg.resume:
        assert Path(cfg.exp_dir).exists(), f"Resuming failed: {cfg.exp_dir} does not exist."
        print(f"Resuming from {cfg.exp_dir}")
        cfg = OmegaConf.load(Path(cfg.exp_dir) / 'config.yaml')
        cfg.resume = True
    else:
        run_id = wandb.util.generate_id()
        with open_dict(cfg):
            cfg.logger.run_id = run_id

    OmegaConf.resolve(cfg)
    naming_keys = [cfg.name]
    for name in cfg.get('naming_keywords', []):
        if name == "time":
            continue
        elif name == "task":
            naming_keys.append(cfg.task)
            if rgetattr(cfg, "data.note", None) is not None:
                naming_keys.append(rgetattr(cfg, "data.note"))
            else:
                datasets = rgetattr(cfg, "data.train")
                dataset_names = "+".join([str(x) for x in datasets])
                naming_keys.append(dataset_names)
        elif name == "dataloader.batchsize":
            naming_keys.append(f"b{rgetattr(cfg, name) * rgetattr(cfg, 'num_gpu')}")
        else:
            if str(rgetattr(cfg, name)) != "":
                naming_keys.append(str(rgetattr(cfg, name)))
    exp_name = "_".join(naming_keys)
    if rgetattr(cfg, "debug.flag", False):
        exp_name = "Debug_test"
    print(exp_name)

    # Record the experiment
    if not cfg.exp_dir:
        cfg.exp_dir = Path(cfg.base_dir) / exp_name / f"{datetime.now().strftime('%Y-%m-%d-%H:%M:%S.%f')}"
    else:
        cfg.exp_dir = Path(cfg.exp_dir)
    iu.make_dir(cfg.exp_dir)
    OmegaConf.save(cfg, cfg.exp_dir / "config.yaml")

    trainer = build_trainer(cfg)
    trainer.run()


if __name__ == "__main__":
    main()
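run.py leans on common.misc.rgetattr for dotted-path config access ("data.note", "dataloader.batchsize"); that helper is defined elsewhere in the repo and not shown in this excerpt. As a rough, assumed sketch of the behavior run.py relies on (recursive getattr with an optional default; the real implementation may differ):

import functools

def rgetattr_sketch(obj, attr, *default):
    """Illustrative stand-in for common.misc.rgetattr: resolves a dotted
    attribute path like "data.note", returning a default (if given) when
    any hop along the path is missing."""
    try:
        return functools.reduce(getattr, attr.split('.'), obj)
    except AttributeError:
        if default:
            return default[0]
        raise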
================================================
FILE: trainer/__init__.py
================================================
from .default_trainer import *
from .openvocab_trainer import *
from .objpretrain_trainer import *
from .debug_trainer import *


================================================
FILE: trainer/build.py
================================================
import copy as cp
import glob
from datetime import timedelta
from pathlib import Path

from omegaconf import OmegaConf
from omegaconf import open_dict
from tqdm import tqdm
import numpy as np
from accelerate import Accelerator, DistributedDataParallelKwargs
from accelerate.logging import get_logger
from accelerate.utils import set_seed, InitProcessGroupKwargs
from fvcore.common.registry import Registry
import torch
import wandb

import common.io_utils as iu
from common.io_utils import make_dir
import common.misc as misc
from data.build import build_dataloader
from evaluator.build import build_eval
from model.build import build_model
from optim.build import build_optim

TRAINER_REGISTRY = Registry("Trainer")


class Tracker():
    def __init__(self, cfg):
        self.reset(cfg)

    def step(self):
        self.epoch += 1

    def reset(self, cfg):
        self.exp_name = f"{cfg.exp_dir.parent.name.replace(f'{cfg.name}', '').lstrip('_')}/{cfg.exp_dir.name}"
        self.epoch = 0
        self.best_result = -np.inf

    def state_dict(self):
        return {k: v for k, v in self.__dict__.items() if not k.startswith('__')}

    def load_state_dict(self, state_dict):
        self.__dict__.update(state_dict)


@TRAINER_REGISTRY.register()
class BaseTrainer():
    def __init__(self, cfg):
        set_seed(cfg.rng_seed)
        self.debug = cfg.debug.get("flag", False)
        self.hard_debug = cfg.debug.get("hard_debug", False)
        self.epochs_per_eval = cfg.solver.get("epochs_per_eval", None)
        self.epochs_per_save = cfg.solver.get("epochs_per_save", None)
        self.global_step = 0

        # Initialize accelerator
        self.exp_tracker = Tracker(cfg)
        wandb_args = {"entity": cfg.logger.entity, "id": cfg.logger.run_id, "resume": cfg.resume}
        if not cfg.logger.get('autoname'):
            wandb_args["name"] = self.exp_tracker.exp_name
        # There is a bug in the logger setting; it needs a fix on the accelerate side.
        self.logger = get_logger(__name__)
        self.mode = cfg.mode

        ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
        init_kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=5400))
        kwargs = ([ddp_kwargs] if cfg.num_gpu > 1 else []) + [init_kwargs]
        gradient_accumulation_steps = cfg.solver.get("gradient_accumulation_steps", 1)
        self.accelerator = Accelerator(
            gradient_accumulation_steps=gradient_accumulation_steps,
            log_with=cfg.logger.name,
            kwargs_handlers=kwargs
        )
        if not self.hard_debug:
            self.accelerator.init_trackers(
                project_name=cfg.name if not self.debug else "Debug",
                config=OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) if not cfg.resume else None,
                init_kwargs={"wandb": wandb_args}
            )
        print(OmegaConf.to_yaml(cfg))

        if cfg.model.name == 'Query3D':
            # Choose whether to load multi-view or voxel features based on model.memories for Query3D.
            # TODO: a better way to do this?
            if 'mv' in cfg.model.memories or 'sem' in cfg.model.memories:
                cfg.data.load_multiview_info = True
            if 'voxel' in cfg.model.memories or 'sem' in cfg.model.memories:
                cfg.data.load_mask3d_voxel = True

        txt_model2tokenizer = {
            'BERTLanguageEncoder': 'bert-base-uncased',
            'CLIPLanguageEncoder': 'openai/clip-vit-large-patch14'
        }
        cfg.data_wrapper.tokenizer = txt_model2tokenizer[cfg.model.txt_encoder.name]

        keys = ["train", "val", "test"] if self.mode == "train" else ["test"]
        self.data_loaders = {key: build_dataloader(cfg, split=key) for key in keys}

        self.model = build_model(cfg)

        if self.mode == "test":
            total_steps = 1
        else:
            total_steps = (len(self.data_loaders["train"]) * cfg.solver.epochs) // gradient_accumulation_steps
        self.loss, self.optimizer, self.scheduler = build_optim(cfg, self.model.get_opt_params(), total_steps=total_steps)

        if misc.rgetattr(cfg, "eval.pass_kwargs", False):
            kwargs = {"dataloaders": self.data_loaders}
        else:
            kwargs = {}
        self.evaluator = build_eval(cfg, self.accelerator, **kwargs)

        # Training details
        self.epochs = cfg.solver.epochs
        self.total_steps = 1 if self.mode == "test" else len(self.data_loaders["train"]) * cfg.solver.epochs
        self.grad_norm = cfg.solver.get("grad_norm")

        # Load pretrained model weights
        if cfg.get('pretrain_ckpt_path'):
            self.pretrain_ckpt_path = Path(cfg.pretrain_ckpt_path)
            self.load_pretrain()

        # Accelerator preparation
        self.model, self.loss, self.optimizer, self.scheduler = self.accelerator.prepare(
            self.model, self.loss, self.optimizer, self.scheduler)
        for name, loader in self.data_loaders.items():
            if isinstance(loader, list):
                loader = self.accelerator.prepare(*loader)
            else:
                loader = self.accelerator.prepare(loader)
            self.data_loaders[name] = loader
        self.accelerator.register_for_checkpointing(self.exp_tracker)

        # Check if resuming from a previous checkpoint is needed
        self.ckpt_path = Path(cfg.ckpt_path) if cfg.get("ckpt_path") else Path(cfg.exp_dir) / "ckpt" / "best.pth"
        if cfg.resume:
            self.resume()

    def forward(self, data_dict):
        return self.model(data_dict)

    def backward(self, loss):
        # Needs to be reimplemented when using different sets of optimizers and schedulers.
        self.optimizer.zero_grad()
        self.accelerator.backward(loss)
        if self.grad_norm is not None and self.accelerator.sync_gradients:
            self.accelerator.clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()
        self.scheduler.step()

    def log(self, results, mode="train"):
        if not self.hard_debug:
            log_dict = {}
            for key, val in results.items():
                if isinstance(val, torch.Tensor):
                    val = val.item()
                log_dict[f"{mode}/{key}"] = val
            if mode == "train":
                lrs = self.scheduler.get_lr()
                for i, lr in enumerate(lrs):
                    log_dict[f"{mode}/lr/group_{i}"] = lr
            self.accelerator.log(log_dict, step=self.global_step)

    def save(self, name):
        make_dir(self.ckpt_path.parent)
        self.save_func(str(self.ckpt_path.parent / name))

    def resume(self):
        if self.ckpt_path.exists():
            print(f"Resuming from {str(self.ckpt_path)}")
            # self.logger.info(f"Resuming from {str(self.ckpt_path)}")
            self.accelerator.load_state(str(self.ckpt_path))
            # self.logger.info(f"Successfully resumed from {self.ckpt_path}")
            print(f"Successfully resumed from {self.ckpt_path}")
        else:
            self.logger.info("training from scratch")

    def load_pretrain(self):
        self.logger.info(f"Loading pretrained weights from {str(self.pretrain_ckpt_path)}")
        model_weight_path_pattern = str(self.pretrain_ckpt_path / "pytorch_model*.bin")
        model_weight_paths = glob.glob(model_weight_path_pattern)
        if len(model_weight_paths) == 0:
            raise FileNotFoundError(f"Cannot find pytorch_model.bin in {str(self.pretrain_ckpt_path)}")
        weights = {}
        for model_weight_path in model_weight_paths:
            weights.update(torch.load(model_weight_path, map_location="cpu"))
        warning = self.model.load_state_dict(weights, strict=False)
        self.logger.info(f"Successfully loaded from {str(self.pretrain_ckpt_path)}: {warning}")

    def save_func(self, path):
        self.accelerator.save_state(path)


def build_trainer(cfg):
    return TRAINER_REGISTRY.get(cfg.trainer)(cfg)
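build_trainer resolves cfg.trainer by name against the fvcore registry, so a new trainer only needs the @TRAINER_REGISTRY.register() decorator to become selectable from the config. A minimal sketch of wiring in a custom trainer this way (the class name and config value here are hypothetical):

from trainer.build import TRAINER_REGISTRY, BaseTrainer, build_trainer

@TRAINER_REGISTRY.register()
class MyTrainer(BaseTrainer):
    """Hypothetical trainer; a registered class becomes reachable via
    cfg.trainer == "MyTrainer"."""
    def run(self):
        print("custom training loop goes here")

# With cfg.trainer set to "MyTrainer", build_trainer(cfg) instantiates it,
# i.e. it is equivalent to TRAINER_REGISTRY.get("MyTrainer")(cfg).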
================================================
FILE: trainer/debug_trainer.py
================================================
import copy

from tqdm import tqdm
import torch

from trainer.build import TRAINER_REGISTRY
from trainer.build import BaseTrainer


@TRAINER_REGISTRY.register()
class DebugTrainer(BaseTrainer):
    def __init__(self, cfg):
        super().__init__(cfg)
        self.best_metric = -1

    def forward(self, data_dict):
        return self.model(data_dict)

    def backward(self, loss):
        self.optimizer.zero_grad()
        self.accelerator.backward(loss)
        if self.grad_norm is not None and self.accelerator.sync_gradients:
            self.accelerator.clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()
        self.scheduler.step()

    def train_step(self, epoch):
        self.model.train()
        loader = self.data_loaders["train"]
        pbar = tqdm(range(len(loader)), disable=(not self.accelerator.is_main_process),
                    desc=f"[Epoch {epoch + 1}/{self.epochs}]")
        for i, data_dict in enumerate(loader):
            with self.accelerator.accumulate(self.model):
                data_dict['cur_step'] = epoch * len(loader) + i
                data_dict['total_steps'] = self.total_steps
                # forward
                pbar.update(1)

    @torch.no_grad()
    def eval_step(self, epoch):
        self.model.eval()
        loader = self.data_loaders["val"]
        pbar = tqdm(range(len(loader)), disable=(not self.accelerator.is_main_process))
        for i, data_dict in enumerate(loader):
            pbar.update(1)
        return

    @torch.no_grad()
    def test_step(self):
        self.model.eval()
        loader = self.data_loaders["test"]
        pbar = tqdm(range(len(loader)), disable=(not self.accelerator.is_main_process))
        for i, data_dict in enumerate(loader):
            pbar.update(1)
        return

    def run(self):
        if self.mode == "train":
            start_epoch = self.exp_tracker.epoch
            self.global_step = start_epoch * len(self.data_loaders["train"])
            for epoch in range(start_epoch, self.epochs):
                self.exp_tracker.step()
                self.train_step(epoch)
                if self.epochs_per_eval and (epoch + 1) % self.epochs_per_eval == 0:
                    self.eval_step(epoch)
                break  # debug trainer: stop after a single epoch
        self.test_step()
        if self.mode == "train":
            self.accelerator.end_training()
================================================
FILE: trainer/default_trainer.py
================================================
import copy

from tqdm import tqdm
import torch

from trainer.build import TRAINER_REGISTRY
from trainer.build import BaseTrainer


@TRAINER_REGISTRY.register()
class DefaultTrainer(BaseTrainer):
    def __init__(self, cfg):
        super().__init__(cfg)
        self.best_metric = -1

    def forward(self, data_dict):
        return self.model(data_dict)

    def backward(self, loss):
        self.optimizer.zero_grad()
        self.accelerator.backward(loss)
        if self.grad_norm is not None and self.accelerator.sync_gradients:
            self.accelerator.clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()
        self.scheduler.step()

    def train_step(self, epoch):
        self.model.train()
        loader = self.data_loaders["train"]
        pbar = tqdm(range(len(loader)), disable=(not self.accelerator.is_main_process),
                    desc=f"[Epoch {epoch + 1}/{self.epochs}]")
        for i, data_dict in enumerate(loader):
            with self.accelerator.accumulate(self.model):
                data_dict['cur_step'] = epoch * len(loader) + i
                data_dict['total_steps'] = self.total_steps
                # forward
                data_dict = self.forward(data_dict)
                # calculate loss
                loss, losses = self.loss(data_dict)
                # calculate evaluator
                metrics = self.evaluator.batch_metrics(data_dict)
                # optimize
                self.backward(loss)
                # record
                self.global_step += 1
                log_dict = {'step': self.global_step}
                log_dict.update(losses)
                log_dict.update(metrics)
                self.log(log_dict, mode="train")
                pbar.update(1)

    @torch.no_grad()
    def eval_step(self, epoch):
        self.model.eval()
        loader = self.data_loaders["val"]
        pbar = tqdm(range(len(loader)), disable=(not self.accelerator.is_main_process))
        for i, data_dict in enumerate(loader):
            data_dict = self.forward(data_dict)
            self.evaluator.update(data_dict)
            pbar.update(1)
        is_best, results = self.evaluator.record()
        if is_best:
            self.best_metric = results["target_metric"]
        self.log(results, mode="val")
        self.evaluator.reset()
        return is_best

    @torch.no_grad()
    def test_step(self):
        self.model.eval()
        loader = self.data_loaders["test"]
        pbar = tqdm(range(len(loader)), disable=(not self.accelerator.is_main_process))
        for i, data_dict in enumerate(loader):
            data_dict = self.forward(data_dict)
            self.evaluator.update(data_dict)
            pbar.update(1)
        is_best, results = self.evaluator.record()
        self.log(results, mode="test")
        self.evaluator.reset()
        return results

    def run(self):
        if self.mode == "train":
            start_epoch = self.exp_tracker.epoch
            self.global_step = start_epoch * len(self.data_loaders["train"])
            for epoch in range(start_epoch, self.epochs):
                self.exp_tracker.step()
                self.train_step(epoch)
                if self.epochs_per_eval and (epoch + 1) % self.epochs_per_eval == 0:
                    is_best = self.eval_step(epoch)
                    self.accelerator.print(f"[Epoch {epoch + 1}/{self.epochs}] finished eval, is_best: {is_best}")
                else:
                    is_best = False
                self.accelerator.wait_for_everyone()
                if self.accelerator.is_main_process:
                    self.save("latest.pth")
                    if is_best:
                        self.save("best.pth")
                    if self.epochs_per_save and (epoch + 1) % self.epochs_per_save == 0:
                        self.save(f"ckpt_{epoch+1}.pth")
        self.test_step()
        if self.mode == "train":
            self.accelerator.end_training()
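DefaultTrainer assumes self.loss(data_dict) returns both a scalar loss to backpropagate and a dict of named loss components for logging; the real implementation lives in optim/loss/ and is not shown here. A minimal sketch of a callable matching that contract (the key convention is illustrative):

import torch
import torch.nn as nn

class TotalLossSketch(nn.Module):
    """Illustrative loss wrapper matching `loss, losses = self.loss(data_dict)`.
    The repo's actual loss modules in optim/loss/ may differ."""
    def forward(self, data_dict):
        # Assume the model wrote per-task loss tensors into data_dict upstream,
        # e.g. 'grounding_loss', 'mlm_loss' (hypothetical key names).
        losses = {k: v for k, v in data_dict.items() if k.endswith('_loss')}
        total = torch.stack(list(losses.values())).sum()
        losses['total_loss'] = total
        return total, losses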
================================================
FILE: trainer/objpretrain_trainer.py
================================================
import copy

from tqdm import tqdm
import torch

from trainer.build import TRAINER_REGISTRY
from trainer.build import BaseTrainer


@TRAINER_REGISTRY.register()
class ObjPretrainTrainer(BaseTrainer):
    def __init__(self, cfg):
        super().__init__(cfg)
        self.best_metric = -1

    def forward(self, data_dict):
        return self.model(data_dict)

    def backward(self, loss):
        self.optimizer.zero_grad()
        self.accelerator.backward(loss)
        if self.grad_norm is not None and self.accelerator.sync_gradients:
            self.accelerator.clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()
        self.scheduler.step()

    def train_step(self, epoch):
        self.model.train()
        loader = self.data_loaders["train"]
        pbar = tqdm(range(len(loader)), disable=(not self.accelerator.is_main_process),
                    desc=f"[Epoch {epoch + 1}/{self.epochs}]")
        for i, data_dict in enumerate(loader):
            with self.accelerator.accumulate(self.model):
                # forward
                data_dict = self.forward(data_dict)
                # calculate loss
                loss, losses = self.loss(data_dict)
                # calculate evaluator
                metrics = self.evaluator.batch_metrics(data_dict)
                # optimize
                self.backward(loss)
                # record
                self.global_step += 1
                log_dict = {'step': self.global_step}
                log_dict.update(losses)
                log_dict.update(metrics)
                self.log(log_dict, mode="train")
                pbar.update(1)

    @torch.no_grad()
    def eval_step(self, epoch):
        self.model.eval()
        loader = self.data_loaders["val"]
        pbar = tqdm(range(len(loader)), disable=(not self.accelerator.is_main_process))
        for i, data_dict in enumerate(loader):
            data_dict = self.forward(data_dict)
            # data_dict = {
            #     k: v.contiguous() for k, v in data_dict.items() if isinstance(v, torch.Tensor)
            #     and k not in ['voxel_features', 'v2p_map', 'voxel_coords']
            # }
            # data_dict = self.accelerator.gather_for_metrics(data_dict)
            self.evaluator.update(data_dict)
            pbar.update(1)
        is_best, results = self.evaluator.record()
        if is_best:
            self.best_metric = results["target_metric"]
        self.log(results, mode="val")
        self.evaluator.reset()
        return is_best

    @torch.no_grad()
    def test_step(self):
        self.model.eval()
        loader = self.data_loaders["test"]
        pbar = tqdm(range(len(loader)), disable=(not self.accelerator.is_main_process))
        for i, data_dict in enumerate(loader):
            data_dict = self.forward(data_dict)
            # data_dict = {
            #     k: v.contiguous() for k, v in data_dict.items() if isinstance(v, torch.Tensor)
            #     and k not in ['voxel_features', 'v2p_map', 'voxel_coords']
            # }
            self.evaluator.update(data_dict)
            pbar.update(1)
        is_best, results = self.evaluator.record()
        self.log(results, mode="test")
        self.evaluator.reset()
        return results

    def run(self):
        if self.mode == "train":
            start_epoch = self.exp_tracker.epoch
            self.global_step = start_epoch * len(self.data_loaders["train"])
            for epoch in range(start_epoch, self.epochs):
                self.exp_tracker.step()
                self.train_step(epoch)
                if self.epochs_per_eval and (epoch + 1) % self.epochs_per_eval == 0:
                    is_best = self.eval_step(epoch)
                    self.accelerator.print(f"[Epoch {epoch + 1}/{self.epochs}] finished eval, is_best: {is_best}")
                else:
                    is_best = False
                self.accelerator.wait_for_everyone()
                if self.accelerator.is_main_process:
                    if is_best:
                        self.save("best.pth")
                    if self.epochs_per_save and (epoch + 1) % self.epochs_per_save == 0:
                        self.save(f"ckpt_{epoch+1}.pth")
        self.test_step()
        if self.mode == "train":
            self.accelerator.end_training()
================================================
FILE: trainer/openvocab_trainer.py
================================================
import copy

from tqdm import tqdm
import torch

from trainer.build import TRAINER_REGISTRY
from trainer.build import BaseTrainer


@TRAINER_REGISTRY.register()
class OpenVocabTrainer(BaseTrainer):
    def __init__(self, cfg):
        super().__init__(cfg)
        self.best_metric = -1

    def forward(self, data_dict):
        return self.model(data_dict)

    def backward(self, loss):
        self.optimizer.zero_grad()
        self.accelerator.backward(loss)
        if self.grad_norm is not None and self.accelerator.sync_gradients:
            self.accelerator.clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()
        self.scheduler.step()

    def train_step(self, epoch):
        self.model.train()
        loader = self.data_loaders["train"]
        pbar = tqdm(range(len(loader)), disable=(not self.accelerator.is_main_process),
                    desc=f"[Epoch {epoch + 1}/{self.epochs}]")
        for i, data_dict in enumerate(loader):
            with self.accelerator.accumulate(self.model):
                # forward
                data_dict = self.forward(data_dict)
                # calculate loss
                loss, losses = self.loss(data_dict)
                # calculate evaluator
                metrics = self.evaluator["train"].batch_metrics(data_dict)
                # optimize
                self.backward(loss)
                # record
                self.global_step += 1
                log_dict = {'step': self.global_step}
                log_dict.update(losses)
                log_dict.update(metrics)
                self.log(log_dict, mode="train")
                pbar.update(1)

    @torch.no_grad()
    def eval_step(self, epoch):
        self.model.eval()
        loader = self.data_loaders["val"]
        pbar = tqdm(range(len(loader)), disable=(not self.accelerator.is_main_process))
        for i, data_dict in enumerate(loader):
            data_dict = self.forward(data_dict)
            self.evaluator["val"].update(data_dict)
            pbar.update(1)
        is_best, results = self.evaluator["val"].record()
        if is_best:
            self.best_metric = results["target_metric"]
        self.log(results, mode="val")
        self.evaluator["val"].reset()
        return is_best

    @torch.no_grad()
    def test_step(self):
        self.model.eval()
        loader = self.data_loaders["test"]
        pbar = tqdm(range(len(loader)), disable=(not self.accelerator.is_main_process))
        for i, data_dict in enumerate(loader):
            data_dict = self.forward(data_dict)
            # data_dict = {
            #     k: v.contiguous() for k, v in data_dict.items() if isinstance(v, torch.Tensor)
            # }
            # data_dict = self.accelerator.gather_for_metrics(data_dict)
            self.evaluator["val"].update(data_dict)
            pbar.update(1)
        is_best, results = self.evaluator["val"].record()
        self.log(results, mode="test")
        self.evaluator["val"].reset()
        return results

    def run(self):
        if self.mode == "train":
            start_epoch = self.exp_tracker.epoch
            self.global_step = start_epoch * len(self.data_loaders["train"])
            for epoch in range(start_epoch, self.epochs):
                self.exp_tracker.step()
                self.train_step(epoch)
                # with torch.profiler.profile(record_shapes=True) as prof_train:
                #     with torch.profiler.record_function("model_inference"):
                #         self.train_step(epoch)
                # print(prof_train.key_averages().table(sort_by="cpu_time_total", row_limit=20))
                if self.epochs_per_eval and (epoch + 1) % self.epochs_per_eval == 0:
                    is_best = self.eval_step(epoch)
                    # with torch.profiler.profile(record_shapes=True) as prof:
                    #     with torch.profiler.record_function("model_inference"):
                    #         is_best = self.eval_step(epoch)
                    # print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
                    self.accelerator.print(f"[Epoch {epoch + 1}/{self.epochs}] finished eval, is_best: {is_best}")
                else:
                    is_best = False
                self.accelerator.wait_for_everyone()
                if self.accelerator.is_main_process:
                    if is_best:
                        self.save("best.pth")
                    if self.epochs_per_save and (epoch + 1) % self.epochs_per_save == 0:
                        self.save(f"ckpt_{epoch+1}.pth")
        self.test_step()
        if self.mode == "train":
            self.accelerator.end_training()
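Unlike DefaultTrainer, OpenVocabTrainer indexes self.evaluator by split ("train"/"val"), so build_eval presumably returns a dict of evaluators in this configuration. All trainers rely on the same small evaluator interface; a minimal sketch of that interface (method names taken from the calls above, internals purely illustrative):

class EvaluatorSketch:
    """Illustrative evaluator exposing the methods the trainers call:
    batch_metrics, update, record, reset. The real evaluators live in
    evaluator/ and compute task-specific metrics."""
    def __init__(self):
        self.reset()

    def reset(self):
        self.correct, self.total, self.best = 0, 0, float('-inf')

    def batch_metrics(self, data_dict):
        # Per-batch numbers merged into the training log (illustrative accuracy).
        return {'acc': self.correct / max(self.total, 1)}

    def update(self, data_dict):
        # Accumulate predictions across an eval loader (placeholder count).
        self.total += 1

    def record(self):
        # Summarize the accumulated state; the trainers expect (is_best, results)
        # with a 'target_metric' entry in results.
        result = self.correct / max(self.total, 1)
        is_best = result > self.best
        self.best = max(self.best, result)
        return is_best, {'target_metric': result}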
================================================
FILE: visualize_data.py
================================================
import argparse
import random
import json
from pathlib import Path

import numpy as np
import torch
import open3d as o3d


def convert_pc_to_box(obj_pc):
    # Axis-aligned bounding box of an object point cloud: (center, size).
    xmin = np.min(obj_pc[:, 0])
    ymin = np.min(obj_pc[:, 1])
    zmin = np.min(obj_pc[:, 2])
    xmax = np.max(obj_pc[:, 0])
    ymax = np.max(obj_pc[:, 1])
    zmax = np.max(obj_pc[:, 2])
    center = [(xmin + xmax) / 2, (ymin + ymax) / 2, (zmin + zmax) / 2]
    box_size = [xmax - xmin, ymax - ymin, zmax - zmin]
    return center, box_size


def load_scan(pcd_path, inst2label_path, scene_name):
    pcd_data = torch.load(pcd_path / f'{scene_name}.pth')
    inst_to_label = torch.load(inst2label_path / f"{scene_name}.pth")
    points, colors, instance_labels = pcd_data[0], pcd_data[1], pcd_data[-1]
    pcds = np.concatenate([points, colors], 1)
    return points, colors, pcds, instance_labels, inst_to_label


def visualize_one_scene(obj_pcds, points, colors, caption):
    # visualize scene
    o3d_pcd = o3d.geometry.PointCloud()
    o3d_pcd.points = o3d.utility.Vector3dVector(points)
    o3d_pcd.colors = o3d.utility.Vector3dVector(colors / 255.0)
    # visualize gt boxes (only the first four objects, one window each)
    for idx, (obj, obj_label) in enumerate(obj_pcds):
        if idx > 3:
            break
        gt_center, gt_size = convert_pc_to_box(obj)
        gt_o3d_box = o3d.geometry.OrientedBoundingBox(gt_center, np.eye(3, 3), gt_size)
        gt_o3d_box.color = [0, 1, 0]
        mesh_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.6, origin=[0, 0, 0])
        o3d.visualization.draw_geometries([o3d_pcd, gt_o3d_box, mesh_frame],
                                          window_name=obj_label + '_' + caption)


def visualize_data(save_root, scene_name, vis_obj=True):
    inst2label_path = save_root / 'instance_id_to_label'
    pcd_path = save_root / 'pcd_with_global_alignment'
    points, colors, pcds, instance_labels, inst_to_label = load_scan(pcd_path, inst2label_path, scene_name)
    if not vis_obj:
        o3d_pcd = o3d.geometry.PointCloud()
        mesh_frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.6, origin=[0, 0, 0])
        o3d_pcd.points = o3d.utility.Vector3dVector(points)
        o3d_pcd.colors = o3d.utility.Vector3dVector(colors / 255.0)
        o3d.visualization.draw_geometries([mesh_frame, o3d_pcd])
        return
    obj_pcds = []
    for i in inst_to_label.keys():
        mask = instance_labels == i  # building the per-instance mask is time-consuming
        if np.sum(mask) == 0:
            continue
        obj_pcds.append((pcds[mask], inst_to_label[i]))
    visualize_one_scene(obj_pcds, points, colors, scene_name)


def visualize_refer(save_root, anno_file):
    inst2label_path = save_root / 'instance_id_to_label'
    pcd_path = save_root / 'pcd_with_global_alignment'
    with open(anno_file, 'r', encoding='utf8') as f:
        json_data = json.load(f)
    for item in json_data:
        scan_id = item['scan_id']
        inst_to_label = torch.load(inst2label_path / f"{scan_id}.pth")
        pcd_data = torch.load(pcd_path / f'{scan_id}.pth')
        points, colors, instance_labels = pcd_data[0], pcd_data[1], pcd_data[-1]
        pcds = np.concatenate([points, colors], 1)
        target_id = int(item['target_id'])
        mask = instance_labels == target_id
        if np.sum(mask) == 0:
            continue
        obj_pcds = [(pcds[mask], inst_to_label[target_id])]
        visualize_one_scene(obj_pcds, points, colors, scan_id + '-' + item['utterance'])


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--root", required=True, type=str, help="path of dataset dir")
    parser.add_argument("-d", "--dataset", type=str,
                        help="available datasets: ['ARKitScenes', 'HM3D', 'MultiScan', 'ProcThor', "
                             "'Structured3D', 'ScanNet', '3RScan']")
    parser.add_argument("--vis_refer", action="store_true", help="visualize reference data")
    parser.add_argument("-a", "--anno", type=str, default="ssg_ref_rel2_template.json",
                        help="the annotation file for reference")
    args = parser.parse_args()

    dataset = args.dataset
    assert dataset in ['ARKitScenes', 'HM3D', 'MultiScan', 'ProcThor', 'Structured3D', 'ScanNet', '3RScan']
    print(dataset)
    data_root = Path(args.root) / dataset
    if args.vis_refer:
        if dataset == 'ScanNet':
            anno_file = data_root / 'annotations/refer' / args.anno
        else:
            anno_file = data_root / 'annotations' / args.anno
        visualize_refer(data_root / 'scan_data', anno_file)
    else:
        all_scans = (data_root / 'scan_data' / 'pcd_with_global_alignment').glob('*.pth')
        scene_id = Path(random.choice(list(all_scans))).stem
        visualize_data(data_root / 'scan_data', scene_id)
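As a quick sanity check of the box conversion above, a self-contained sketch (the random points are an illustrative stand-in for a real object point cloud):

import numpy as np

# Illustrative object point cloud: 100 points filling a 1 x 2 x 0.5 slab.
rng = np.random.default_rng(0)
obj_pc = rng.uniform([0, 0, 0], [1, 2, 0.5], size=(100, 3))

center, box_size = convert_pc_to_box(obj_pc)
print(center)    # roughly [0.5, 1.0, 0.25]
print(box_size)  # roughly [1.0, 2.0, 0.5]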