Full Code of scene-verse/SceneVerse for AI

main 140d172620c3 cached

161 files

848.7 KB

244.7k tokens

606 symbols

1 requests

Download .txt

Showing preview only (898K chars total). Download the full file or copy to clipboard to get everything.

Repository: scene-verse/SceneVerse
Branch: main
Commit: 140d172620c3
Files: 161
Total size: 848.7 KB

Directory structure:
gitextract_y3zcu4ix/

├── .gitignore
├── DATA.md
├── LICENSE
├── README.md
├── TRAIN.md
├── common/
│   ├── box_utils.py
│   ├── dist_utils.py
│   ├── io_utils.py
│   ├── launch_utils.py
│   ├── misc.py
│   └── type_utils.py
├── configs/
│   └── final/
│       ├── all_anno.yaml
│       ├── all_nomlm.yaml
│       ├── all_noobj.yaml
│       ├── all_noscene.yaml
│       ├── all_pretrain.yaml
│       ├── all_pretrain_125.yaml
│       ├── all_pretrain_25.yaml
│       ├── all_pretrain_50.yaml
│       ├── all_pretrain_75.yaml
│       ├── all_pretrain_objcap.yaml
│       ├── all_pretrain_objcap_notemplate.yaml
│       ├── all_pretrain_s3d.yaml
│       ├── all_pretrain_unfreeze.yaml
│       ├── all_rewrite.yaml
│       ├── all_template.yaml
│       ├── all_wo_both.yaml
│       ├── all_wo_both_125.yaml
│       ├── all_wo_both_25.yaml
│       ├── all_wo_both_50.yaml
│       ├── all_wo_multiscan.yaml
│       ├── all_wo_scannet.yaml
│       ├── debug.yaml
│       ├── finetune/
│       │   ├── multiscan_finetune.yaml
│       │   ├── multiscan_woL.yaml
│       │   ├── nr3d_finetune.yaml
│       │   ├── scannet_woL.yaml
│       │   ├── scanqa_finetune.yaml
│       │   ├── scanrefer_finetune.yaml
│       │   ├── sqa3d_finetune.yaml
│       │   └── sr3d_finetune.yaml
│       ├── multiscan_only.yaml
│       ├── nr3d_only.yaml
│       ├── procthor_only.yaml
│       ├── s3d_only.yaml
│       ├── scanrefer_only.yaml
│       ├── scanrefer_only_gttest.yaml
│       └── sr3d_only.yaml
├── data/
│   ├── __init__.py
│   ├── build.py
│   ├── data_utils.py
│   └── datasets/
│       ├── __init__.py
│       ├── arkitscene.py
│       ├── base.py
│       ├── constant.py
│       ├── data_augmentor.py
│       ├── dataset_wrapper.py
│       ├── hm.py
│       ├── multiscan.py
│       ├── procthor.py
│       ├── rscan.py
│       ├── scannet.py
│       ├── scannet_base.py
│       ├── scannet_old.py
│       └── structure3d.py
├── evaluator/
│   ├── __init__.py
│   ├── build.py
│   ├── objcls_eval.py
│   ├── pretrain_eval.py
│   ├── referit3d_eval.py
│   ├── scanqa_eval.py
│   ├── scanrefer_eval.py
│   └── sqa3d_eval.py
├── launch.py
├── model/
│   ├── __init__.py
│   ├── build.py
│   ├── objcls.py
│   └── openvocab.py
├── modules/
│   ├── __init__.py
│   ├── build.py
│   ├── grounding/
│   │   ├── __init__.py
│   │   └── unified_encoder.py
│   ├── heads/
│   │   ├── __init__.py
│   │   ├── grounding_head.py
│   │   ├── pretrain_head.py
│   │   └── qa_head.py
│   ├── language/
│   │   ├── __init__.py
│   │   ├── bert.py
│   │   └── clip.py
│   ├── layers/
│   │   ├── pointnet.py
│   │   └── transformers.py
│   ├── third_party/
│   │   ├── __init__.py
│   │   └── pointnet2/
│   │       ├── _ext_src/
│   │       │   ├── include/
│   │       │   │   ├── ball_query.h
│   │       │   │   ├── cuda_utils.h
│   │       │   │   ├── group_points.h
│   │       │   │   ├── interpolate.h
│   │       │   │   ├── sampling.h
│   │       │   │   └── utils.h
│   │       │   └── src/
│   │       │       ├── ball_query.cpp
│   │       │       ├── ball_query_gpu.cu
│   │       │       ├── bindings.cpp
│   │       │       ├── group_points.cpp
│   │       │       ├── group_points_gpu.cu
│   │       │       ├── interpolate.cpp
│   │       │       ├── interpolate_gpu.cu
│   │       │       ├── sampling.cpp
│   │       │       └── sampling_gpu.cu
│   │       ├── _version.py
│   │       ├── pointnet2_modules.py
│   │       ├── pointnet2_test.py
│   │       ├── pointnet2_utils.py
│   │       ├── pytorch_utils.py
│   │       ├── requirements_new.txt
│   │       └── setup.py
│   ├── utils.py
│   ├── vision/
│   │   ├── __init__.py
│   │   ├── obj_cls_encoder.py
│   │   └── pcd_openvocab_encoder.py
│   └── weights.py
├── optim/
│   ├── __init__.py
│   ├── build.py
│   ├── loss/
│   │   ├── __init__.py
│   │   ├── contra_loss.py
│   │   └── loss.py
│   ├── optimizer/
│   │   ├── __init__.py
│   │   └── optim.py
│   ├── scheduler.py
│   └── utils.py
├── preprocess/
│   ├── README.md
│   ├── __init__.py
│   ├── arkitscenes.py
│   ├── build.py
│   ├── multiscan.py
│   ├── rscan.py
│   ├── sceneverse2hmsemantic.py
│   ├── ssg/
│   │   ├── README.md
│   │   ├── relationships/
│   │   │   ├── camera.py
│   │   │   ├── hanging.py
│   │   │   ├── init.py
│   │   │   ├── multi_objs.py
│   │   │   ├── proximity.py
│   │   │   └── support.py
│   │   ├── ssg_data/
│   │   │   ├── dictionary.py
│   │   │   ├── script/
│   │   │   │   └── ObjNode.py
│   │   │   └── ssg_visualize.py
│   │   ├── ssg_main.py
│   │   └── ssg_utils.py
│   ├── structured3d.py
│   └── utils/
│       ├── __init__.py
│       ├── align_utils.py
│       ├── constant.py
│       └── label_convert.py
├── requirements.txt
├── run.py
├── trainer/
│   ├── __init__.py
│   ├── build.py
│   ├── debug_trainer.py
│   ├── default_trainer.py
│   ├── objpretrain_trainer.py
│   └── openvocab_trainer.py
└── visualize_data.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/


================================================
FILE: DATA.md
================================================

## Data

* Note: As some of our users requested the mapping between HM3D object id in SceneVerse to HM3D-semantics, we have added an additional file ([HM3D_tgtID2objID.zip](assets/HM3D_tgtID2objID.zip)) to obtain this mapping. The json file for each scene contains a dictionary of ```{<sceneverse_objid>:[hm3d_objid, hm3d_label]}```.

### Data Processing

We release a data preprocessing exemplar for 3RScan, MultiScan, ARKitScenes and Structured3D, with more details [here](preprocess/README.md).

We also release the [scripts](preprocess/ssg/README.md) for scene graph generation.

### Data Download
We currently host our data on G-drive and request all applicants to fill out the form from [here](https://forms.gle/AXMk7MH6bFXpCqd99).

You should see one or multiple zip file segments for each dataset we provided. For datasets with multiple segments (e.g., ARKitScenes), you can unzip the files with:

```shell
# Directories with multiple zip segments
$ ls ARKitScenes/
  -> ARKitScenes.zip  ARKitScenes.z01

# Unzip from all zip segments
$ cd ARKitScenes/
$ zip -F ARKitScenes.zip --out combined.zip
$ unzip combined.zip
```

After unzipping, the files are organized as:
```shell
ARKitScenes/
|-- scan_data                   # Point cloud data
  |-- instance_id_to_label      # Reorganized instance id to label mapping
  |-- pcd_with_global_alignment # Aligned scene point clouds
|-- annotations                 # Language annotations
  |-- splits
    |-- train_split.txt         # For all datasets, we provide training split
    |-- val_split.txt           # For datasets with evaluation sets
  |-- <language_type>.json      # For datasets except for ScanNet, language for ScanNet is located at annotations/refer
```

### Data Visualization
For data browsing, we experimented with NVIDIA CUDA 11.8 on Ubuntu 22.04 and require the following steps:
```shell
$ conda create -n sceneverse python=3.9
$ pip install torch==2.2.0 torchvision==0.17.0 --index-url https://download.pytorch.org/whl/cu118
$ pip install numpy open3d
```

We provide a short script for visualizing scene and language data, you can use it with:
```shell
# Visualize scene and instance data
$ python visualize_data.py --root <PATH_TO_DOWNLOAD> --dataset <DATASET>
# Visualize language data
$ python visualize_data.py --root <PATH_TO_DOWNLOAD> --dataset <DATASET> --vis_refer
```

As our data contains scenes from existing datasets, please read carefully about the term of use for each dataset we provided in the form.

### Provided Language Types

We list the available data in the current version of SceneVerse in the table below:

|   Dataset    | Object Caption | Scene Caption | Ref-Annotation   | Ref-Pairwise<br>```rel2``` | Ref-MultiObject<br>```relm``` | Ref-Star<br>```star``` | Ref-Chain (Optional)<br>```chain``` |
|:------------:|:--------------:|:-------------:|------------------|-------------------------|-------------------------------|-----------------------|------------------------------------|
|   ScanNet    |       ✅        |       ✅       | ScanRefer<br>Nr3D | ✅              | ✅                             | ✅           | ✅       |
|  MultiScan   |       ✅        |       ✅       | ✅ | ✅              | ✅                             | ✅           | ✅       |
| ARKitScenes  |       ✅        |       ✅       | ✅ | ✅              | ✅                             | ✅           | ✅       |
|     HM3D     |  ```template```   |       ✅       | ✅ | ✅              | ✅                             | ✅           | ✅       |
|    3RScan    |       ✅        |       ✅       | ❌ | ✅              | ✅                             | ✅           | ✅       |
| Structured3D | ```template``` |       ✅       | ❌ | ✅              | ✅                             | ✅           |    ❌     |
|   ProcTHOR   | ```template``` |    ❌     | ❌ | ```template```              | ```template```                   | ```template```            |    ❌     |

For the generated object referrals, we provide both the direct template-based generations ```template``` and the LLM-refined versions ```gpt```.
Please refer to our supplementary for the description of selected ```pair-wise``` / ```multi-object``` / ```star``` types. We also
provide the ```chain``` type which contains language using obejct A to refer B and then B to refer the target object C. As we found 
the ```chain``` type could sometimes lead to unnatural descriptions, we did not discuss it in the main paper. Feel free to inspect
and use it in your projects.

For the remaining data, we hope to further refine and update our data in the following weeks, stay tuned!


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2024 scene-verse

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
<h2 align="center">
  <span><img src="assets/logo025.png" width="4%" style="transform: translate(0,9px)"></span>
  <b>SceneVerse: Scaling 3D Vision-Language Learning for Grounded Scene Understanding</b>
</h2>

<div align="center" margin-bottom="6em">
<a target="_blank" href="https://buzz-beater.github.io/">Baoxiong Jia<sup>✶</sup></a>,
<a target="_blank" href="https://yixchen.github.io/">Yixin Chen<sup>✶</sup></a>,
<a target="_blank" href="https://scholar.google.com/citations?user=fKRgnIMAAAAJ/">Huangyue Yu</a>,
<a target="_blank" href="https://github.com/jetpackfirstme">Yan Wang</a>,
<a target="_blank" href="https://nxsedson.github.io/">Xuesong Niu</a>,
<a target="_blank" href="https://tengyu.ai/">Tengyu Liu</a>,
<a target="_blank" href="https://liqing-ustc.github.io/">Qing Li</a>,
<a target="_blank" href="https://siyuanhuang.com/">Siyuan Huang</a>

</div>
&nbsp;

<div align="center">
    <a href="https://arxiv.org/abs/2401.09340" target="_blank">
    <img src="https://img.shields.io/badge/Paper-arXiv-deepgreen" alt="Paper arXiv"></a>
    <a href="https://scene-verse.github.io" target="_blank">
    <img src="https://img.shields.io/badge/Project-Page-9cf" alt="Project Page"></a>
    <a href="https://youtu.be/UnujS0EVxKU" target="_blank">
    <img src="https://img.shields.io/badge/Video-YouTube-9966ff" alt="Video"></a>
    <a href="https://scene-verse.github.io" target="_blank">
    <img src="https://img.shields.io/badge/Data-SceneVerse-blue" alt="Data"></a>
    <a href="https://scene-verse.github.io" target="_blank">
    <img src="https://img.shields.io/badge/Model-GPS-darkorange" alt="Model"></a>
</div>
&nbsp;

<div align="left">
<img src="assets/overview.png" width="99%" alt="SceneVerse Teaser">
</div>

We propose SceneVerse, the first million-scale 3D vision-language dataset with 68K 3D indoor scenes and 2.5M vision-language pairs.  We demonstrate the scaling effect by (i) achieving state-of-the-art on all existing 3D visual grounding benchmarks and (ii) showcasing zero-shot transfer capabilities with our GPS (Grounded Pre-training for Scenes) model.

## News
- ![](https://img.shields.io/badge/New!-8A2BE2) [2024-12] Our follow-up work on situated question answering on SceneVerse is out, check it out [here](https://msr3d.github.io/)!
- [2024-10] Pre-trained checkpoints are now available, find detailed instructions in [TRAIN.md](TRAIN.md)!
- [2024-09] The scripts for scene graph generation are released.
- [2024-07] Training & Inference code as well as preprocessing code is released and checkpoints & logs are on the way!
- [2024-07] Preprocessing codes for scenes used in SceneVerse are released.
- [2024-07] SceneVerse is accepted by ECCV 2024! Training and inference codes/checkpoints will come shortly, stay tuned!
- [2024-03] We release the data used in SceneVerse. Fill out the [form](https://forms.gle/AXMk7MH6bFXpCqd99) for the download link!
- [2024-01] We release SceneVerse on ArXiv. Checkout our [paper](https://arxiv.org/abs/2401.09340) and [website](https://scene-verse.github.io/).

## Data
See [DATA.md](DATA.md) for detailed instructions on data download, processing, visualization. The data inventory is listed below:

|   Dataset    | Object Caption | Scene Caption | Ref-Annotation   | Ref-Pairwise<br>```rel2``` | Ref-MultiObject<br>```relm``` | Ref-Star<br>```star``` | Ref-Chain (Optional)<br>```chain``` |
|:------------:|:--------------:|:-------------:|------------------|-------------------------|-------------------------------|-----------------------|------------------------------------|
|   ScanNet    |       ✅        |       ✅       | ScanRefer<br>Nr3D | ✅              | ✅                             | ✅           | ✅       |
|  MultiScan   |       ✅        |       ✅       | ✅ | ✅              | ✅                             | ✅           | ✅       |
| ARKitScenes  |       ✅        |       ✅       | ✅ | ✅              | ✅                             | ✅           | ✅       |
|     HM3D     |  ```template```   |       ✅       | ✅ | ✅              | ✅                             | ✅           | ✅       |
|    3RScan    |       ✅        |       ✅       | ❌ | ✅              | ✅                             | ✅           | ✅       |
| Structured3D | ```template``` |       ✅       | ❌ | ✅              | ✅                             | ✅           |    ❌     |
|   ProcTHOR   | ```template``` |    ❌     | ❌ | ```template```              | ```template```                   | ```template```            |    ❌     |


## Training and Inference
See [TRAIN.md](TRAIN.md) for the inventory of available checkpoints and detailed instructions on training and testing 
with pre-trained checkpoints. The checkpoint inventory is listed below:


| Setting              | Description                                                             | Corresponding Experiment                            | Checkpoint based on experiment setting                                                                                                                                                                                                                                                                           |
|----------------------|-------------------------------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| ```pre-trained```    | GPS model pre-trained on SceneVerse                                     | 3D-VL grounding (Tab.2)                             | [Model](https://drive.google.com/drive/folders/1FDjVaYZxHdMJgxB8stSHfI34Q7crItJc?usp=sharing)                                                                                                                                                                                                                                                                                                        |
| ```scratch```        | GPS model trained on datasets from scratch                              | 3D-VL grounding (Tab.2)<br/>SceneVerse-val (Tab. 3) | [ScanRefer](https://drive.google.com/drive/folders/1d7sGm_D7kyj6Fmo0f8b6DPrhWYUCtWVq?usp=sharing), [Sr3D](https://drive.google.com/drive/folders/1bKGgXot8Sc6BB2MWAfW_OGdu0iq0RWZt?usp=sharing), [Nr3D](https://drive.google.com/drive/folders/14K-UaIeg0GHWFoaonIFHTHZZbDotukzV?usp=sharing), [SceneVerse-val](https://drive.google.com/drive/folders/1CeWwLIPEuK0b35I_gbiwu_OiaUEE42jD?usp=drive_link)                                                                                                                                                                                                                                                            |
| ```fine-tuned```     | GPS model fine-tuned on datasets with grounding heads                   | 3D-VL grounding (Tab.2)                             | [ScanRefer](https://drive.google.com/drive/folders/1P5YprjIlBMAl0OQ38jgTDJyuFVIGiCMS?usp=sharing), [Sr3D](https://drive.google.com/drive/folders/1-LMYW6jy5wpqL_KlQQuvuSM7TDyo7M3g?usp=sharing), [Nr3D](https://drive.google.com/drive/folders/1sw-_hhF2__JgGCHE1yfyAQeNZ7jSrID0?usp=sharing)                                                                                                                                                                                                                                                                                |
| ```zero-shot```      | GPS model trained on SceneVerse without data from ScanNet and MultiScan | Zero-shot Transfer (Tab.3)                          | [Model](https://drive.google.com/drive/folders/11824oiZnaU8ChsNpH8zZKIT2i1PdJWSA?usp=sharing)                                                                                                                                                                                                                                                                                                        |
| ```zero-shot text``` | GPS                                                                     | Zero-shot Transfer (Tab.3)                          | [ScanNet](https://drive.google.com/drive/folders/1TKIhb7xgGzwDiAdvznwTpKzkcJnG7GD0?usp=sharing), [SceneVerse-val](https://drive.google.com/drive/folders/18f65Q6313sa-blLCyspqjZRmWpKJPh3M?usp=sharing)                                                                                                                                                                                              |
| ```text-ablation```  | Ablations on the type of language used during pre-training              | Ablation on Text (Tab.7)                            | [Template only](https://drive.google.com/drive/folders/1Xo6FkbThHP3uLUJMblt3zgJiM0n3RbVK?usp=sharing), [Template+LLM](https://drive.google.com/drive/folders/1w9Oi8nWKZXOW3BcA0eiC1bgp7snk8ZKS?usp=sharing)                                                                                                      |
| ```scene-ablation``` | Ablations on the use of synthetic scenes during pre-training            | Ablation on Scene (Tab.8)                           | [Real only](https://drive.google.com/drive/folders/1WZDf2BS7eG36NgGEdTuChICmVHF377is?usp=sharing), [S3D only](https://drive.google.com/drive/folders/1Zh4QfCs6l67ZeltvzOPZtokKkgkvxATc?usp=sharing), [ProcTHOR only](https://drive.google.com/drive/folders/1H9zm7vYxVn_zd2HYi49Js9R34AHnGi1d?usp=sharing)                                                                                                                                                                                                                                                                   |
| ```model-ablation``` | Ablations on the use of losses during pre-training                      | Ablation on Model Design (Tab.9)                    | [Refer only](https://drive.google.com/drive/folders/1yKF8dVPlcbKb-COcfUZbwcqWxt_uvzuc?usp=sharing), [Refer+Obj-lvl](https://drive.google.com/drive/folders/1C5L20UvTQj2my2t0BnqHZPsb_VaXxVjX?usp=sharing), [w/o Scene-lvl](https://drive.google.com/drive/folders/14jR43ils1-jop6K84hu1AqPqU9DcHucx?usp=sharing) |
| ```3d-qa```          | Results for QA fine-tuning on ScanQA and SQA3D                          | 3D-QA Experiments (Tab.5)                           | [ScanQA](https://drive.google.com/drive/folders/1_Qluyeu-gvfyQSRoPNcPg7qss5IxFRwO?usp=sharing), [SQA3D](https://drive.google.com/drive/folders/1DGVqsqP12Y2Un10UAC5u9HLij0NJVzJC?usp=sharing)                                                                                                    |


## BibTex
```bibtex
@inproceedings{jia2024sceneverse,
  title={Sceneverse: Scaling 3d vision-language learning for grounded scene understanding},
  author={Jia, Baoxiong and Chen, Yixin and Yu, Huangyue and Wang, Yan and Niu, Xuesong and Liu, Tengyu and Li, Qing and Huang, Siyuan},
  booktitle={European Conference on Computer Vision (ECCV)},
  year={2024}
}
```

## Acknowledgements
We thank the authors from [ScanRefer](https://github.com/daveredrum/ScanRefer), 
[ScanNet](https://github.com/ScanNet/ScanNet), 
[3RScan](https://github.com/WaldJohannaU/3RScan), [ReferIt3D](https://github.com/referit3d/referit3d), 
[Structured3D](https://github.com/bertjiazheng/Structured3D), 
[HM3D](https://github.com/matterport/habitat-matterport-3dresearch),
[ProcTHOR](https://github.com/allenai/procthor),
[ARKitScenes](https://github.com/apple/ARKitScenes), [MultiScan](https://github.com/smartscenes/multiscan) for
open-sourcing their awesome datasets. We also heavily adapted codes from [ScanQA](https://github.com/ATR-DBI/ScanQA), 
[SQA3D](https://github.com/SilongYong/SQA3D), and 
[3D-VisTA](https://github.com/3d-vista/3D-VisTA) for training and inference.


================================================
FILE: TRAIN.md
================================================
# Training and Inference

## Environment Setup
To install the environment requirements needed for SceneVerse, you can run the installation scripts provided by:
```bash
$ conda env create -n sceneverse python=3.9
$ conda activate sceneverse
$ pip install --r requirements.txt
```
Meanwhile, SceneVerse depends on an efficient implementation of PointNet2  which is located in ```modules```. Remember to install it with
```bash
$ cd modules/third_party/pointnet2
$ python setup.py install
$ cd ../..
```

## Model Configurations
### 1. Experiment Setup
We provide all experiment configurations in ```configs/final```, you can find the experiment setting in the top of comment
each experiment file. To correctly use the configuration files, you need to change the following fields in the configuration
file to load paths correctly: 
- ```base_dir```: save path for model checkpoints, configurations, and logs.
- ```logger.entity```: we used W&B for logging experiments, change it to your corresponding account.
- ```data.{DATASET}_familiy_base```: path to ```{Dataset}``` related data.
- ```model.vision.args.path```: path to the pre-trained object encoder (PointNet++).
- ```model.vision.args.lang_path```: deprecated, but basically text embeddings of the 607 classes in ScanNet.

You can walk through the ```configs/final/all_pretrain.yaml``` and compare it with other files to see how we controlled
data and objectives used in training.

## Experiments
### 1. Training and Inference
This codebase leverages [Huggingface Accelerate](https://huggingface.co/docs/accelerate/index) package and 
[Facebook Submitit](https://github.com/facebookincubator/submitit) package for efficient model training on multi-node clusters.
We provide a launcher file ```launch.py``` which provides three ways of launching experiment:
```bash
# Launching using submitit on a SLURM cluster (e.g. 10 hour 1 node 4 GPU experiment with config file $CONFIG)
$ python launch.py --mode submitit --time 10 --qos $QOS --partition $PARTITION --mem_per_gpu 80 \
                   --gpu_per_node 4 --config $CONFIG note=$NOTE name=$EXP_NAME
                   
# Launching using accelerator with a multi-gpu instance
$ python launch.py --mode accelerate --gpu_per_node 4 --num_nodes 1 -- config $CONFIG note=$NOTE name=$EXP_NAME 
```
Basically, ```launch.py``` set up process(es) to run the main entry point ```run.py``` under multi GPU settings. You can
directly overwrite configurations in the configuration file ```$CONFIG``` by setting property fields using ```=``` after
all command line arguments. (e.g., ```name=$EXP_NAME```,```solver.epochs=400```,```dataloader.batchsize=4```)

For testing and inference, remember to set up the testing data correctly under each configuration files and switch the
```mode``` field in the configurations into ```test``` (i.e., ```mode=test```).

### 2. Debugging
If you want to debug your code without an additional job launcher, you can also directly run the file ```run.py``` . 
As an example, you can directly run the file for debugging with
```bash
# Single card direct run for debugging purposes
$ python run.py --config-path ${PROJ_PATH}/configs/final/ --config-name ${EXP_CONFIG_NAME}.yaml \
                num_gpu=1 hydra.run.dir=. hydra.output_subdir=null hydra/job_logging=disabled hydra/hydra_logging=disabled \
                debug.flag=True debug.debug_size=1 dataloader.batchsize=2 debug.hard_debug=True name=Debug_test
```

## Checkpoints
We provide all available checkpoints under the same data directory, named after ```Checkpoints```. Here we provide detailed
descriptions of checkpoint in the table below:

| Setting              | Description                                                             | Corresponding Experiment                            | Checkpoint based on experiment setting                                                                                                                                                                                                                                                                                                                                                                   |
|----------------------|-------------------------------------------------------------------------|-----------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| ```pre-trained```    | GPS model pre-trained on SceneVerse                                     | 3D-VL grounding (Tab.2)                             | [Model](https://drive.google.com/drive/folders/1FDjVaYZxHdMJgxB8stSHfI34Q7crItJc?usp=sharing)                                                                                                                                                                                                                                                                                                            |
| ```scratch```        | GPS model trained on datasets from scratch                              | 3D-VL grounding (Tab.2)<br/>SceneVerse-val (Tab. 3) | [ScanRefer](https://drive.google.com/drive/folders/1d7sGm_D7kyj6Fmo0f8b6DPrhWYUCtWVq?usp=sharing), [Sr3D](https://drive.google.com/drive/folders/1bKGgXot8Sc6BB2MWAfW_OGdu0iq0RWZt?usp=sharing), [Nr3D](https://drive.google.com/drive/folders/14K-UaIeg0GHWFoaonIFHTHZZbDotukzV?usp=sharing), [SceneVerse-val](https://drive.google.com/drive/folders/1CeWwLIPEuK0b35I_gbiwu_OiaUEE42jD?usp=drive_link) |
| ```fine-tuned```     | GPS model fine-tuned on datasets with grounding heads                   | 3D-VL grounding (Tab.2)                             | [ScanRefer](https://drive.google.com/drive/folders/1P5YprjIlBMAl0OQ38jgTDJyuFVIGiCMS?usp=sharing), [Sr3D](https://drive.google.com/drive/folders/1-LMYW6jy5wpqL_KlQQuvuSM7TDyo7M3g?usp=sharing), [Nr3D](https://drive.google.com/drive/folders/1sw-_hhF2__JgGCHE1yfyAQeNZ7jSrID0?usp=sharing)                                                                                                            |
| ```zero-shot```      | GPS model trained on SceneVerse without data from ScanNet and MultiScan | Zero-shot Transfer (Tab.3)                          | [Model](https://drive.google.com/drive/folders/11824oiZnaU8ChsNpH8zZKIT2i1PdJWSA?usp=sharing)                                                                                                                                                                                                                                                                                                            |
| ```zero-shot text``` | GPS                                                                     | Zero-shot Transfer (Tab.3)                          | [ScanNet](https://drive.google.com/drive/folders/1TKIhb7xgGzwDiAdvznwTpKzkcJnG7GD0?usp=sharing), [SceneVerse-val](https://drive.google.com/drive/folders/18f65Q6313sa-blLCyspqjZRmWpKJPh3M?usp=sharing)                                                                                                                                                                                                  |
| ```text-ablation```  | Ablations on the type of language used during pre-training              | Ablation on Text (Tab.7)                            | [Template only](https://drive.google.com/drive/folders/1Xo6FkbThHP3uLUJMblt3zgJiM0n3RbVK?usp=sharing), [Template+LLM](https://drive.google.com/drive/folders/1w9Oi8nWKZXOW3BcA0eiC1bgp7snk8ZKS?usp=sharing)                                                                                                                                                                                              |
| ```scene-ablation``` | Ablations on the use of synthetic scenes during pre-training            | Ablation on Scene (Tab.8)                           | [Real only](https://drive.google.com/drive/folders/1WZDf2BS7eG36NgGEdTuChICmVHF377is?usp=sharing), [S3D only](https://drive.google.com/drive/folders/1Zh4QfCs6l67ZeltvzOPZtokKkgkvxATc?usp=sharing), [ProcTHOR only](https://drive.google.com/drive/folders/1H9zm7vYxVn_zd2HYi49Js9R34AHnGi1d?usp=sharing)                                                                                               |
| ```model-ablation``` | Ablations on the use of losses during pre-training                      | Ablation on Model Design (Tab.9)                    | [Refer only](https://drive.google.com/drive/folders/1yKF8dVPlcbKb-COcfUZbwcqWxt_uvzuc?usp=sharing), [Refer+Obj-lvl](https://drive.google.com/drive/folders/1C5L20UvTQj2my2t0BnqHZPsb_VaXxVjX?usp=sharing), [w/o Scene-lvl](https://drive.google.com/drive/folders/14jR43ils1-jop6K84hu1AqPqU9DcHucx?usp=sharing)                                                                                         |
| ```3d-qa```          | Results for QA fine-tuning on ScanQA and SQA3D                          | 3D-QA Experiments (Tab.5)                           | [ScanQA](https://drive.google.com/drive/folders/1_Qluyeu-gvfyQSRoPNcPg7qss5IxFRwO?usp=sharing), [SQA3D](https://drive.google.com/drive/folders/1DGVqsqP12Y2Un10UAC5u9HLij0NJVzJC?usp=sharing)                                                                                                    |


To properly use the pre-trained checkpoints, you can use the ```pretrain_ckpt_path``` key in the configs:
```shell
# Directly testing the checkpoint
$ python launch.py --mode submitit --qos $QOS --partition $PARTITION --mem_per_gpu 80 \
                   --gpu_per_node 4 --config $CONFIG note=$NOTE name=$EXP_NAME mode=test \
                   pretrain_ckpt_path=$PRETRAIN_CKPT

# Fine-tuning with pre-trained checkpoint
$ python launch.py --mode submitit --qos $QOS --partition $PARTITION --mem_per_gpu 80 \
                   --gpu_per_node 4 --config $CONFIG note=$NOTE name=$EXP_NAME \
                   pretrain_ckpt_path=$PRETRAIN_CKPT
```
For fine-tuning the pre-trained checkpoint on datasets, you can use the fine-tuning config files provided under 
```configs/final/finetune```.

================================================
FILE: common/box_utils.py
================================================
import numpy as np


def box3d_iou(corners1, corners2):
    ''' Compute 3D bounding box IoU.

    Input:
        corners1: numpy array (8,3), assume up direction is Z
        corners2: numpy array (8,3), assume up direction is Z
    Output:
        iou: 3D bounding box IoU

    '''
    x_min_1, x_max_1, y_min_1, y_max_1, z_min_1, z_max_1 = get_box3d_min_max(corners1)
    x_min_2, x_max_2, y_min_2, y_max_2, z_min_2, z_max_2 = get_box3d_min_max(corners2)
    xA = np.maximum(x_min_1, x_min_2)
    yA = np.maximum(y_min_1, y_min_2)
    zA = np.maximum(z_min_1, z_min_2)
    xB = np.minimum(x_max_1, x_max_2)
    yB = np.minimum(y_max_1, y_max_2)
    zB = np.minimum(z_max_1, z_max_2)
    inter_vol = np.maximum((xB - xA), 0) * np.maximum((yB - yA), 0) * np.maximum((zB - zA), 0)
    box_vol_1 = (x_max_1 - x_min_1) * (y_max_1 - y_min_1) * (z_max_1 - z_min_1)
    box_vol_2 = (x_max_2 - x_min_2) * (y_max_2 - y_min_2) * (z_max_2 - z_min_2)
    iou = inter_vol / (box_vol_1 + box_vol_2 - inter_vol + 1e-8)

    return iou


def get_box3d_min_max(corner):
    ''' Compute min and max coordinates for 3D bounding box
        Note: only for axis-aligned bounding boxes

    Input:
        corners: numpy array (8,3), assume up direction is Z (batch of N samples)
    Output:
        box_min_max: an array for min and max coordinates of 3D bounding box IoU

    '''
    min_coord = corner.min(axis=0)
    max_coord = corner.max(axis=0)
    x_min, x_max = min_coord[0], max_coord[0]
    y_min, y_max = min_coord[1], max_coord[1]
    z_min, z_max = min_coord[2], max_coord[2]
    
    return x_min, x_max, y_min, y_max, z_min, z_max


def get_3d_box(center, box_size):
    ''' box_size is array(l,w,h), heading_angle is radius clockwise from pos x axis, center is xyz of box center
        output (8,3) array for 3D box cornders
        Similar to utils/compute_orientation_3d
    '''
    l,w,h = box_size
    # x_corners = [l/2,l/2,-l/2,-l/2,l/2,l/2,-l/2,-l/2]
    # y_corners = [h/2,h/2,h/2,h/2,-h/2,-h/2,-h/2,-h/2]
    # z_corners = [w/2,-w/2,-w/2,w/2,w/2,-w/2,-w/2,w/2]
    x_corners = [l/2,l/2,-l/2,-l/2,l/2,l/2,-l/2,-l/2]
    y_corners = [w/2,-w/2,-w/2,w/2,w/2,-w/2,-w/2,w/2]
    z_corners = [h/2,h/2,h/2,h/2,-h/2,-h/2,-h/2,-h/2]
    corners_3d = np.vstack([x_corners,y_corners,z_corners])
    corners_3d[0,:] = corners_3d[0,:] + center[0]
    corners_3d[1,:] = corners_3d[1,:] + center[1]
    corners_3d[2,:] = corners_3d[2,:] + center[2]
    corners_3d = np.transpose(corners_3d)
    return corners_3d

================================================
FILE: common/dist_utils.py
================================================
import functools
import pickle
import torch
import torch.distributed as dist

import logging
logger = logging.getLogger(__name__)

########################### Basic utility for distributed info ################################

def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_rank():
    """
    Get the rank of the current process.
    """
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def get_world_size():
    """
    Get the size of the world.
    """
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def is_master_proc(num_gpus=8):
    """
    Determines if the current process is the master process on each node.
    """
    if is_dist_avail_and_initialized():
            return dist.get_rank() % num_gpus == 0
    else:
        return True


def is_root_proc():
    """
    Determines if the current process is the root process.
    """
    if is_dist_avail_and_initialized():
        return dist.get_rank() == 0
    else:
        return True


############################## Data gathering across devices ##################################

def _serialize_to_tensor(data, group, max_size=1024):
    """
    Serialize the tensor to ByteTensor. Note that only `gloo` and `nccl`
        backend is supported.
    Args:
        data (data): data to be serialized.
        group (group): pytorch dist group.
    Returns:
        tensor (ByteTensor): tensor that serialized.
    """
    backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    device = torch.device("cpu" if backend == "gloo" else "cuda")

    buffer = pickle.dumps(data)
    if len(buffer) > max_size ** 3:
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
                get_rank(), len(buffer) / (max_size ** 3), device
            )
        )
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor


def _pad_to_largest_tensor(tensor, group):
    """
    Padding all the tensors from different GPUs to the largest ones.
    Args:
        tensor (tensor): tensor to pad.
        group (group): pytorch dist group.
    Returns:
        list[int]: size of the tensor, on each rank
        Tensor: padded tensor that has the max size
    """
    world_size = dist.get_world_size(group=group)
    assert (
        world_size >= 1
    ), "comm.gather/all_gather must be called from ranks within the given group!"
    local_size = torch.tensor(
        [tensor.numel()], dtype=torch.int64, device=tensor.device
    )
    size_list = [
        torch.zeros([1], dtype=torch.int64, device=tensor.device)
        for _ in range(world_size)
    ]
    dist.all_gather(size_list, local_size, group=group)
    size_list = [int(size.item()) for size in size_list]

    max_size = max(size_list)

    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    if local_size != max_size:
        padding = torch.zeros(
            (max_size - local_size,), dtype=torch.uint8, device=tensor.device
        )
        tensor = torch.cat((tensor, padding), dim=0)
    return size_list, tensor


def broadcast(object):
    if isinstance(object, torch.Tensor):
        dist.broadcast(tensor=object, src=0)
    else:
        sync_tensor = torch.Tensor([object]).cuda()
        dist.broadcast(tensor=sync_tensor, src=0)
        object = sync_tensor[0].item()
    return object


def all_gather(tensors):
    """
    All gathers the provided tensors from all processes across machines.
    Args:
        tensors (list): tensors to perform all gather across all processes in
        all machines.
    """
    gather_list = []
    output_tensor = []
    world_size = dist.get_world_size()
    for tensor in tensors:
        tensor_placeholder = [
            torch.ones_like(tensor) for _ in range(world_size)
        ]
        dist.all_gather(tensor_placeholder, tensor, async_op=False)
        gather_list.append(tensor_placeholder)
    for gathered_tensor in gather_list:
        output_tensor.append(torch.cat(gathered_tensor, dim=0))
    return output_tensor


def all_reduce(tensors, average=True):
    """
    All reduce the provided tensors from all processes across machines.
    Args:
        tensors (list): tensors to perform all reduce across all processes in
        all machines.
        average (bool): scales the reduced tensor by the number of overall
        processes across all machines.
    """
    for tensor in tensors:
        dist.all_reduce(tensor, async_op=False)
    if average:
        world_size = dist.get_world_size()
        for tensor in tensors:
            tensor.mul_(1.0 / world_size)
    return tensors


@functools.lru_cache()
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks
    The result is cached.
    Returns:
        (group): pytorch dist group.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD


def all_gather_unaligned(data, group=None):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors).

    Args:
        data: any picklable object
        group: a torch process group. By default, will use a group which
            contains all ranks on gloo backend.

    Returns:
        list[data]: list of data gathered from each rank
    """
    if get_world_size() == 1:
        return [data]
    if group is None:
        group = _get_global_gloo_group()
    if dist.get_world_size(group) == 1:
        return [data]

    tensor = _serialize_to_tensor(data, group)

    size_list, tensor = _pad_to_largest_tensor(tensor, group)
    max_size = max(size_list)

    # receiving Tensor from all ranks
    tensor_list = [
        torch.empty((max_size,), dtype=torch.uint8, device=tensor.device)
        for _ in size_list
    ]
    dist.all_gather(tensor_list, tensor, group=group)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list

================================================
FILE: common/io_utils.py
================================================
import csv
import pickle
import json
import cv2
import yaml
import numpy as np
from pathlib import Path
import torch
import open3d
from plyfile import PlyData

def make_dir(dir_path):
    if not Path(dir_path).exists():
        Path(dir_path).mkdir(parents=True, exist_ok=True)


def load_imgs(img_paths, option=cv2.IMREAD_COLOR):
    imgs = [cv2.imread(img_path, option) for img_path in img_paths]
    return imgs


def load_pickle(filename):
    with Path(filename).open("rb") as f:
        return pickle.load(f)


def save_pickle(data, filename):
    with Path(filename).open("wb") as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_json(filename):
    with Path(filename).open("rb") as f:
        return json.load(f)


def save_json(data, filename, save_pretty=True, sort_keys=False):
    with Path(filename).open("w") as f:
        if save_pretty:
            f.write(json.dumps(data, indent=4, sort_keys=sort_keys))
        else:
            json.dump(data, f)


def load_jsonl(filename):
    with Path(filename).open("r") as f:
        return [json.loads(l.strip("\n")) for l in f.readlines()]


def save_jsonl(data, filename):
    with Path(filename).open("w") as f:
        f.write("\n".join([json.dumps(e) for e in data]))


def load_yaml(filename):
    with Path(filename).open("r") as f:
        return yaml.load(f, Loader=yaml.SafeLoader)


def save_yaml(data, filename):
    with Path(filename).open("w") as f:
        yaml.dump(data, f, default_flow_style=False)


def load_csv(filename, delimiter=","):
    idx2key = None
    contents = {}
    with Path(filename).open("r") as f:
        reader = csv.reader(f, delimiter=delimiter)
        for l_idx, row in reader:
            if l_idx == 0:
                idx2key = row
                for k_idx, key in enumerate(idx2key):
                    contents[key] = []
            else:
                for c_idx, col in enumerate(row):
                    contents[idx2key[c_idx]].append(col)
    return contents, idx2key


def save_csv(data, filename, cols=None, delimiter=","):
    with Path(filename).open("w") as f:
        writer = csv.writer(f, delimiter=delimiter)
        num_entries = len(data[list(data.keys())[0]])
        assert cols is not None, "Must have column names for dumping csv files."
        writer.writerow(cols)
        for l_idx in range(num_entries):
            row = [data[key][l_idx] for key in cols]
            writer.writerow(row)


def load_numpy(filename):
    return np.load(filename, allow_pickle=True)


def save_numpy(data, filename):
    np.save(filename, data, allow_pickle=True)


def load_tensor(filename):
    return torch.load(filename)


def save_tensor(data, filename):
    torch.save(data, filename)


def load_ply(filepath):
    with open(filepath, "rb") as f:
        plydata = PlyData.read(f)
    data = plydata.elements[0].data
    coords = np.array([data["x"], data["y"], data["z"]], dtype=np.float32).T
    feats = None
    labels = None
    if ({"red", "green", "blue"} - set(data.dtype.names)) == set():
        feats = np.array([data["red"], data["green"], data["blue"]], dtype=np.uint8).T
    if "label" in data.dtype.names:
        labels = np.array(data["label"], dtype=np.uint32)
    return coords, feats, labels

    
def load_ply_with_normals(filepath):
    mesh = open3d.io.read_triangle_mesh(str(filepath))
    if not mesh.has_vertex_normals():
        mesh.compute_vertex_normals()
    vertices = np.asarray(mesh.vertices)
    normals = np.asarray(mesh.vertex_normals)

    coords, feats, labels = load_ply(filepath)
    assert np.allclose(coords, vertices), "different coordinates"
    feats = np.hstack((feats, normals))

    return coords, feats, labels


================================================
FILE: common/launch_utils.py
================================================
import os
from pathlib import Path
import subprocess

import submitit


huggingface_fix = f"TRANSFORMERS_OFFLINE=1 CURL_CA_BUNDLE=''"


class SubmititLauncher:
    def __init__(self, args):
        self.args = args

    def __call__(self):
        host_name = os.popen(
            "scontrol show hostnames $SLURM_JOB_NODELIST"
        ).read().split("\n")[0]
        self._set_gpu_args()
        # Using Accelerate for launching
        multi_gpu = "--multi_gpu" if self.args.num_nodes * self.args.gpu_per_node > 1 else ""
        opts = " ".join(self.args.opts) if len(self.args.opts) > 0 else ""
        opts += f" num_gpu={self.args.num_nodes * self.args.gpu_per_node} "
        full_cfg_path = Path(self.args.config)
        cfg_path, cfg_file = str(full_cfg_path.parent), str(full_cfg_path.name)
        cmd = f"{huggingface_fix} accelerate launch --num_machines {self.args.num_nodes} \
                        --mixed_precision {self.args.mixed_precision} {multi_gpu} \
                        --num_processes {self.args.gpu_per_node * self.args.num_nodes} \
                        --num_cpu_threads_per_process {self.args.cpu_per_task} \
                        --main_process_ip {host_name} \
                        --main_process_port {self.args.port} \
                        --machine_rank {self.args.node_id} \
                        --dynamo_backend no \
                        {self.args.run_file} \
                        --config-path {cfg_path} \
                        --config-name {cfg_file} \
                        num_gpu={self.args.num_nodes * self.args.gpu_per_node} \
                        hydra.run.dir=. \
                        hydra.output_subdir=null \
                        hydra/job_logging=disabled \
                        hydra/hydra_logging=disabled {opts}"
        subprocess.run(cmd, shell=True)
    
    def _set_gpu_args(self):
        job_env = submitit.JobEnvironment()
        self.args.job_dir = str(self.args.job_dir).replace("%j", job_env.job_id)
        self.args.node_id = int(job_env.global_rank / self.args.gpu_per_node)


def submitit_launch(args):
    """
    Multi node script launching with Submitit
    """
    additional_parameters = {}
    if args.nodelist != "":
        # if specifying node id
        nodelist = f"{str(args.nodelist)}"
        additional_parameters["nodelist"] = nodelist

    executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)
    executor.update_parameters(
        name=args.name,
        mem_gb=args.mem_per_gpu * args.gpu_per_node * args.num_nodes,
        gpus_per_node=args.gpu_per_node,
        tasks_per_node=1,
        cpus_per_task=args.gpu_per_node * args.cpu_per_task,
        nodes=args.num_nodes,
        slurm_qos=args.qos,
        slurm_partition=args.partition,
        slurm_account=args.account,
        slurm_time=args.time * 60,
        slurm_signal_delay_s=120,
        slurm_additional_parameters=additional_parameters
    )
    launcher = SubmititLauncher(args)
    job = executor.submit(launcher)
    print(f"submitted job: {job.job_id}")


def accelerate_launch(args):
    """
    Single node script launching with Accelerate
    """
    opts = " ".join(args.opts) if len(args.opts) > 0 else ""
    opts += f" num_gpu={args.num_nodes * args.gpu_per_node} "
    multi_gpu = "--multi_gpu" if args.num_nodes * args.gpu_per_node > 1 else ""
    full_cfg_path = Path(args.config)
    cfg_path, cfg_file = str(full_cfg_path.parent), str(full_cfg_path.name)
    cmd = f"{huggingface_fix} accelerate launch --num_machines {args.num_nodes} \
        {multi_gpu} \
        --mixed_precision {args.mixed_precision} \
        --num_processes {args.gpu_per_node * args.num_nodes} \
        --num_cpu_threads_per_process {args.cpu_per_task} \
        --dynamo_backend no \
        {args.run_file} \
        --config-path {cfg_path} \
        --config-name {cfg_file} \
        num_gpu={args.num_nodes * args.gpu_per_node} \
        hydra.run.dir=. \
        hydra.output_subdir=null \
        hydra/job_logging=disabled \
        hydra/hydra_logging=disabled {opts}"
    subprocess.run(cmd, shell=True)


def python_launch(args):
    """
    Vanilla python launcher for degbugging purposes
    """
    opts = " ".join(args.opts) if len(args.opts) > 0 else ""
    full_cfg_path = Path(args.config)
    cfg_path, cfg_file = str(full_cfg_path.parent), str(full_cfg_path.name)
    cmd = f"{huggingface_fix} python {args.run_file} " \
          f"--config-path {cfg_path} " \
          f"--config-name {cfg_file} " \
          f"num_gpu=1 " \
          f"hydra.run.dir=. " \
          f"hydra.output_subdir=null " \
          f"hydra/job_logging=disabled " \
          f"hydra/hydra_logging=disabled {opts}"
    subprocess.run(cmd, shell=True)

================================================
FILE: common/misc.py
================================================
import os
import glob
import importlib
import functools
import torch
from typing import Any
from accelerate.logging import get_logger
from accelerate.state import PartialState
from accelerate.utils import recursively_apply
from accelerate.utils.constants import TORCH_DISTRIBUTED_OPERATION_TYPES
from accelerate.utils.dataclasses import DistributedType

logger = get_logger(__name__)


def rsetattr(obj, attr, val):
    pre, _, post = attr.rpartition('.')
    return setattr(rgetattr(obj, pre) if pre else obj, post, val)

# using wonder's beautiful simplification: https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427

def rgetattr(obj, attr, *args):
    def _getattr(obj, attr):
        return getattr(obj, attr, *args)
    return functools.reduce(_getattr, [obj] + attr.split('.'))


# def import_all(exclude_list=None):
#     if exclude_list is None:
#         exclude_list = ["__init__.py", "build.py"]
#     print(f"file: {__file__}")
#     current_directory = os.path.dirname(__file__)
#     module_names = [
#         os.path.splitext(file)[0] for file in os.listdir(current_directory)
#         if file.endswith(".py") and file not in exclude_list
#     ]
#     for module_name in module_names:
#         module = importlib.import_module(f".{module_name}", package=__name__)
#         globals().update({name: getattr(module, name) for name in getattr(module, '__all__', [])})
#     __all__ = [name for name in globals() if not name.startswith("_")]


def _gpu_gather_object(object: Any):
    # by JY Huang: re-implement the method for gathering non-tensor objects
    output_objects = [None for _ in range(PartialState().num_processes)]
    torch.distributed.all_gather_object(output_objects, object)
    if isinstance(object, (list, tuple)):
        output_list = []
        for item in output_objects:
            output_list.extend(item)
        return output_list
    elif isinstance(object, dict):
        template = output_objects[0]
        output_dict = {}
        for k, v in template.items():
            output_dict[k] = []
            for item in output_objects:
                if isinstance(item[k], list):
                    output_dict[k].extend(item[k])
                else:
                    output_dict[k].append(item[k])
        return output_dict


def gather_object(object: Any):
    """
    Recursively gather object in a nested list/tuple/dictionary of objects from all devices.

    Args:
        object (nested list/tuple/dictionary of picklable object):
            The data to gather.

    Returns:
        The same data structure as `object` with all the objects sent to every device.
    """
    if PartialState().distributed_type == DistributedType.TPU:
        raise NotImplementedError("gather objects in TPU is not supported")
    elif PartialState().distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
        return _gpu_gather_object(object)
    else:
        return object


def gather_for_metrics(accelerator, input_data):
    """
    by JY Huang: re-implement this method for gathering non-tensor objects
    Refer source code to https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.gather_for_metrics
    """

    try:
        recursively_apply(lambda x: x, input_data, error_on_other_type=True)
        all_tensors = True
    except TypeError:
        all_tensors = False

    if not all_tensors:
        data = gather_object(input_data)
    else:
        data = accelerator.gather(input_data)

    try:
        if accelerator.gradient_state.end_of_dataloader:
            # at the end of a dataloader, `gather_for_metrics` regresses to
            # `gather` unless the dataset has a remainder so log.
            if accelerator.gradient_state.remainder == -1:
                logger.info(
                    "The used dataset had no length, returning gathered tensors. You should drop the remainder yourself."
                )
                return data
            elif accelerator.gradient_state.remainder > 0:
                # Last batch needs to be truncated on distributed systems as it contains additional samples
                def _adjust_samples(tensor):
                    return tensor[: accelerator.gradient_state.remainder] if tensor is not None else None
                if all_tensors:
                    # This only applies to tensors, as defined in `recursively_apply`
                    return recursively_apply(_adjust_samples, data)
                else:
                    if isinstance(data, (list, tuple)):
                        return _adjust_samples(data)
                    elif isinstance(data, dict):
                        return {k: _adjust_samples(v) for k, v in data.items()}
                    else:
                        raise NotImplementedError(f"Non-tensor gather only supports list, tuple or dict")
            else:  # remainder is 0
                # no remainder even though at end of dataloader, so nothing to do.
                return data
        else:
            # Not at the end of the dataloader, no need to adjust the tensors
            return data
    except Exception:
        # Dataset had no length or raised an error
        return data
    
def gather_dict(accelerator, data_dict):
    data_dict_non_tensor = {k : v for k, v in data_dict.items() if not isinstance(v, torch.Tensor)}
    data_dict_non_tensor = gather_for_metrics(accelerator, data_dict_non_tensor)
    data_dict = {k : v for k, v in data_dict.items() if isinstance(v, torch.Tensor)}
    data_dict = gather_for_metrics(accelerator, data_dict)
    data_dict.update(data_dict_non_tensor)
    return data_dict


================================================
FILE: common/type_utils.py
================================================
import torch

from omegaconf import OmegaConf


def cfg2dict(cfg):
    return OmegaConf.to_container(cfg, resolve=True)


def _to_device(state, device):
    """ usually load from cpu checkpoint but need to load to cuda """
    if isinstance(state, torch.Tensor):
        new_state = state.to(device, non_blocking=True)  # assume propoerly set py torch.cuda.set_device
    elif isinstance(state, list):
        new_state = torch.tensor([_to_device(t, device) for t in state]).to(device)
    elif isinstance(state, tuple):
        new_state = torch.tensor(tuple(_to_device(t, device) for t in state)).to(device)
    elif isinstance(state, dict):
        new_state = {n: _to_device(t, device) for n, t in state.items()}
    else:
        try:
            if not isinstance(state, str):
                new_state = torch.tensor(state).to(device)
            else:
                new_state = state
        except:
            raise ValueError(f"The provided tensor can not be transfered to {device}")
    return new_state

================================================
FILE: configs/final/all_anno.yaml
================================================
###
# Pretrain with human annotation only
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all_anno"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d' ]
      referit3d:
        anno_type: ['nr3d']
        sr3d_plus_aug: False
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt', 'template']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno']
    val:
      sources: [ 'anno']
    test:
      sources: [ 'anno']
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno']
    val:
      sources: [ 'anno' ]
    test:
      sources: [ 'anno' ]
  HMSpatialRefer:
    train:
      sources: [ 'anno' ]
    val:
      sources: [ 'anno' ]
    test:
      sources: [ 'anno' ]
  use_voxel: False
  scan_family_base: "/scratch2/generalvision/chenyixin/datasets/SceneVerse/ScanNet"
  rscan_base: "/scratch2/generalvision/chenyixin/datasets/SceneVerse/3RScan"
  arkitscene_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/ARKitScenes'
  multiscan_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/MultiScan'
  hm_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/HM3D'
  procthor_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/ProcThor'
  s3d_base: /scratch2/generalvision/chenyixin/datasets/SceneVerse/Structured3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_nomlm.yaml
================================================
###
# Pretrain on all data without MLM loss
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all_nomlm"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno','rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno','rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno','rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno','rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
#      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
#      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
#      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
#      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_noobj.yaml
================================================
###
# Pretrain on all data without object-level alignment
###


# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: ['rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: False
        path: ''
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
#      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
#      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
#      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
#      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_noscene.yaml
================================================
###
# Pretrain on all data without scene-level alignment
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all_noscene"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno','rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno','rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
#      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
#      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_pretrain.yaml
================================================
###
# Pretrain on all data with all losses
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt', 'template']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch2/generalvision/chenyixin/datasets/SceneVerse/ScanNet"
  rscan_base: "/scratch2/generalvision/chenyixin/datasets/SceneVerse/3RScan"
  arkitscene_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/ARKitScenes'
  multiscan_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/MultiScan'
  hm_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/HM3D'
  procthor_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/ProcThor'
  s3d_base: /scratch2/generalvision/chenyixin/datasets/SceneVerse/Structured3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_pretrain_125.yaml
================================================
###
# Pretrain on 12.5% of all data
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all0.125"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    subset_ratio: 0.125
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_pretrain_25.yaml
================================================
###
# Pretrain on 25% of all data
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all0.25"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    subset_ratio: 0.25
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_pretrain_50.yaml
================================================
###
# Pretrain on 50% of all data
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all0.50"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    subset_ratio: 0.5
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_pretrain_75.yaml
================================================
###
# Pretrain on 75% all data
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all.75"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    subset_ratio: 0.75
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_pretrain_objcap.yaml
================================================
###
# Pretrain on all data adding all object captions
###

# Experiment general info
name: "Debug"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','RScanSpatialRefer','HMSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt', 'template']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt','obj_caption_gpt','obj_caption_template']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt','obj_caption_template']
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt','obj_caption_template']
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt','obj_caption_gpt','obj_caption_template']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt','obj_caption_template']
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt','obj_caption_template']
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt','obj_caption_gpt','obj_caption_template']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_gpt','obj_caption_template']
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_gpt','obj_caption_template']
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt', 'obj_caption_template']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_template']
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_template']
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_pretrain_objcap_notemplate.yaml
================================================
###
# Pretrain on all data without template-based object captions
###

# Experiment general info
name: "OV_w_Cap"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','RScanSpatialRefer','HMSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt','obj_caption_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt']
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt']
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt','obj_caption_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt']
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt','obj_caption_gpt']
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt','obj_caption_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_gpt']
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_gpt']
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt', 'obj_caption_template']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_template']
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template','obj_caption_template']
  use_voxel: False
  scan_family_base: "/scratch2/generalvision/chenyixin/datasets/SceneVerse/ScanNet"
  rscan_base: "/scratch2/generalvision/chenyixin/datasets/SceneVerse/3RScan"
  arkitscene_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/ARKitScenes'
  multiscan_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/MultiScan'
  hm_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/HM3D'
  procthor_base: '/scratch2/generalvision/chenyixin/datasets/SceneVerse/ProcThor'
  s3d_base: /scratch2/generalvision/chenyixin/datasets/SceneVerse/Structured3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 200
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_pretrain_s3d.yaml
================================================
###
# Pretrain on all data with Structured 3D
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer','S3DSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  S3DSpatialRefer:
    train:
      sources: [ 'rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt' ]
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'
  s3d_base: '/scratch/masaccio/existing_datasets/Structured3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 250
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 1000
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/results/ALLObjPretrain_b512_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj+S3DPretrainObj_1113scannetws3d/2023-11-14-09:29:10.796592/ckpt/best.pth'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_pretrain_unfreeze.yaml
================================================
###
# Pretrain on all data with object encoder unfrozen
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 250
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 100
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: False
        path: '/scratch/masaccio/results/ALLObjPretrain_b64_Pretrain_ScanNetPretrainObj+RScanPretrainObj+ARKitScenePretrainObj+MultiScanPretrainObj+HMPretrainObj_1113real_all/2023-11-13-12:17:35.068482/ckpt/best.pth'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_rewrite.yaml
================================================
###
# Pretrain on all LLM-refined data only
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "all_rewrite"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: ['scanrefer','sgrefer','sgcaption']
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: ['rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_template.yaml
================================================
###
# Pretrain on all template-based generated data only
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "template_only"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: ['scanrefer', 'sgrefer']
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: ['rel2_template', 'relm_template', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','relm_template','star_template']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['rel2_template','relm_template','star_template']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['rel2_template','relm_template','star_template']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['rel2_template','relm_template','star_template']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_wo_both.yaml
================================================
###
# Pretrain on all data without ScanNet and MultiScan data
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "wo_scannet_multiscan"
  train: ['ARKitSceneSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_wo_both_125.yaml
================================================
###
# Pretrain on 12.5% of all data without ScanNet and MultiScan
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "wo_scannet_multiscan.125"
  train: ['ARKitSceneSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    subset_ratio: 0.125
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_wo_both_25.yaml
================================================
###
# Pretrain on 25% of all data without ScanNet and MultiScan
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "wo_scannet_multiscan.25"
  train: ['ARKitSceneSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    subset_ratio: 0.25
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_wo_both_50.yaml
================================================
###
# Pretrain on 50% of all data without ScanNet and MultiScan
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "wo_scannet_multiscan.50"
  train: ['ARKitSceneSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    subset_ratio: 0.50
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_wo_multiscan.yaml
================================================
###
# Pretrain on all data without MultiScan
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "wo_multiscan"
  train: ['ScanNetSpatialRefer','ARKitSceneSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['MultiScanSpatialRefer']
  test: ['MultiScanSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'gt'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: ['scanrefer','referit3d','sgrefer','sgcaption']
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [ 'rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno' ]
    test:
      sources: [ 'anno' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ReferIt3DEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]

================================================
FILE: configs/final/all_wo_scannet.yaml
================================================
###
# Pretrain on all data without ScanNet
###

# Experiment general info
name: "FinalOVPretrain"
rng_seed: 42
num_gpu: 8
mode: "train"
note: ""
# Choose keywords to feature your saving directory
naming_keywords: ["dataloader.batchsize", "task", "note", "time"]
base_dir: "/scratch/masaccio/results"
exp_dir: ""
save_frequency: 10

resume: False

debug:
  flag: False
  debug_size: 20
  hard_debug: False

logger:
  name: "wandb"
  entity: "bigai-gvl"

# dataset details
data:
  note: "wo_scannet"
  train: ['ARKitSceneSpatialRefer','MultiScanSpatialRefer','HMSpatialRefer','RScanSpatialRefer']
  val: ['ScanNetSpatialRefer']
  test: ['ScanNetSpatialRefer']
  args:
    max_obj_len: 80
    max_seq_len: 50
    num_points: 1024
    pc_type: 'pred'
    sem_type: '607'
    filter_lang: False
    txt_mask_ratio: 0.15
    pc_mask_ratio: 0.1
    rot_aug: True
    mask_strategy: random
    use_scene_cap: True
    max_scene_cap_len: 300
  ScanNetSpatialRefer:
    train:
      sources: [ 'scanrefer', 'referit3d', 'sgrefer', 'sgcaption' ]
      referit3d:
        anno_type: ['sr3d', 'nr3d']
        sr3d_plus_aug: True
      sgrefer:
        anno_type: [''rel2_gpt', 'rel2_template', 'relm_gpt', 'relm_template', 'star_gpt', 'star_template'] #
      sgcaption:
        anno_type: ['gpt']
    val:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
    test:
      sources: ['scanrefer']
      referit3d:
        anno_type: ['sr3d'] # 'nr3d', 'sr3d'
        sr3d_plus_aug: False
      sgrefer:
        anno_type: ['template'] # 'template', 'gpt', 'gpt_chain'
      sgcaption:
        anno_type: ['gpt']
  RScanSpatialRefer:
    train:
      sources: ['rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  MultiScanSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template', 'star_gpt' ]
  ARKitSceneSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  HMSpatialRefer:
    train:
      sources: ['anno','rel2_template','rel2_gpt','relm_template','relm_gpt','star_template','star_gpt']
    val:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
    test:
      sources: [ 'anno', 'rel2_template', 'relm_gpt', 'relm_template', 'star_template' ]
  use_voxel: False
  scan_family_base: "/scratch/masaccio/existing_datasets/scannet"
  rscan_base: "/scratch/masaccio/existing_datasets/3RScan-base"
  arkitscene_base: '/scratch/masaccio/existing_datasets/ARKitScenes'
  multiscan_base: '/scratch/masaccio/existing_datasets/multiscan'
  hm_base: '/scratch/masaccio/existing_datasets/HM3D'

data_aug:
  aug_list: ['scene_aug']
  scene_aug:
    translation:
      enabled: False
      value: [1.0, 1.0, 1.0]
      p: 1.0
    scaling:
      enabled: False
      p: 1.0
      value: [0.9, 1.1]
    flip:
      enabled: False
      p: 0.5
    rotation:
      enabled: True
      p: 1.0
      axis_align: True
      value: [0.0, 0.0, 1.0]
      shuffle: True
    color_jitter: False
    order_shuffle: False
  obj_aug:
    translation:
      enabled: False
      value: [0.1, 0.1, 0.1]
      p: 1.0
    rotation:
      enabled: False
      p: 1.0
      axis_align: False
      value: [0.0, 0.0, 0.1]
      shuffle: True
    random_jitter:
      enabled: False
      value: 0.01
      accord_to_size: False
      p: 1.0
    pts_shuffle: True

# task details: 'Pretrain', 'scanqa', 'spatialrefer'
task: 'Pretrain'
# 'MaskDatasetWrapper', 'ScanFamilyDatasetWrapper', 'MaskMVDatasetWrapper'
data_wrapper:
  train: 'MaskDatasetWrapper'
  val: 'ScanFamilyDatasetWrapperOld'
  test: 'ScanFamilyDatasetWrapperOld'

# Training details
trainer: "OpenVocabTrainer"
ckpt_path: ""
pretrain_ckpt_path: ""

# dataloader details
dataloader:
  batchsize: 64
  num_workers: 4
  balance_dataset: False
  filter_empty_annotations: False

solver:
  gradient_accumulation_steps: 1
  epochs_per_save: 20
  epochs_per_eval: 1
  lr: 5e-4
  grad_norm: 5.0
  epochs: 150
  optim:
    name: 'AdamW'
    args:
      betas: [0.9, 0.98]
  sched:
    name: 'warmup_cosine'
    args:
      warmup_steps: 500
      minimum_ratio: 0.1

eval:
  train:
    name: 'PretrainEval'
  val:
    name: 'ScanReferEval'
  save: False


# Model details
model:
  name: OpenVocab
  language:
    # This part could be further optimized to be using
    # huggingface yaml config files
    name: 'BERTLanguageEncoder'
    args:
      weights: 'bert-base-uncased'
      hidden_size: 768
      num_hidden_layers: 4
      num_attention_heads: 12
      type_vocab_size: 2
    lr: 1e-5
  vision:
    name: 'PointOpenVocabEncoder'
    args:
        backbone: 'pointnet++'
        hidden_size: 768
        freeze: True
        path: '/scratch/masaccio/code/pretrained_weights/objpretrain/pointnetpp-open-bert'
        num_attention_heads: 12
        spatial_dim: 5
        num_layers: 4
        dim_loc: 6
        dim_feedforward: 2048
        attn_type: spatial
        pairwise_rel_type: 'center'
        use_matmul_label: False
        lang_type: 'bert'
        lang_path: '/scratch/masaccio/607_text_embeddings'
    lr: 1e-4
  grounding:
    name: 'UnifiedSpatialCrossEncoderV2'
    args:
      hidden_size: 768
      num_attention_heads: 12
      num_layers: 4
      dim_feedforward: 2048
      dim_loc: 6
    lr: 1e-4
  inter: before
  heads:
    head_list: ['pretrain_head']
    pretrain_head:
      name: 'OVPretrainHead'
      args:
        hidden_size: 768
        vocab_size: 30522
  loss_type: 'ListLoss'
  loss_list: [
      'lm_cls_loss',
      'TextObjWithinBatch',
#      'TextObjBetweenBatch',
      'TextSceneBetweenBatch'
  ]
  vis_loss_list: [
      'lm_cls_loss

Download .txt

gitextract_y3zcu4ix/

├── .gitignore
├── DATA.md
├── LICENSE
├── README.md
├── TRAIN.md
├── common/
│   ├── box_utils.py
│   ├── dist_utils.py
│   ├── io_utils.py
│   ├── launch_utils.py
│   ├── misc.py
│   └── type_utils.py
├── configs/
│   └── final/
│       ├── all_anno.yaml
│       ├── all_nomlm.yaml
│       ├── all_noobj.yaml
│       ├── all_noscene.yaml
│       ├── all_pretrain.yaml
│       ├── all_pretrain_125.yaml
│       ├── all_pretrain_25.yaml
│       ├── all_pretrain_50.yaml
│       ├── all_pretrain_75.yaml
│       ├── all_pretrain_objcap.yaml
│       ├── all_pretrain_objcap_notemplate.yaml
│       ├── all_pretrain_s3d.yaml
│       ├── all_pretrain_unfreeze.yaml
│       ├── all_rewrite.yaml
│       ├── all_template.yaml
│       ├── all_wo_both.yaml
│       ├── all_wo_both_125.yaml
│       ├── all_wo_both_25.yaml
│       ├── all_wo_both_50.yaml
│       ├── all_wo_multiscan.yaml
│       ├── all_wo_scannet.yaml
│       ├── debug.yaml
│       ├── finetune/
│       │   ├── multiscan_finetune.yaml
│       │   ├── multiscan_woL.yaml
│       │   ├── nr3d_finetune.yaml
│       │   ├── scannet_woL.yaml
│       │   ├── scanqa_finetune.yaml
│       │   ├── scanrefer_finetune.yaml
│       │   ├── sqa3d_finetune.yaml
│       │   └── sr3d_finetune.yaml
│       ├── multiscan_only.yaml
│       ├── nr3d_only.yaml
│       ├── procthor_only.yaml
│       ├── s3d_only.yaml
│       ├── scanrefer_only.yaml
│       ├── scanrefer_only_gttest.yaml
│       └── sr3d_only.yaml
├── data/
│   ├── __init__.py
│   ├── build.py
│   ├── data_utils.py
│   └── datasets/
│       ├── __init__.py
│       ├── arkitscene.py
│       ├── base.py
│       ├── constant.py
│       ├── data_augmentor.py
│       ├── dataset_wrapper.py
│       ├── hm.py
│       ├── multiscan.py
│       ├── procthor.py
│       ├── rscan.py
│       ├── scannet.py
│       ├── scannet_base.py
│       ├── scannet_old.py
│       └── structure3d.py
├── evaluator/
│   ├── __init__.py
│   ├── build.py
│   ├── objcls_eval.py
│   ├── pretrain_eval.py
│   ├── referit3d_eval.py
│   ├── scanqa_eval.py
│   ├── scanrefer_eval.py
│   └── sqa3d_eval.py
├── launch.py
├── model/
│   ├── __init__.py
│   ├── build.py
│   ├── objcls.py
│   └── openvocab.py
├── modules/
│   ├── __init__.py
│   ├── build.py
│   ├── grounding/
│   │   ├── __init__.py
│   │   └── unified_encoder.py
│   ├── heads/
│   │   ├── __init__.py
│   │   ├── grounding_head.py
│   │   ├── pretrain_head.py
│   │   └── qa_head.py
│   ├── language/
│   │   ├── __init__.py
│   │   ├── bert.py
│   │   └── clip.py
│   ├── layers/
│   │   ├── pointnet.py
│   │   └── transformers.py
│   ├── third_party/
│   │   ├── __init__.py
│   │   └── pointnet2/
│   │       ├── _ext_src/
│   │       │   ├── include/
│   │       │   │   ├── ball_query.h
│   │       │   │   ├── cuda_utils.h
│   │       │   │   ├── group_points.h
│   │       │   │   ├── interpolate.h
│   │       │   │   ├── sampling.h
│   │       │   │   └── utils.h
│   │       │   └── src/
│   │       │       ├── ball_query.cpp
│   │       │       ├── ball_query_gpu.cu
│   │       │       ├── bindings.cpp
│   │       │       ├── group_points.cpp
│   │       │       ├── group_points_gpu.cu
│   │       │       ├── interpolate.cpp
│   │       │       ├── interpolate_gpu.cu
│   │       │       ├── sampling.cpp
│   │       │       └── sampling_gpu.cu
│   │       ├── _version.py
│   │       ├── pointnet2_modules.py
│   │       ├── pointnet2_test.py
│   │       ├── pointnet2_utils.py
│   │       ├── pytorch_utils.py
│   │       ├── requirements_new.txt
│   │       └── setup.py
│   ├── utils.py
│   ├── vision/
│   │   ├── __init__.py
│   │   ├── obj_cls_encoder.py
│   │   └── pcd_openvocab_encoder.py
│   └── weights.py
├── optim/
│   ├── __init__.py
│   ├── build.py
│   ├── loss/
│   │   ├── __init__.py
│   │   ├── contra_loss.py
│   │   └── loss.py
│   ├── optimizer/
│   │   ├── __init__.py
│   │   └── optim.py
│   ├── scheduler.py
│   └── utils.py
├── preprocess/
│   ├── README.md
│   ├── __init__.py
│   ├── arkitscenes.py
│   ├── build.py
│   ├── multiscan.py
│   ├── rscan.py
│   ├── sceneverse2hmsemantic.py
│   ├── ssg/
│   │   ├── README.md
│   │   ├── relationships/
│   │   │   ├── camera.py
│   │   │   ├── hanging.py
│   │   │   ├── init.py
│   │   │   ├── multi_objs.py
│   │   │   ├── proximity.py
│   │   │   └── support.py
│   │   ├── ssg_data/
│   │   │   ├── dictionary.py
│   │   │   ├── script/
│   │   │   │   └── ObjNode.py
│   │   │   └── ssg_visualize.py
│   │   ├── ssg_main.py
│   │   └── ssg_utils.py
│   ├── structured3d.py
│   └── utils/
│       ├── __init__.py
│       ├── align_utils.py
│       ├── constant.py
│       └── label_convert.py
├── requirements.txt
├── run.py
├── trainer/
│   ├── __init__.py
│   ├── build.py
│   ├── debug_trainer.py
│   ├── default_trainer.py
│   ├── objpretrain_trainer.py
│   └── openvocab_trainer.py
└── visualize_data.py

Download .txt

SYMBOL INDEX (606 symbols across 84 files)

FILE: common/box_utils.py
  function box3d_iou (line 4) | def box3d_iou(corners1, corners2):
  function get_box3d_min_max (line 30) | def get_box3d_min_max(corner):
  function get_3d_box (line 49) | def get_3d_box(center, box_size):

FILE: common/dist_utils.py
  function is_dist_avail_and_initialized (line 11) | def is_dist_avail_and_initialized():
  function get_rank (line 19) | def get_rank():
  function get_world_size (line 28) | def get_world_size():
  function is_master_proc (line 37) | def is_master_proc(num_gpus=8):
  function is_root_proc (line 47) | def is_root_proc():
  function _serialize_to_tensor (line 59) | def _serialize_to_tensor(data, group, max_size=1024):
  function _pad_to_largest_tensor (line 85) | def _pad_to_largest_tensor(tensor, group):
  function broadcast (line 121) | def broadcast(object):
  function all_gather (line 131) | def all_gather(tensors):
  function all_reduce (line 152) | def all_reduce(tensors, average=True):
  function _get_global_gloo_group (line 171) | def _get_global_gloo_group():
  function all_gather_unaligned (line 184) | def all_gather_unaligned(data, group=None):

FILE: common/io_utils.py
  function make_dir (line 12) | def make_dir(dir_path):
  function load_imgs (line 17) | def load_imgs(img_paths, option=cv2.IMREAD_COLOR):
  function load_pickle (line 22) | def load_pickle(filename):
  function save_pickle (line 27) | def save_pickle(data, filename):
  function load_json (line 32) | def load_json(filename):
  function save_json (line 37) | def save_json(data, filename, save_pretty=True, sort_keys=False):
  function load_jsonl (line 45) | def load_jsonl(filename):
  function save_jsonl (line 50) | def save_jsonl(data, filename):
  function load_yaml (line 55) | def load_yaml(filename):
  function save_yaml (line 60) | def save_yaml(data, filename):
  function load_csv (line 65) | def load_csv(filename, delimiter=","):
  function save_csv (line 81) | def save_csv(data, filename, cols=None, delimiter=","):
  function load_numpy (line 92) | def load_numpy(filename):
  function save_numpy (line 96) | def save_numpy(data, filename):
  function load_tensor (line 100) | def load_tensor(filename):
  function save_tensor (line 104) | def save_tensor(data, filename):
  function load_ply (line 108) | def load_ply(filepath):
  function load_ply_with_normals (line 122) | def load_ply_with_normals(filepath):

FILE: common/launch_utils.py
  class SubmititLauncher (line 11) | class SubmititLauncher:
    method __init__ (line 12) | def __init__(self, args):
    method __call__ (line 15) | def __call__(self):
    method _set_gpu_args (line 44) | def _set_gpu_args(self):
  function submitit_launch (line 50) | def submitit_launch(args):
  function accelerate_launch (line 80) | def accelerate_launch(args):
  function python_launch (line 106) | def python_launch(args):

FILE: common/misc.py
  function rsetattr (line 16) | def rsetattr(obj, attr, val):
  function rgetattr (line 22) | def rgetattr(obj, attr, *args):
  function _gpu_gather_object (line 43) | def _gpu_gather_object(object: Any):
  function gather_object (line 65) | def gather_object(object: Any):
  function gather_for_metrics (line 84) | def gather_for_metrics(accelerator, input_data):
  function gather_dict (line 134) | def gather_dict(accelerator, data_dict):

FILE: common/type_utils.py
  function cfg2dict (line 6) | def cfg2dict(cfg):
  function _to_device (line 10) | def _to_device(state, device):

FILE: data/build.py
  function get_dataset (line 14) | def get_dataset(cfg, split):
  function build_dataloader (line 42) | def build_dataloader(cfg, split='train'):

FILE: data/data_utils.py
  function per_scene_pad (line 12) | def per_scene_pad(lang_list, max_len=64, tokenizer=None, max_seq_len=50):
  function merge_tokens (line 50) | def merge_tokens(token1, mask1, token2, mask2, max_len=300, tokenizer=No...
  function convert_pc_to_box (line 63) | def convert_pc_to_box(obj_pc):
  function random_word (line 76) | def random_word(tokens, tokens_mask, tokenizer, mask_ratio):
  function random_point_cloud (line 107) | def random_point_cloud(pcd, pcd_mask, mask_ratio):
  class LabelConverter (line 124) | class LabelConverter(object):
    method __init__ (line 125) | def __init__(self, file_path):
  function build_rotate_mat (line 163) | def build_rotate_mat(split, rot_aug=True, rand_angle='axis'):
  function eval_ref_one_sample (line 181) | def eval_ref_one_sample(pred_bbox, gt_bbox):
  function get_box3d_min_max (line 194) | def get_box3d_min_max(corner):
  function box3d_iou (line 212) | def box3d_iou(corners1, corners2):
  function transform_points (line 237) | def transform_points(points, transform, translate=True, mode="numpy"):
  function construct_bbox_corners (line 266) | def construct_bbox_corners(center, box_size):
  function is_explicitly_view_dependent (line 280) | def is_explicitly_view_dependent(tokens):
  class ScanQAAnswer (line 292) | class ScanQAAnswer(object):
    method __init__ (line 293) | def __init__(self, answers=None, unk_token='<unk>', ignore_idx=-100):
    method itos (line 301) | def itos(self, i):
    method stoi (line 306) | def stoi(self, v):
    method __len__ (line 311) | def __len__(self):
  class SQA3DAnswer (line 315) | class SQA3DAnswer(object):
    method __init__ (line 316) | def __init__(self, answers=None, unk_token='u'):
    method itos (line 324) | def itos(self, i):
    method stoi (line 329) | def stoi(self, v):
    method __len__ (line 334) | def __len__(self):
  function load_matrix_from_txt (line 338) | def load_matrix_from_txt(path, shape=(4, 4)):
  function pad_tensors (line 345) | def pad_tensors(tensors, lens=None, pad=0):
  function get_sqa_question_type (line 355) | def get_sqa_question_type(question):
  class Vocabulary (line 371) | class Vocabulary(object):
    method __init__ (line 372) | def __init__(self, path=None):
    method add_token (line 383) | def add_token(self, token, bert_id):
    method token_to_id (line 391) | def token_to_id(self, token):
    method id_to_token (line 394) | def id_to_token(self, id):
    method id_to_bert_id (line 397) | def id_to_bert_id(self, id):
    method save_vocab (line 400) | def save_vocab(self, path):
  function random_caption_word (line 406) | def random_caption_word(tokens, tokens_mask, tokenizer, vocab, mask_ratio):
  function clean_answer (line 428) | def clean_answer(data):

FILE: data/datasets/arkitscene.py
  class ARKitScenePretrainObj (line 8) | class ARKitScenePretrainObj(ScanBase):
    method __init__ (line 9) | def __init__(self, cfg, split):
    method __len__ (line 28) | def __len__(self):
    method __getitem__ (line 31) | def __getitem__(self, index):
  class ARKitSceneSpatialRefer (line 45) | class ARKitSceneSpatialRefer(ScanBase):
    method __init__ (line 46) | def __init__(self, cfg, split):
    method __len__ (line 77) | def __len__(self):
    method __getitem__ (line 80) | def __getitem__(self, index):

FILE: data/datasets/base.py
  class ScanBase (line 20) | class ScanBase(Dataset):
    method __init__ (line 21) | def __init__(self, cfg, split):
    method _load_split (line 52) | def _load_split(self, split):
    method _load_scan (line 65) | def _load_scan(self, scan_ids, filter_bkg=False):
    method _load_lang (line 144) | def _load_lang(self, cfg, scan_ids):
    method _getitem_pretrain (line 215) | def _getitem_pretrain(self, index, is_rscan=False):
    method _getitem_obj_pretrain (line 263) | def _getitem_obj_pretrain(self, index):
    method _getitem_refer (line 324) | def _getitem_refer(self, index):
    method _getitem_perscene (line 514) | def _getitem_perscene(self, index):
    method _obj_processing_post (line 697) | def _obj_processing_post(self, obj_pcds, obj_labels, is_need_bbox=Fals...
    method _obj_processing_aug (line 742) | def _obj_processing_aug(self, obj_pcds, obj_labels, is_need_bbox=False):
    method _scene_processing_aug (line 778) | def _scene_processing_aug(self, obj_pcds, bg_pcds, obj_labels, is_need...
    method _getitem_finalrefer (line 840) | def _getitem_finalrefer(self, index):

FILE: data/datasets/data_augmentor.py
  class DataAugmentor (line 8) | class DataAugmentor(object):
    method __init__ (line 9) | def __init__(self, cfg, split, **kwargs):
    method forward (line 23) | def forward(self, data_dict):
    method scene_aug (line 39) | def scene_aug(self, aug_dict, config):
    method obj_aug (line 105) | def obj_aug(self, aug_dict, config):
    method update_data_dict (line 156) | def update_data_dict(self, data_dict, aug_dict):
    method init_aug (line 218) | def init_aug(obj_len):
    method check_key (line 226) | def check_key(key):
    method check_p (line 241) | def check_p(key):
    method rot_fn (line 245) | def rot_fn(x, mat):
    method obj_rot_fn (line 249) | def obj_rot_fn(x, mat):
    method scaling_fn (line 254) | def scaling_fn(x, scale):
    method jitter_fn (line 259) | def jitter_fn(x, scale):
    method subsample_fn (line 263) | def subsample_fn(x, num_points):
    method pts_shuffle_fn (line 268) | def pts_shuffle_fn(x):

FILE: data/datasets/dataset_wrapper.py
  class MaskDatasetWrapper (line 16) | class MaskDatasetWrapper(Dataset):
    method __init__ (line 17) | def __init__(self, cfg, dataset, split="train"):
    method __len__ (line 35) | def __len__(self):
    method __getitem__ (line 38) | def __getitem__(self, idx):
    method collate_fn (line 113) | def collate_fn(self, batch_list):
  class ScanFamilyDatasetWrapperOld (line 119) | class ScanFamilyDatasetWrapperOld(Dataset):
    method __init__ (line 120) | def __init__(self, cfg, dataset, split="train"):
    method __len__ (line 134) | def __len__(self):
    method pad_tensors (line 137) | def pad_tensors(self, tensors, lens=None, pad=0):
    method __getitem__ (line 147) | def __getitem__(self, idx):
    method collate_fn (line 197) | def collate_fn(self, batch_list):
  class VisualizeDatasetWrapper (line 203) | class VisualizeDatasetWrapper(Dataset):
    method __init__ (line 204) | def __init__(self, cfg, dataset, split="train"):
    method __len__ (line 207) | def __len__(self):
    method __getitem__ (line 210) | def __getitem__(self, idx):
    method collate_fn (line 217) | def collate_fn(self, batch_list):

FILE: data/datasets/hm.py
  class HMPretrainObj (line 8) | class HMPretrainObj(ScanBase):
    method __init__ (line 9) | def __init__(self, cfg, split):
    method __len__ (line 28) | def __len__(self):
    method __getitem__ (line 31) | def __getitem__(self, index):
  class HMSpatialRefer (line 45) | class HMSpatialRefer(ScanBase):
    method __init__ (line 46) | def __init__(self, cfg, split):
    method __len__ (line 77) | def __len__(self):
    method __getitem__ (line 80) | def __getitem__(self, index):

FILE: data/datasets/multiscan.py
  class MultiScanPretrainObj (line 8) | class MultiScanPretrainObj(ScanBase):
    method __init__ (line 9) | def __init__(self, cfg, split):
    method __len__ (line 28) | def __len__(self):
    method __getitem__ (line 31) | def __getitem__(self, index):
  class MultiScanSpatialRefer (line 45) | class MultiScanSpatialRefer(ScanBase):
    method __init__ (line 46) | def __init__(self, cfg, split):
    method __len__ (line 77) | def __len__(self):
    method __getitem__ (line 80) | def __getitem__(self, index):

FILE: data/datasets/procthor.py
  class ProcThorPretrainObj (line 8) | class ProcThorPretrainObj(ScanBase):
    method __init__ (line 9) | def __init__(self, cfg, split):
    method __len__ (line 28) | def __len__(self):
    method __getitem__ (line 31) | def __getitem__(self, index):
  class ProcThorSpatialRefer (line 45) | class ProcThorSpatialRefer(ScanBase):
    method __init__ (line 46) | def __init__(self, cfg, split):
    method __len__ (line 77) | def __len__(self):
    method __getitem__ (line 80) | def __getitem__(self, index):

FILE: data/datasets/rscan.py
  class RScanPretrainObj (line 8) | class RScanPretrainObj(ScanBase):
    method __init__ (line 9) | def __init__(self, cfg, split):
    method __len__ (line 28) | def __len__(self):
    method __getitem__ (line 31) | def __getitem__(self, index):
  class RScanSpatialRefer (line 45) | class RScanSpatialRefer(ScanBase):
    method __init__ (line 46) | def __init__(self, cfg, split):
    method __len__ (line 77) | def __len__(self):
    method __getitem__ (line 80) | def __getitem__(self, index):

FILE: data/datasets/scannet.py
  class ScanNetPretrainObj (line 9) | class ScanNetPretrainObj(ScanBase):
    method __init__ (line 10) | def __init__(self, cfg, split):
    method __len__ (line 29) | def __len__(self):
    method __getitem__ (line 32) | def __getitem__(self, index):
  class ScanNetSpatialRefer (line 46) | class ScanNetSpatialRefer(ScanBase):
    method __init__ (line 47) | def __init__(self, cfg, split):
    method __len__ (line 78) | def __len__(self):
    method __getitem__ (line 81) | def __getitem__(self, index):

FILE: data/datasets/scannet_base.py
  class ScanNetBase (line 22) | class ScanNetBase(Dataset):
    method __init__ (line 23) | def __init__(self, cfg, split):
    method __len__ (line 40) | def __len__(self):
    method __getitem__ (line 43) | def __getitem__(self, index):
    method _load_one_scan (line 46) | def _load_one_scan(self, scan_id, pc_type = 'gt', load_inst_info = False,
    method _load_scannet (line 135) | def _load_scannet(self, scan_ids, pc_type = 'gt', load_inst_info = False,
    method _load_lang (line 170) | def _load_lang(self, cfg):
    method _load_split (line 221) | def _load_split(self, cfg, split, use_multi_process = False):
    method _load_inst_info (line 235) | def _load_inst_info(self, scan_id):
    method _obj_processing_post (line 263) | def _obj_processing_post(self, obj_pcds, obj_labels, is_need_bbox=Fals...
    method _obj_processing_aug (line 308) | def _obj_processing_aug(self, obj_pcds, obj_labels, is_need_bbox=False):
    method _scene_processing_aug (line 344) | def _scene_processing_aug(self, obj_pcds, bg_pcds, obj_labels, is_need...
    method _get_pooling_obj_feature (line 407) | def _get_pooling_obj_feature(self, args, mv_info_all, sampled_frame_na...
    method init_dataset_params (line 448) | def init_dataset_params(self, dataset_cfg):
    method init_scan_data (line 465) | def init_scan_data(self):
    method get_scene (line 479) | def get_scene(self, scan_id, tgt_object_id_list, tgt_object_name_list,...

FILE: data/datasets/scannet_old.py
  class ScanNetSQA3D (line 23) | class ScanNetSQA3D(ScanNetBase):
    method __init__ (line 50) | def __init__(self, cfg, split):
    method __getitem__ (line 94) | def __getitem__(self, index):
    method build_answer (line 237) | def build_answer(self):
    method _load_lang (line 252) | def _load_lang(self):
    method _load_question (line 270) | def _load_question(self):
  class ScanNetScanQAOld (line 287) | class ScanNetScanQAOld(ScanNetBase):
    method __init__ (line 288) | def __init__(self, cfg, split):
    method __getitem__ (line 326) | def __getitem__(self, index):
    method _load_lang (line 486) | def _load_lang(self):
    method build_answer (line 504) | def build_answer(self):

FILE: data/datasets/structure3d.py
  class S3DPretrainObj (line 8) | class S3DPretrainObj(ScanBase):
    method __init__ (line 9) | def __init__(self, cfg, split):
    method __len__ (line 28) | def __len__(self):
    method __getitem__ (line 31) | def __getitem__(self, index):
  class S3DSpatialRefer (line 45) | class S3DSpatialRefer(ScanBase):
    method __init__ (line 46) | def __init__(self, cfg, split):
    method __len__ (line 77) | def __len__(self):
    method __getitem__ (line 80) | def __getitem__(self, index):

FILE: evaluator/build.py
  class BaseEvaluator (line 11) | class BaseEvaluator():
    method __init__ (line 12) | def __init__(self, cfg, accelerator):
    method reset (line 19) | def reset(self):
    method batch_metrics (line 23) | def batch_metrics(self, data_dict, include_count=False):
    method update (line 26) | def update(self, data_dict):
    method record (line 33) | def record(self):
  function get_eval (line 57) | def get_eval(name, cfg, accelerator, **kwargs):
  function build_eval (line 65) | def build_eval(cfg, accelerator, **kwargs):

FILE: evaluator/objcls_eval.py
  class PretrainObjEval (line 8) | class PretrainObjEval(BaseEvaluator):
    method __init__ (line 9) | def __init__(self, cfg, accelerator, **kwargs):
    method batch_metrics (line 14) | def batch_metrics(self, data_dict, include_count=False):

FILE: evaluator/pretrain_eval.py
  class PretrainEval (line 8) | class PretrainEval(BaseEvaluator):
    method __init__ (line 9) | def __init__(self, cfg, accelerator, **kwargs):
    method batch_metrics (line 21) | def batch_metrics(self, data_dict):
    method update (line 69) | def update(self, data_dict):
    method record (line 77) | def record(self):
    method reset (line 88) | def reset(self):

FILE: evaluator/referit3d_eval.py
  class ReferIt3DEval (line 8) | class ReferIt3DEval(BaseEvaluator):
    method __init__ (line 9) | def __init__(self, cfg, accelerator, **kwargs):
    method batch_metrics (line 14) | def batch_metrics(self, data_dict, include_count=False):

FILE: evaluator/scanqa_eval.py
  class ScanQAEval (line 15) | class ScanQAEval(BaseEvaluator):
    method __init__ (line 16) | def __init__(self, cfg, accelerator, **kwargs):
    method batch_metrics (line 29) | def batch_metrics(self, data_dict, include_count=False):
  class ScanQAGenEval (line 85) | class ScanQAGenEval(ScanQAEval):
    method __init__ (line 86) | def __init__(self, cfg, accelerator, **kwargs):
    method batch_metrics (line 89) | def batch_metrics(self, data_dict, include_count=False):

FILE: evaluator/scanrefer_eval.py
  class ScanReferEval (line 8) | class ScanReferEval(BaseEvaluator):
    method __init__ (line 9) | def __init__(self, cfg, accelerator, **kwargs):
    method batch_metrics (line 14) | def batch_metrics(self, data_dict, include_count=False):

FILE: evaluator/sqa3d_eval.py
  class SQA3DEval (line 14) | class SQA3DEval():
    method __init__ (line 16) | def __init__(self, cfg, task_name):
    method update (line 49) | def update(self, data_dict):
    method batch_metrics (line 78) | def batch_metrics(self, data_dict):
    method reset (line 118) | def reset(self):
    method record (line 129) | def record(self, split='val'):

FILE: launch.py
  function parse_args (line 6) | def parse_args():
  function main (line 62) | def main():

FILE: model/build.py
  class BaseModel (line 8) | class BaseModel(nn.Module):
    method __init__ (line 9) | def __init__(self, cfg):
    method get_opt_params (line 12) | def get_opt_params(self):
  function build_model (line 16) | def build_model(cfg):

FILE: model/objcls.py
  class ObjCls (line 17) | class ObjCls(BaseModel):
    method __init__ (line 18) | def __init__(self, cfg):
    method forward (line 64) | def forward(self, data_dict):
    method get_opt_params (line 90) | def get_opt_params(self):

FILE: model/openvocab.py
  class OpenVocab (line 12) | class OpenVocab(BaseModel):
    method __init__ (line 13) | def __init__(self, cfg):
    method forward (line 26) | def forward(self, data_dict):
    method get_opt_params (line 103) | def get_opt_params(self):
  class OpenVocabPerScene (line 130) | class OpenVocabPerScene(BaseModel):
    method __init__ (line 131) | def __init__(self, cfg):
    method forward (line 141) | def forward(self, data_dict):
    method get_opt_params (line 232) | def get_opt_params(self):

FILE: modules/build.py
  function build_module (line 12) | def build_module(module_type, cfg):
  function build_module_by_name (line 24) | def build_module_by_name(cfg):

FILE: modules/grounding/unified_encoder.py
  class EntitySpatialCrossEncoder (line 13) | class EntitySpatialCrossEncoder(nn.Module):
    method __init__ (line 19) | def __init__(self, cfg, hidden_size=768, num_attention_heads=12, spati...
    method forward (line 37) | def forward(
  class UnifiedSpatialCrossEncoderV1 (line 61) | class UnifiedSpatialCrossEncoderV1(nn.Module):
    method __init__ (line 67) | def __init__(self, cfg, hidden_size=768, num_attention_heads=12, spati...
    method forward (line 90) | def forward(
  class UnifiedSpatialCrossEncoderV2 (line 122) | class UnifiedSpatialCrossEncoderV2(nn.Module):
    method __init__ (line 128) | def __init__(self, cfg, hidden_size=768, dim_feedforward=2048, num_att...
    method forward (line 147) | def forward(

FILE: modules/heads/grounding_head.py
  class GroundHeadV1 (line 8) | class GroundHeadV1(nn.Module):
    method __init__ (line 9) | def __init__(self, cfg, input_size=768, hidden_size=768, sem_cls_size=...
    method forward (line 29) | def forward(self, txt_embeds, obj_embeds, obj_pre_embeds, obj_masks, *...
  class GroundHead (line 43) | class GroundHead(nn.Module):
    method __init__ (line 44) | def __init__(self, cfg, input_size=768, hidden_size=768, dropout=0.3):
    method forward (line 51) | def forward(self, obj_embeds, obj_masks=None, **kwargs):

FILE: modules/heads/pretrain_head.py
  class BertPredictionHeadTransform (line 8) | class BertPredictionHeadTransform(nn.Module):
    method __init__ (line 9) | def __init__(self, hidden_size, hidden_act='gelu'):
    method forward (line 15) | def forward(self, hidden_states):
  class BertLMPredictionHead (line 22) | class BertLMPredictionHead(nn.Module):
    method __init__ (line 23) | def __init__(self, hidden_size, vocab_size):
    method forward (line 29) | def forward(self, hidden_states):
  class PretrainHeadV1 (line 36) | class PretrainHeadV1(nn.Module):
    method __init__ (line 37) | def __init__(self, cfg, hidden_size=768, vocab_size=30522):
    method forward (line 41) | def forward(self, txt_embeds, **kwargs):
  class OVPretrainHead (line 47) | class OVPretrainHead(nn.Module):
    method __init__ (line 48) | def __init__(self, cfg, hidden_size=768, vocab_size=30522, obj_vocab_s...
    method forward (line 53) | def forward(self, txt_embeds, obj_embeds, **kwargs):

FILE: modules/heads/qa_head.py
  class FC (line 8) | class FC(nn.Module):
    method __init__ (line 9) | def __init__(self, in_size, out_size, pdrop=0., use_gelu=True):
    method forward (line 20) | def forward(self, x):
  class MLP (line 30) | class MLP(nn.Module):
    method __init__ (line 31) | def __init__(self, in_size, mid_size, out_size, pdrop=0., use_gelu=True):
    method forward (line 36) | def forward(self, x):
  class AttFlat (line 40) | class AttFlat(nn.Module):
    method __init__ (line 41) | def __init__(self, hidden_size, flat_mlp_size=512, flat_glimpses=1, fl...
    method forward (line 56) | def forward(self, x, x_mask):
  class QAHeadV1 (line 73) | class QAHeadV1(nn.Module):
    method __init__ (line 74) | def __init__(self, cfg, hidden_size=768, mlp_size=256, glimpse=1, flat...
    method forward (line 86) | def forward(self, obj_embeds, obj_masks, txt_embeds, txt_masks, **kwar...

FILE: modules/language/bert.py
  class BERTLanguageEncoder (line 8) | class BERTLanguageEncoder(nn.Module):
    method __init__ (line 9) | def __init__(self, cfg, weights="bert-base-uncased", hidden_size=768,
    method forward (line 25) | def forward(self, txt_ids, txt_masks, **kwargs):

FILE: modules/language/clip.py
  class CLIPLanguageEncoder (line 10) | class CLIPLanguageEncoder(nn.Module):
    method __init__ (line 11) | def __init__(self, cfg, weights="openai/clip-vit-large-patch14", outpu...
    method forward (line 20) | def forward(self, txt_ids, txt_masks):

FILE: modules/layers/pointnet.py
  function break_up_pc (line 6) | def break_up_pc(pc):
  class PointNetPP (line 22) | class PointNetPP(nn.Module):
    method __init__ (line 28) | def __init__(self, sa_n_points: list,
    method forward (line 55) | def forward(self, features):

FILE: modules/layers/transformers.py
  class CrossAttentionLayer (line 12) | class CrossAttentionLayer(nn.Module):
    method __init__ (line 14) | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, ...
    method forward (line 39) | def forward(
  class TransformerDecoderLayer (line 66) | class TransformerDecoderLayer(nn.Module):
    method __init__ (line 67) | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, ...
    method forward (line 89) | def forward(
  class TransformerEncoderLayer (line 115) | class TransformerEncoderLayer(nn.Module):
    method __init__ (line 116) | def __init__(self, d_model, nhead, dim_feedforward=2048, batch_first=T...
    method forward (line 134) | def forward(
  class MultiHeadAttentionSpatial (line 157) | class MultiHeadAttentionSpatial(nn.Module):
    method __init__ (line 158) | def __init__(
    method forward (line 188) | def forward(self, q, k, v, pairwise_locs, key_padding_mask=None, txt_e...
  class TransformerSpatialDecoderLayer (line 242) | class TransformerSpatialDecoderLayer(TransformerDecoderLayer):
    method __init__ (line 243) | def __init__(
    method forward (line 258) | def forward(
  class TransformerSpatialEncoderLayer (line 285) | class TransformerSpatialEncoderLayer(TransformerEncoderLayer):
    method __init__ (line 286) | def __init__(
    method forward (line 301) | def forward(

FILE: modules/third_party/pointnet2/_ext_src/include/cuda_utils.h
  function opt_n_threads (line 15) | inline int opt_n_threads(int work_size) {
  function dim3 (line 21) | inline dim3 opt_block_config(int x, int y) {

FILE: modules/third_party/pointnet2/_ext_src/src/ball_query.cpp
  function ball_query (line 8) | at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float ra...

FILE: modules/third_party/pointnet2/_ext_src/src/bindings.cpp
  function PYBIND11_MODULE (line 6) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: modules/third_party/pointnet2/_ext_src/src/group_points.cpp
  function group_points (line 12) | at::Tensor group_points(at::Tensor points, at::Tensor idx) {
  function group_points_grad (line 38) | at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const ...

FILE: modules/third_party/pointnet2/_ext_src/src/interpolate.cpp
  function three_nn (line 14) | std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows) {
  function three_interpolate (line 42) | at::Tensor three_interpolate(at::Tensor points, at::Tensor idx,
  function three_interpolate_grad (line 71) | at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx,

FILE: modules/third_party/pointnet2/_ext_src/src/sampling.cpp
  function gather_points (line 15) | at::Tensor gather_points(at::Tensor points, at::Tensor idx) {
  function gather_points_grad (line 40) | at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx,
  function furthest_point_sampling (line 66) | at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples) {

FILE: modules/third_party/pointnet2/pointnet2_modules.py
  class _PointnetSAModuleBase (line 26) | class _PointnetSAModuleBase(nn.Module):
    method __init__ (line 28) | def __init__(self):
    method forward (line 34) | def forward(self, xyz: torch.Tensor,
  class PointnetSAModuleMSG (line 78) | class PointnetSAModuleMSG(_PointnetSAModuleBase):
    method __init__ (line 95) | def __init__(
  class PointnetSAModule (line 127) | class PointnetSAModule(PointnetSAModuleMSG):
    method __init__ (line 144) | def __init__(
  class PointnetSAModuleVotes (line 164) | class PointnetSAModuleVotes(nn.Module):
    method __init__ (line 168) | def __init__(
    method forward (line 210) | def forward(self, xyz: torch.Tensor,
  class PointnetSAModuleMSGVotes (line 274) | class PointnetSAModuleMSGVotes(nn.Module):
    method __init__ (line 278) | def __init__(
    method forward (line 309) | def forward(self, xyz: torch.Tensor,
  class PointnetFPModule (line 356) | class PointnetFPModule(nn.Module):
    method __init__ (line 367) | def __init__(self, *, mlp: List[int], bn: bool = True):
    method forward (line 371) | def forward(
  class PointnetLFPModuleMSG (line 418) | class PointnetLFPModuleMSG(nn.Module):
    method __init__ (line 422) | def __init__(
    method forward (line 454) | def forward(self, xyz2: torch.Tensor, xyz1: torch.Tensor,

FILE: modules/third_party/pointnet2/pointnet2_test.py
  function test_interpolation_grad (line 18) | def test_interpolation_grad():

FILE: modules/third_party/pointnet2/pointnet2_utils.py
  class RandomDropout (line 37) | class RandomDropout(nn.Module):
    method __init__ (line 38) | def __init__(self, p=0.5, inplace=False):
    method forward (line 43) | def forward(self, X):
  class FurthestPointSampling (line 48) | class FurthestPointSampling(Function):
    method forward (line 50) | def forward(ctx, xyz, npoint):
    method backward (line 73) | def backward(xyz, a=None):
  class GatherOperation (line 80) | class GatherOperation(Function):
    method forward (line 82) | def forward(ctx, features, idx):
    method backward (line 107) | def backward(ctx, grad_out):
  class ThreeNN (line 117) | class ThreeNN(Function):
    method forward (line 119) | def forward(ctx, unknown, known):
    method backward (line 142) | def backward(ctx, a=None, b=None):
  class ThreeInterpolate (line 149) | class ThreeInterpolate(Function):
    method forward (line 151) | def forward(ctx, features, idx, weight):
    method backward (line 177) | def backward(ctx, grad_out):
  class GroupingOperation (line 206) | class GroupingOperation(Function):
    method forward (line 208) | def forward(ctx, features, idx):
    method backward (line 232) | def backward(ctx, grad_out):
  class BallQuery (line 257) | class BallQuery(Function):
    method forward (line 259) | def forward(ctx, radius, nsample, xyz, new_xyz):
    method backward (line 284) | def backward(ctx, a=None):
  class QueryAndGroup (line 291) | class QueryAndGroup(nn.Module):
    method __init__ (line 303) | def __init__(self, radius, nsample, use_xyz=True, ret_grouped_xyz=Fals...
    method forward (line 314) | def forward(self, xyz, new_xyz, features=None):
  class GroupAll (line 376) | class GroupAll(nn.Module):
    method __init__ (line 384) | def __init__(self, use_xyz=True, ret_grouped_xyz=False):
    method forward (line 389) | def forward(self, xyz, new_xyz, features=None):

FILE: modules/third_party/pointnet2/pytorch_utils.py
  class SharedMLP (line 11) | class SharedMLP(nn.Sequential):
    method __init__ (line 13) | def __init__(
  class _BNBase (line 39) | class _BNBase(nn.Sequential):
    method __init__ (line 41) | def __init__(self, in_size, batch_norm=None, name=""):
  class BatchNorm1d (line 49) | class BatchNorm1d(_BNBase):
    method __init__ (line 51) | def __init__(self, in_size: int, *, name: str = ""):
  class BatchNorm2d (line 55) | class BatchNorm2d(_BNBase):
    method __init__ (line 57) | def __init__(self, in_size: int, name: str = ""):
  class BatchNorm3d (line 61) | class BatchNorm3d(_BNBase):
    method __init__ (line 63) | def __init__(self, in_size: int, name: str = ""):
  class _ConvBase (line 67) | class _ConvBase(nn.Sequential):
    method __init__ (line 69) | def __init__(
  class Conv1d (line 123) | class Conv1d(_ConvBase):
    method __init__ (line 125) | def __init__(
  class Conv2d (line 157) | class Conv2d(_ConvBase):
    method __init__ (line 159) | def __init__(
  class Conv3d (line 191) | class Conv3d(_ConvBase):
    method __init__ (line 193) | def __init__(
  class FC (line 225) | class FC(nn.Sequential):
    method __init__ (line 227) | def __init__(
  function set_bn_momentum_default (line 262) | def set_bn_momentum_default(bn_momentum):
  class BNMomentumScheduler (line 271) | class BNMomentumScheduler(object):
    method __init__ (line 273) | def __init__(
    method step (line 291) | def step(self, epoch=None):

FILE: modules/utils.py
  function get_activation_fn (line 12) | def get_activation_fn(activation_type):
  function get_mlp_head (line 18) | def get_mlp_head(input_size, hidden_size, output_size, dropout=0):
  function layer_repeat (line 28) | def layer_repeat(module, N, share_layer=False):
  function calc_pairwise_locs (line 38) | def calc_pairwise_locs(obj_centers, obj_whls, eps=1e-10, pairwise_rel_ty...
  function calc_pairwise_locs_mv (line 89) | def calc_pairwise_locs_mv(obj_centers, pairwise_rel_type='center', spati...
  function get_mixup_function (line 117) | def get_mixup_function(mixup_strategy, mixup_stage1, mixup_stage2):
  class AllMixup (line 128) | class AllMixup(nn.Module):
    method __init__ (line 129) | def __init__(self) -> None:
    method forward (line 132) | def forward(self, obj_sem_cls_pred, obj_labels, cur_step, total_steps):
  class LinearDecayMixup (line 141) | class LinearDecayMixup(nn.Module):
    method __init__ (line 142) | def __init__(self, mixup_stage1, mixup_stage2) -> None:
    method forward (line 148) | def forward(self, obj_sem_cls_pred, obj_labels, cur_step, total_steps):

FILE: modules/vision/obj_cls_encoder.py
  class ObjClsEncoder (line 6) | class ObjClsEncoder(nn.Module):
    method __init__ (line 7) | def __init__(self, cfg, input_feat_size=768, hidden_size=768, tgt_cls_...
    method forward (line 12) | def forward(self, obj_feats, **kwargs):

FILE: modules/vision/pcd_openvocab_encoder.py
  class PointOpenVocabEncoder (line 17) | class PointOpenVocabEncoder(nn.Module):
    method __init__ (line 18) | def __init__(self, cfg, backbone='pointnet++', hidden_size=768, path=N...
    method freeze_bn (line 121) | def freeze_bn(self, m):
    method forward (line 126) | def forward(self, obj_pcds, obj_locs, obj_masks, obj_sem_masks,

FILE: modules/weights.py
  function _init_weights_bert (line 3) | def _init_weights_bert(module, std=0.02):

FILE: optim/build.py
  function build_optim (line 6) | def build_optim(cfg, params, total_steps):

FILE: optim/loss/contra_loss.py
  class TextObjWithinBatch (line 12) | class TextObjWithinBatch(nn.Module):
    method __init__ (line 13) | def __init__(self, cfg):
    method forward (line 18) | def forward(self, data_dict):
  class TextObjBetweenBatch (line 43) | class TextObjBetweenBatch(nn.Module):
    method __init__ (line 44) | def __init__(self, cfg):
    method forward (line 49) | def forward(self, data_dict):
  class TextSceneBetweenBatch (line 75) | class TextSceneBetweenBatch(nn.Module):
    method __init__ (line 76) | def __init__(self, cfg):
    method forward (line 81) | def forward(self, data_dict):

FILE: optim/loss/loss.py
  function og3d_loss (line 8) | def og3d_loss(data_dict):
  function og3d_multi_loss (line 12) | def og3d_multi_loss(data_dict):
  function txt_cls_multi_loss (line 19) | def txt_cls_multi_loss(data_dict):
  function obj_cls_raw_loss (line 26) | def obj_cls_raw_loss(data_dict):
  function obj_cls_pre_loss (line 34) | def obj_cls_pre_loss(data_dict):
  function obj_cls_post_loss (line 42) | def obj_cls_post_loss(data_dict):
  function answer_loss (line 50) | def answer_loss(data_dict):
  function lm_cls_loss (line 56) | def lm_cls_loss(data_dict):
  function obj_cls_pre_loss_mask (line 64) | def obj_cls_pre_loss_mask(data_dict):
  function obj_cls_pre_loss_unmask (line 72) | def obj_cls_pre_loss_unmask(data_dict):
  function obj_cls_post_loss_mask (line 80) | def obj_cls_post_loss_mask(data_dict):
  function obj_cls_post_loss_unmask (line 88) | def obj_cls_post_loss_unmask(data_dict):
  function obj_cls_loss (line 96) | def obj_cls_loss(data_dict, smoothing=0.3):
  function mse_loss (line 105) | def mse_loss(data_dict):
  class Loss (line 111) | class Loss(nn.Module):
    method __init__ (line 112) | def __init__(self, cfg):
    method forward (line 131) | def forward(self, data_dict):

FILE: optim/optimizer/optim.py
  function get_optimizer (line 9) | def get_optimizer(cfg, params):

FILE: optim/scheduler.py
  function warmup_cosine (line 5) | def warmup_cosine(step, warmup_step, total_step, minimum_ratio=1e-5):
  function warmup_exp (line 14) | def warmup_exp(step, warmup_step, total_step, **kwargs):
  function get_scheduler (line 20) | def get_scheduler(cfg, optimizer, total_steps):

FILE: optim/utils.py
  function no_decay_param_group (line 1) | def no_decay_param_group(parameters, lr):

FILE: preprocess/arkitscenes.py
  class ARKitScenesProcessor (line 18) | class ARKitScenesProcessor(ProcessorBase):
    method record_splits (line 19) | def record_splits(self, scan_ids):
    method read_all_scans (line 33) | def read_all_scans(self):
    method process_point_cloud (line 40) | def process_point_cloud(self, scan_id, plydata, annotations):
    method scene_proc (line 87) | def scene_proc(self, scan_id):
    method process_scans (line 104) | def process_scans(self):

FILE: preprocess/build.py
  class ProcessorBase (line 7) | class ProcessorBase:
    method __init__ (line 8) | def __init__(self, cfg):
    method setup_directories (line 22) | def setup_directories(self):
    method log_starting_info (line 27) | def log_starting_info(self, scan_len, e=''):
    method check_key (line 38) | def check_key(key):

FILE: preprocess/multiscan.py
  class MultiScanProcessor (line 18) | class MultiScanProcessor(ProcessorBase):
    method record_splits (line 19) | def record_splits(self, scan_ids, ratio=0.8):
    method read_all_scans (line 37) | def read_all_scans(self):
    method process_point_cloud (line 49) | def process_point_cloud(self, scan_id, plydata, annotations):
    method annotations_to_dataframe_obj (line 96) | def annotations_to_dataframe_obj(annotations):
    method scene_proc (line 110) | def scene_proc(self, scan_id):
    method process_scans (line 119) | def process_scans(self):

FILE: preprocess/rscan.py
  class RScanProcessor (line 18) | class RScanProcessor(ProcessorBase):
    method record_splits (line 19) | def record_splits(self, scan_ids, ratio=0.8):
    method read_all_scans (line 37) | def read_all_scans(self):
    method process_point_cloud (line 42) | def process_point_cloud(self, scan_id, plydata, annotations):
    method scene_proc (line 113) | def scene_proc(self, scan_id):
    method process_scans (line 127) | def process_scans(self):

FILE: preprocess/sceneverse2hmsemantic.py
  function load_semantic_anno (line 10) | def load_semantic_anno(semantic_txt):
  function scene_proc (line 34) | def scene_proc(scene_input):

FILE: preprocess/ssg/relationships/camera.py
  function getLinearEquation (line 5) | def getLinearEquation(p1x, p1y, p2x, p2y):
  function cal_glocal_position (line 16) | def cal_glocal_position(object, floor, distance_rate=1.6):
  function cal_camera_relations (line 34) | def cal_camera_relations(ObjNode_dict, camera_position, camera_view, ins...

FILE: preprocess/ssg/relationships/hanging.py
  function cal_above_below_relationships (line 4) | def cal_above_below_relationships(ObjNode_dict, src, scene_high):
  function filter_labels (line 31) | def filter_labels(obj_label):
  function cal_hanging_relationships (line 40) | def cal_hanging_relationships (ObjNode_dict, no_supported_objs, camera_a...

FILE: preprocess/ssg/relationships/multi_objs.py
  function are_furniture_aligned (line 8) | def are_furniture_aligned(furniture1, furniture2, offset_threshold):
  function find_aligned_furniture (line 41) | def find_aligned_furniture(furniture_list, ObjNode_dict, offset_threshold):
  function furniture_merge_lists (line 59) | def furniture_merge_lists(lists):
  function merge_sublists (line 74) | def merge_sublists(L):
  function find_middle_furniture (line 89) | def find_middle_furniture (proximity_relations, ObjNode_dict):

FILE: preprocess/ssg/relationships/proximity.py
  function get_direction (line 5) | def get_direction(src_obj, tgt_obj):
  function get_oppo_direction (line 35) | def get_oppo_direction(direction):
  function get_space_relations (line 46) | def get_space_relations(src, tgt):
  function get_distance (line 55) | def get_distance(src, tgt):
  function cal_proximity_relationships (line 63) | def cal_proximity_relationships(neighbor_objs_id, camera_angle, ObjNode_...

FILE: preprocess/ssg/relationships/support.py
  function is_supported (line 4) | def is_supported(target_obj, obj, camera_angle, radius_range = 0.1, thre...
  function optimaze_support_loops (line 50) | def optimaze_support_loops(support_relations_dict):
  function cal_support_relations (line 63) | def cal_support_relations(ObjNode_list, camera_angle):

FILE: preprocess/ssg/ssg_data/script/ObjNode.py
  class ObjNode (line 8) | class ObjNode(object):
    method __init__ (line 9) | def __init__(self, id=None, label=None, mesh=None, position=None, size...
    method __str__ (line 20) | def __str__(self):
    method get_object_information (line 23) | def get_object_information(self, dataset):
    method display_obb_box (line 39) | def display_obb_box(self, scene_visible = True):

FILE: preprocess/ssg/ssg_data/ssg_visualize.py
  function vis_dataset (line 6) | def vis_dataset(ObjNode_dict, relation, scene_path, scan_id, scene_center):

FILE: preprocess/ssg/ssg_main.py
  function default_dump (line 21) | def default_dump(obj):
  function convert_pc_to_box (line 30) | def convert_pc_to_box(obj_pc):
  function init_camera_view (line 41) | def init_camera_view():
  function filter_bad_label (line 53) | def filter_bad_label(input_label):
  function get_obj_room_id (line 61) | def get_obj_room_id (org_id):
  function generate_object_info (line 68) | def generate_object_info(save_root, scene_name) :
  function generate_ssg_data (line 112) | def generate_ssg_data(dataset, scene_path, pre_load_path):
  function main (line 132) | def main(cfg):

FILE: preprocess/ssg/ssg_utils.py
  function cw_rotate (line 12) | def cw_rotate(point, ang):
  function euclideanDistance (line 19) | def euclideanDistance(instance1, instance2, dimension):
  function if_inPoly (line 26) | def if_inPoly(polygon, Points):
  function get_Poly_Area (line 32) | def get_Poly_Area(polygon):
  function get_theta (line 38) | def get_theta (x, y):
  function generate_relation (line 55) | def generate_relation(src, tgt, express):
  function visualize_relations (line 82) | def visualize_relations(target_obj, obj, relationship, camera_angle, cam...
  function visualize_relations_multi_objs (line 132) | def visualize_relations_multi_objs(objs, relationship, item, camera_angl...
  function render_bbox_pyvista (line 196) | def render_bbox_pyvista(tgt, src, relationship, camera_angle, camera_pos...
  function visualize_camera_relations (line 276) | def visualize_camera_relations(ObjNode_dict, camera_relations, camera_po...
  function read_one_obj (line 323) | def read_one_obj(bbox_points, scene_file):

FILE: preprocess/structured3d.py
  class S3DProcessor (line 18) | class S3DProcessor(ProcessorBase):
    method record_splits (line 19) | def record_splits(self, scan_ids):
    method read_all_scans (line 33) | def read_all_scans(self):
    method process_point_cloud (line 40) | def process_point_cloud(self, scan_id, plydata, annotations):
    method scene_proc (line 76) | def scene_proc(self, scan_id):
    method process_scans (line 92) | def process_scans(self):

FILE: preprocess/utils/align_utils.py
  function compute_box_3d (line 4) | def compute_box_3d(size, center, rotmat):
  function rotate_z_axis_by_degrees (line 28) | def rotate_z_axis_by_degrees(pointcloud, theta, clockwise=True):
  function eulerAnglesToRotationMatrix (line 40) | def eulerAnglesToRotationMatrix(theta):
  function is_axis_aligned (line 79) | def is_axis_aligned(rotated_box, thres=0.05):
  function calc_align_matrix (line 85) | def calc_align_matrix(bbox_list):

FILE: preprocess/utils/constant.py
  class PromptType (line 641) | class PromptType(Enum):

FILE: run.py
  function main (line 13) | def main(cfg):

FILE: trainer/build.py
  class Tracker (line 29) | class Tracker():
    method __init__ (line 30) | def __init__(self, cfg):
    method step (line 33) | def step(self):
    method reset (line 36) | def reset(self, cfg):
    method state_dict (line 41) | def state_dict(self):
    method load_state_dict (line 44) | def load_state_dict(self, state_dict):
  class BaseTrainer (line 48) | class BaseTrainer():
    method __init__ (line 49) | def __init__(self, cfg):
    method forward (line 135) | def forward(self, data_dict):
    method backward (line 138) | def backward(self, loss):
    method log (line 147) | def log(self, results, mode="train"):
    method save (line 160) | def save(self, name):
    method resume (line 164) | def resume(self):
    method load_pretrain (line 174) | def load_pretrain(self):
    method save_func (line 186) | def save_func(self, path):
  function build_trainer (line 190) | def build_trainer(cfg):

FILE: trainer/debug_trainer.py
  class DebugTrainer (line 10) | class DebugTrainer(BaseTrainer):
    method __init__ (line 11) | def __init__(self, cfg):
    method forward (line 15) | def forward(self, data_dict):
    method backward (line 18) | def backward(self, loss):
    method train_step (line 26) | def train_step(self, epoch):
    method eval_step (line 39) | def eval_step(self, epoch):
    method test_step (line 48) | def test_step(self):
    method run (line 56) | def run(self):

FILE: trainer/default_trainer.py
  class DefaultTrainer (line 10) | class DefaultTrainer(BaseTrainer):
    method __init__ (line 11) | def __init__(self, cfg):
    method forward (line 15) | def forward(self, data_dict):
    method backward (line 18) | def backward(self, loss):
    method train_step (line 26) | def train_step(self, epoch):
    method eval_step (line 51) | def eval_step(self, epoch):
    method test_step (line 67) | def test_step(self):
    method run (line 80) | def run(self):

FILE: trainer/objpretrain_trainer.py
  class ObjPretrainTrainer (line 10) | class ObjPretrainTrainer(BaseTrainer):
    method __init__ (line 11) | def __init__(self, cfg):
    method forward (line 15) | def forward(self, data_dict):
    method backward (line 18) | def backward(self, loss):
    method train_step (line 26) | def train_step(self, epoch):
    method eval_step (line 49) | def eval_step(self, epoch):
    method test_step (line 70) | def test_step(self):
    method run (line 87) | def run(self):

FILE: trainer/openvocab_trainer.py
  class OpenVocabTrainer (line 10) | class OpenVocabTrainer(BaseTrainer):
    method __init__ (line 11) | def __init__(self, cfg):
    method forward (line 15) | def forward(self, data_dict):
    method backward (line 18) | def backward(self, loss):
    method train_step (line 26) | def train_step(self, epoch):
    method eval_step (line 49) | def eval_step(self, epoch):
    method test_step (line 65) | def test_step(self):
    method run (line 82) | def run(self):

FILE: visualize_data.py
  function convert_pc_to_box (line 11) | def convert_pc_to_box(obj_pc):
  function load_scan (line 23) | def load_scan(pcd_path, inst2label_path, scene_name):
  function visualize_one_scene (line 31) | def visualize_one_scene(obj_pcds, points, colors, caption):
  function visualize_data (line 47) | def visualize_data(save_root, scene_name, vis_obj=True):
  function visualize_refer (line 70) | def visualize_refer(save_root, anno_file):

Download .json

Condensed preview — 161 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (918K chars).

[
  {
    "path": ".gitignore",
    "chars": 3078,
    "preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packagi"
  },
  {
    "path": "DATA.md",
    "chars": 4601,
    "preview": "\n## Data\n\n* Note: As some of our users requested the mapping between HM3D object id in SceneVerse to HM3D-semantics, we "
  },
  {
    "path": "LICENSE",
    "chars": 1068,
    "preview": "MIT License\n\nCopyright (c) 2024 scene-verse\n\nPermission is hereby granted, free of charge, to any person obtaining a cop"
  },
  {
    "path": "README.md",
    "chars": 12017,
    "preview": "<h2 align=\"center\">\n  <span><img src=\"assets/logo025.png\" width=\"4%\" style=\"transform: translate(0,9px)\"></span>\n  <b>Sc"
  },
  {
    "path": "TRAIN.md",
    "chars": 10443,
    "preview": "# Training and Inference\r\n\r\n## Environment Setup\r\nTo install the environment requirements needed for SceneVerse, you can"
  },
  {
    "path": "common/box_utils.py",
    "chars": 2502,
    "preview": "import numpy as np\n\n\ndef box3d_iou(corners1, corners2):\n    ''' Compute 3D bounding box IoU.\n\n    Input:\n        corners"
  },
  {
    "path": "common/dist_utils.py",
    "chars": 6357,
    "preview": "import functools\nimport pickle\nimport torch\nimport torch.distributed as dist\n\nimport logging\nlogger = logging.getLogger("
  },
  {
    "path": "common/io_utils.py",
    "chars": 3716,
    "preview": "import csv\nimport pickle\nimport json\nimport cv2\nimport yaml\nimport numpy as np\nfrom pathlib import Path\nimport torch\nimp"
  },
  {
    "path": "common/launch_utils.py",
    "chars": 4773,
    "preview": "import os\nfrom pathlib import Path\nimport subprocess\n\nimport submitit\n\n\nhuggingface_fix = f\"TRANSFORMERS_OFFLINE=1 CURL_"
  },
  {
    "path": "common/misc.py",
    "chars": 5720,
    "preview": "import os\nimport glob\nimport importlib\nimport functools\nimport torch\nfrom typing import Any\nfrom accelerate.logging impo"
  },
  {
    "path": "common/type_utils.py",
    "chars": 1017,
    "preview": "import torch\n\nfrom omegaconf import OmegaConf\n\n\ndef cfg2dict(cfg):\n    return OmegaConf.to_container(cfg, resolve=True)\n"
  },
  {
    "path": "configs/final/all_anno.yaml",
    "chars": 6194,
    "preview": "###\n# Pretrain with human annotation only\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: 8"
  },
  {
    "path": "configs/final/all_nomlm.yaml",
    "chars": 6757,
    "preview": "###\n# Pretrain on all data without MLM loss\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu:"
  },
  {
    "path": "configs/final/all_noobj.yaml",
    "chars": 6571,
    "preview": "###\n# Pretrain on all data without object-level alignment\n###\n\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_se"
  },
  {
    "path": "configs/final/all_noscene.yaml",
    "chars": 6774,
    "preview": "###\n# Pretrain on all data without scene-level alignment\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed"
  },
  {
    "path": "configs/final/all_pretrain.yaml",
    "chars": 6886,
    "preview": "###\n# Pretrain on all data with all losses\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: "
  },
  {
    "path": "configs/final/all_pretrain_125.yaml",
    "chars": 6649,
    "preview": "###\n# Pretrain on 12.5% of all data\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: 8\nmode:"
  },
  {
    "path": "configs/final/all_pretrain_25.yaml",
    "chars": 6645,
    "preview": "###\n# Pretrain on 25% of all data\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: 8\nmode: \""
  },
  {
    "path": "configs/final/all_pretrain_50.yaml",
    "chars": 6644,
    "preview": "###\n# Pretrain on 50% of all data\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: 8\nmode: \""
  },
  {
    "path": "configs/final/all_pretrain_75.yaml",
    "chars": 6641,
    "preview": "###\n# Pretrain on 75% all data\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: 8\nmode: \"tra"
  },
  {
    "path": "configs/final/all_pretrain_objcap.yaml",
    "chars": 7071,
    "preview": "###\n# Pretrain on all data adding all object captions\n###\n\n# Experiment general info\nname: \"Debug\"\nrng_seed: 42\nnum_gpu:"
  },
  {
    "path": "configs/final/all_pretrain_objcap_notemplate.yaml",
    "chars": 7114,
    "preview": "###\n# Pretrain on all data without template-based object captions\n###\n\n# Experiment general info\nname: \"OV_w_Cap\"\nrng_se"
  },
  {
    "path": "configs/final/all_pretrain_s3d.yaml",
    "chars": 7168,
    "preview": "###\n# Pretrain on all data with Structured 3D\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gp"
  },
  {
    "path": "configs/final/all_pretrain_unfreeze.yaml",
    "chars": 6767,
    "preview": "###\n# Pretrain on all data with object encoder unfrozen\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed:"
  },
  {
    "path": "configs/final/all_rewrite.yaml",
    "chars": 6581,
    "preview": "###\n# Pretrain on all LLM-refined data only\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu:"
  },
  {
    "path": "configs/final/all_template.yaml",
    "chars": 6433,
    "preview": "###\n# Pretrain on all template-based generated data only\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed"
  },
  {
    "path": "configs/final/all_wo_both.yaml",
    "chars": 6617,
    "preview": "###\n# Pretrain on all data without ScanNet and MultiScan data\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng"
  },
  {
    "path": "configs/final/all_wo_both_125.yaml",
    "chars": 6649,
    "preview": "###\n# Pretrain on 12.5% of all data without ScanNet and MultiScan\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\""
  },
  {
    "path": "configs/final/all_wo_both_25.yaml",
    "chars": 6645,
    "preview": "###\n# Pretrain on 25% of all data without ScanNet and MultiScan\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nr"
  },
  {
    "path": "configs/final/all_wo_both_50.yaml",
    "chars": 6645,
    "preview": "###\n# Pretrain on 50% of all data without ScanNet and MultiScan\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nr"
  },
  {
    "path": "configs/final/all_wo_multiscan.yaml",
    "chars": 6461,
    "preview": "###\n# Pretrain on all data without MultiScan\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu"
  },
  {
    "path": "configs/final/all_wo_scannet.yaml",
    "chars": 6612,
    "preview": "###\n# Pretrain on all data without ScanNet\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: "
  },
  {
    "path": "configs/final/debug.yaml",
    "chars": 7376,
    "preview": "###\n# Debugging\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: 1\nmode: \"train\"\nnote: \"\"\n# "
  },
  {
    "path": "configs/final/finetune/multiscan_finetune.yaml",
    "chars": 6512,
    "preview": "###\n# Finetune on MultiScan\n###\n\n# Experiment general info\nname: \"FinalOVFinetune\"\nrng_seed: 42\nnum_gpu: 2\nmode: \"train\""
  },
  {
    "path": "configs/final/finetune/multiscan_woL.yaml",
    "chars": 6608,
    "preview": "###\n# Finetune on MultiScan (unseen language)\n###\n\n# Experiment general info\nname: \"FinalOVFinetune\"\nrng_seed: 42\nnum_gp"
  },
  {
    "path": "configs/final/finetune/nr3d_finetune.yaml",
    "chars": 6850,
    "preview": "###\n# Finetune on Nr3D\n###\n\n# Experiment general info\nname: \"FinalOVFinetune\"\nrng_seed: 42\nnum_gpu: 2\nmode: \"train\"\nnote"
  },
  {
    "path": "configs/final/finetune/scannet_woL.yaml",
    "chars": 7048,
    "preview": "###\n# Finetune on ScanRefer (unseen language)\n###\n\n# Experiment general info\nname: \"FinalOVFinetune\"\nrng_seed: 42\nnum_gp"
  },
  {
    "path": "configs/final/finetune/scanqa_finetune.yaml",
    "chars": 3461,
    "preview": "# Experiment general info\nname: \"OV_ScanQA\"\nrng_seed: 42\nnum_gpu: 2\nmode: \"train\"\nnote: \"\"\n# Choose keywords to feature "
  },
  {
    "path": "configs/final/finetune/scanrefer_finetune.yaml",
    "chars": 6863,
    "preview": "###\n# Finetune on ScanRefer\n###\n\n# Experiment general info\nname: \"FinalOVFinetune\"\nrng_seed: 42\nnum_gpu: 2\nmode: \"train\""
  },
  {
    "path": "configs/final/finetune/sqa3d_finetune.yaml",
    "chars": 3332,
    "preview": "# Experiment general info\nname: \"OV_SQA3D\"\nrng_seed: 42\nnum_gpu: 2\nmode: \"train\"\nnote: \"\"\n# Choose keywords to feature y"
  },
  {
    "path": "configs/final/finetune/sr3d_finetune.yaml",
    "chars": 6851,
    "preview": "###\n# Finetune on Sr3D\n###\n\n# Experiment general info\nname: \"FinalOVFinetune\"\nrng_seed: 42\nnum_gpu: 2\nmode: \"train\"\nnote"
  },
  {
    "path": "configs/final/multiscan_only.yaml",
    "chars": 6774,
    "preview": "###\n# MultiScan pretrain from scratch\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: 8\nmod"
  },
  {
    "path": "configs/final/nr3d_only.yaml",
    "chars": 6987,
    "preview": "###\n# Nr3D pretrain from scratch\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: 8\nmode: \"t"
  },
  {
    "path": "configs/final/procthor_only.yaml",
    "chars": 7278,
    "preview": "###\n# ProcTHOR pretrain from scratch\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: 8\nmode"
  },
  {
    "path": "configs/final/s3d_only.yaml",
    "chars": 7015,
    "preview": "###\n# Structured3D pretrain from scratch\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: 8\n"
  },
  {
    "path": "configs/final/scanrefer_only.yaml",
    "chars": 7006,
    "preview": "###\n# ScanRefer pretrain from scratch\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: 8\nmod"
  },
  {
    "path": "configs/final/scanrefer_only_gttest.yaml",
    "chars": 6887,
    "preview": "###\n# ScanRefer pretrain from scratch\n###\n\n# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: 8\nmod"
  },
  {
    "path": "configs/final/sr3d_only.yaml",
    "chars": 6949,
    "preview": "# Experiment general info\nname: \"FinalOVPretrain\"\nrng_seed: 42\nnum_gpu: 8\nmode: \"train\"\nnote: \"\"\n# Choose keywords to fe"
  },
  {
    "path": "data/__init__.py",
    "chars": 50,
    "preview": "from .datasets import *\nfrom .data_utils import *\n"
  },
  {
    "path": "data/build.py",
    "chars": 3594,
    "preview": "from omegaconf import OmegaConf\nfrom torch.utils.data import DataLoader, default_collate, ConcatDataset\nfrom fvcore.comm"
  },
  {
    "path": "data/data_utils.py",
    "chars": 17643,
    "preview": "import random\nimport csv\nfrom collections import Counter\nimport re\n\nimport numpy as np\nimport torch\n\nfrom data.datasets."
  },
  {
    "path": "data/datasets/__init__.py",
    "chars": 164,
    "preview": "from .scannet import *\nfrom .rscan import *\nfrom .arkitscene import *\nfrom .hm import *\nfrom .multiscan import *\nfrom .p"
  },
  {
    "path": "data/datasets/arkitscene.py",
    "chars": 3462,
    "preview": "import collections\n\nfrom ..build import DATASET_REGISTRY\nfrom .base import ScanBase\n\n\n@DATASET_REGISTRY.register()\nclass"
  },
  {
    "path": "data/datasets/base.py",
    "chars": 50926,
    "preview": "import os\nimport copy\nimport json\nimport jsonlines\nimport random\n\nfrom tqdm import tqdm\nimport numpy as np\nfrom scipy im"
  },
  {
    "path": "data/datasets/constant.py",
    "chars": 16398,
    "preview": "### ScanNet200 Benchmark constants ###\nVALID_CLASS_IDS_200 = (\n    1,\n    2,\n    3,\n    4,\n    5,\n    6,\n    7,\n    8,\n "
  },
  {
    "path": "data/datasets/data_augmentor.py",
    "chars": 11823,
    "preview": "from functools import partial\n\nimport math\nimport numpy as np\nimport torch\n\n\nclass DataAugmentor(object):\n    def __init"
  },
  {
    "path": "data/datasets/dataset_wrapper.py",
    "chars": 10924,
    "preview": "import random\n\nimport torch\nfrom fvcore.common.registry import Registry\nfrom transformers import BertTokenizer, T5Tokeni"
  },
  {
    "path": "data/datasets/hm.py",
    "chars": 3372,
    "preview": "import collections\n\nfrom ..build import DATASET_REGISTRY\nfrom .base import ScanBase\n\n\n@DATASET_REGISTRY.register()\nclass"
  },
  {
    "path": "data/datasets/multiscan.py",
    "chars": 3449,
    "preview": "import collections\n\nfrom ..build import DATASET_REGISTRY\nfrom .base import ScanBase\n\n\n@DATASET_REGISTRY.register()\nclass"
  },
  {
    "path": "data/datasets/procthor.py",
    "chars": 3436,
    "preview": "import collections\n\nfrom ..build import DATASET_REGISTRY\nfrom .base import ScanBase\n\n\n@DATASET_REGISTRY.register()\nclass"
  },
  {
    "path": "data/datasets/rscan.py",
    "chars": 3427,
    "preview": "import collections\n\nfrom ..build import DATASET_REGISTRY\nfrom .base import ScanBase\n\n\n@DATASET_REGISTRY.register()\nclass"
  },
  {
    "path": "data/datasets/scannet.py",
    "chars": 3460,
    "preview": "import os\nimport collections\n\nfrom ..build import DATASET_REGISTRY\nfrom .base import ScanBase\n\n\n@DATASET_REGISTRY.regist"
  },
  {
    "path": "data/datasets/scannet_base.py",
    "chars": 31162,
    "preview": "import os\nimport collections\nimport json\nimport pickle\nimport random\n\nimport jsonlines\nfrom tqdm import tqdm\nfrom scipy "
  },
  {
    "path": "data/datasets/scannet_old.py",
    "chars": 23261,
    "preview": "import os\nimport collections\nimport json\nimport random\n\nimport jsonlines\nfrom tqdm import tqdm\nimport numpy as np\nimport"
  },
  {
    "path": "data/datasets/structure3d.py",
    "chars": 3447,
    "preview": "import collections\n\nfrom ..build import DATASET_REGISTRY\nfrom .base import ScanBase\n\n\n@DATASET_REGISTRY.register()\nclass"
  },
  {
    "path": "evaluator/__init__.py",
    "chars": 169,
    "preview": "from .pretrain_eval import *\nfrom .referit3d_eval import *\nfrom .scanrefer_eval import *\nfrom .scanqa_eval import *\nfrom"
  },
  {
    "path": "evaluator/build.py",
    "chars": 2866,
    "preview": "import json\nimport numpy as np\nfrom omegaconf import open_dict\nfrom fvcore.common.registry import Registry\n\nfrom common."
  },
  {
    "path": "evaluator/objcls_eval.py",
    "chars": 941,
    "preview": "import torch\nfrom pathlib import Path\n\nfrom evaluator.build import EVALUATOR_REGISTRY, BaseEvaluator\n\n\n@EVALUATOR_REGIST"
  },
  {
    "path": "evaluator/pretrain_eval.py",
    "chars": 5507,
    "preview": "import torch\nimport numpy as np\n\nfrom evaluator.build import EVALUATOR_REGISTRY, BaseEvaluator\n\n\n@EVALUATOR_REGISTRY.reg"
  },
  {
    "path": "evaluator/referit3d_eval.py",
    "chars": 3622,
    "preview": "from pathlib import Path\nimport torch\n\nfrom evaluator.build import EVALUATOR_REGISTRY, BaseEvaluator\n\n\n@EVALUATOR_REGIST"
  },
  {
    "path": "evaluator/scanqa_eval.py",
    "chars": 4073,
    "preview": "import os\nimport json\nimport collections\nfrom pathlib import Path\nimport torch\n\nfrom evaluator.build import EVALUATOR_RE"
  },
  {
    "path": "evaluator/scanrefer_eval.py",
    "chars": 4064,
    "preview": "from pathlib import Path\nimport torch\n\nfrom evaluator.build import EVALUATOR_REGISTRY, BaseEvaluator\n\n\n@EVALUATOR_REGIST"
  },
  {
    "path": "evaluator/sqa3d_eval.py",
    "chars": 5854,
    "preview": "import os\nimport json\nimport collections\nfrom pathlib import Path\n\nimport numpy as np\nimport torch\n\nfrom data.data_utils"
  },
  {
    "path": "launch.py",
    "chars": 2985,
    "preview": "import argparse\n\nimport common.launch_utils as lu\n\n\ndef parse_args():\n    def str2bool(v):\n        if v.lower() in ('yes"
  },
  {
    "path": "model/__init__.py",
    "chars": 47,
    "preview": "from .objcls import *\nfrom .openvocab import *\n"
  },
  {
    "path": "model/build.py",
    "chars": 408,
    "preview": "import torch.nn as nn\nfrom fvcore.common.registry import Registry\n\n\nMODEL_REGISTRY = Registry(\"model\")\n\n\nclass BaseModel"
  },
  {
    "path": "model/objcls.py",
    "chars": 4539,
    "preview": "import torch\nimport torch.nn as nn\nimport json\nfrom pathlib import Path\n\nimport clip\nfrom transformers import BertConfig"
  },
  {
    "path": "model/openvocab.py",
    "chars": 14787,
    "preview": "import numpy as np\nimport torch\nimport torch.nn as nn\nfrom einops import einsum\n\nfrom model.build import MODEL_REGISTRY,"
  },
  {
    "path": "modules/__init__.py",
    "chars": 91,
    "preview": "from .language import *\nfrom .vision import *\nfrom .grounding import *\nfrom .heads import *"
  },
  {
    "path": "modules/build.py",
    "chars": 1267,
    "preview": "from fvcore.common.registry import Registry\n\nfrom common.type_utils import cfg2dict\n\n\nVISION_REGISTRY = Registry(\"vision"
  },
  {
    "path": "modules/grounding/__init__.py",
    "chars": 31,
    "preview": "from .unified_encoder import *\n"
  },
  {
    "path": "modules/grounding/unified_encoder.py",
    "chars": 7573,
    "preview": "import torch\nimport torch.nn as nn\n\nfrom modules.build import GROUNDING_REGISTRY\nfrom modules.layers.transformers import"
  },
  {
    "path": "modules/heads/__init__.py",
    "chars": 81,
    "preview": "from .grounding_head import *\nfrom .pretrain_head import *\nfrom .qa_head import *"
  },
  {
    "path": "modules/heads/grounding_head.py",
    "chars": 2112,
    "preview": "import torch.nn as nn\n\nfrom modules.build import HEADS_REGISTRY\nfrom modules.utils import get_mlp_head\n\n\n@HEADS_REGISTRY"
  },
  {
    "path": "modules/heads/pretrain_head.py",
    "chars": 2101,
    "preview": "import torch\nimport torch.nn as nn\n\nfrom modules.build import HEADS_REGISTRY\nfrom modules.utils import get_activation_fn"
  },
  {
    "path": "modules/heads/qa_head.py",
    "chars": 3113,
    "preview": "import torch\nimport torch.nn.functional as F\nfrom torch import nn\n\nfrom modules.build import HEADS_REGISTRY\n\n\nclass FC(n"
  },
  {
    "path": "modules/language/__init__.py",
    "chars": 39,
    "preview": "from .bert import *\nfrom .clip import *"
  },
  {
    "path": "modules/language/bert.py",
    "chars": 941,
    "preview": "import torch.nn as nn\nfrom transformers import BertConfig, BertModel, BertTokenizer\n\nfrom modules.build import LANGUAGE_"
  },
  {
    "path": "modules/language/clip.py",
    "chars": 1317,
    "preview": "from contextlib import nullcontext\nimport torch\nimport torch.nn as nn\nfrom transformers import CLIPTextModelWithProjecti"
  },
  {
    "path": "modules/layers/pointnet.py",
    "chars": 1953,
    "preview": "import torch.nn as nn\n\nfrom modules.third_party.pointnet2.pointnet2_modules import PointnetSAModule\n\n\ndef break_up_pc(pc"
  },
  {
    "path": "modules/layers/transformers.py",
    "chars": 12930,
    "preview": "from typing import Optional\n\nimport einops\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nfrom torch im"
  },
  {
    "path": "modules/third_party/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/include/ball_query.h",
    "chars": 163,
    "preview": "#pragma once\n#include <torch/extension.h>\n\nat::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius,"
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/include/cuda_utils.h",
    "chars": 1303,
    "preview": "#ifndef _CUDA_UTILS_H\n#define _CUDA_UTILS_H\n\n#include <ATen/ATen.h>\n#include <ATen/cuda/CUDAContext.h>\n#include <cmath>\n"
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/include/group_points.h",
    "chars": 183,
    "preview": "#pragma once\n#include <torch/extension.h>\n\nat::Tensor group_points(at::Tensor points, at::Tensor idx);\nat::Tensor group_"
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/include/interpolate.h",
    "chars": 386,
    "preview": "#pragma once\n\n#include <torch/extension.h>\n#include <vector>\n\nstd::vector<at::Tensor> three_nn(at::Tensor unknowns, at::"
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/include/sampling.h",
    "chars": 260,
    "preview": "#pragma once\n#include <torch/extension.h>\n\nat::Tensor gather_points(at::Tensor points, at::Tensor idx);\nat::Tensor gathe"
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/include/utils.h",
    "chars": 983,
    "preview": "#pragma once\n#include <ATen/cuda/CUDAContext.h>\n#include <torch/extension.h>\n\n#define CHECK_CUDA(x)                     "
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/src/ball_query.cpp",
    "chars": 1037,
    "preview": "#include \"ball_query.h\"\n#include \"utils.h\"\n\nvoid query_ball_point_kernel_wrapper(int b, int n, int m, float radius,\n    "
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/src/ball_query_gpu.cu",
    "chars": 1784,
    "preview": "#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"cuda_utils.h\"\n\n// input: new_xyz(b, m, 3) xyz(b, n, "
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/src/bindings.cpp",
    "chars": 570,
    "preview": "#include \"ball_query.h\"\n#include \"group_points.h\"\n#include \"interpolate.h\"\n#include \"sampling.h\"\n\nPYBIND11_MODULE(TORCH_"
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/src/group_points.cpp",
    "chars": 1952,
    "preview": "#include \"group_points.h\"\n#include \"utils.h\"\n\nvoid group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsa"
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/src/group_points_gpu.cu",
    "chars": 2885,
    "preview": "#include <stdio.h>\n#include <stdlib.h>\n\n#include \"cuda_utils.h\"\n\n// input: points(b, c, n) idx(b, npoints, nsample)\n// o"
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/src/interpolate.cpp",
    "chars": 3304,
    "preview": "#include \"interpolate.h\"\n#include \"utils.h\"\n\nvoid three_nn_kernel_wrapper(int b, int n, int m, const float *unknown,\n   "
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/src/interpolate_gpu.cu",
    "chars": 5141,
    "preview": "#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"cuda_utils.h\"\n\n// input: unknown(b, n, 3) known(b, m"
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/src/sampling.cpp",
    "chars": 2894,
    "preview": "#include \"sampling.h\"\n#include \"utils.h\"\n\nvoid gather_points_kernel_wrapper(int b, int c, int n, int npoints,\n          "
  },
  {
    "path": "modules/third_party/pointnet2/_ext_src/src/sampling_gpu.cu",
    "chars": 7019,
    "preview": "#include <stdio.h>\n#include <stdlib.h>\n\n#include \"cuda_utils.h\"\n\n// input: points(b, c, n) idx(b, m)\n// output: out(b, c"
  },
  {
    "path": "modules/third_party/pointnet2/_version.py",
    "chars": 22,
    "preview": "__version__ = \"3.0.0\"\n"
  },
  {
    "path": "modules/third_party/pointnet2/pointnet2_modules.py",
    "chars": 17717,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# \n# This source code is licensed under the MIT license found in the\n"
  },
  {
    "path": "modules/third_party/pointnet2/pointnet2_test.py",
    "chars": 1011,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# \n# This source code is licensed under the MIT license found in the\n"
  },
  {
    "path": "modules/third_party/pointnet2/pointnet2_utils.py",
    "chars": 12079,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# \n# This source code is licensed under the MIT license found in the\n"
  },
  {
    "path": "modules/third_party/pointnet2/pytorch_utils.py",
    "chars": 7501,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\n# \n# This source code is licensed under the MIT license found in the\n"
  },
  {
    "path": "modules/third_party/pointnet2/requirements_new.txt",
    "chars": 2870,
    "preview": "accelerate==0.28.0\naddict==2.4.0\nantlr4-python3-runtime==4.9.3\nappdirs==1.4.4\nasttokens==2.4.1\nattrs==23.2.0\nblinker==1."
  },
  {
    "path": "modules/third_party/pointnet2/setup.py",
    "chars": 1089,
    "preview": "import glob\nimport os\nimport os.path as osp\n\nfrom setuptools import find_packages, setup\nfrom torch.utils.cpp_extension "
  },
  {
    "path": "modules/utils.py",
    "chars": 6832,
    "preview": "import copy\n\nimport einops\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\n########################"
  },
  {
    "path": "modules/vision/__init__.py",
    "chars": 68,
    "preview": "from .pcd_openvocab_encoder import *\nfrom .obj_cls_encoder import *\n"
  },
  {
    "path": "modules/vision/obj_cls_encoder.py",
    "chars": 523,
    "preview": "import torch.nn as nn\nfrom modules.build import VISION_REGISTRY\nfrom modules.utils import get_mlp_head\n\n@VISION_REGISTRY"
  },
  {
    "path": "modules/vision/pcd_openvocab_encoder.py",
    "chars": 8668,
    "preview": "import os\nimport glob\n\nimport einops\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom modules.bu"
  },
  {
    "path": "modules/weights.py",
    "chars": 813,
    "preview": "import torch.nn as nn\n\ndef _init_weights_bert(module, std=0.02):\n    \"\"\"\n        Huggingface transformer weight initiali"
  },
  {
    "path": "optim/__init__.py",
    "chars": 19,
    "preview": "from .loss import *"
  },
  {
    "path": "optim/build.py",
    "chars": 329,
    "preview": "from optim.loss.loss import Loss\nfrom optim.optimizer.optim import get_optimizer\nfrom optim.scheduler import get_schedul"
  },
  {
    "path": "optim/loss/__init__.py",
    "chars": 27,
    "preview": "from .contra_loss import *\n"
  },
  {
    "path": "optim/loss/contra_loss.py",
    "chars": 4594,
    "preview": "import numpy as np\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom optim.loss.loss import LOSS_R"
  },
  {
    "path": "optim/loss/loss.py",
    "chars": 5885,
    "preview": "import torch.nn as nn\nimport torch.nn.functional as F\nfrom fvcore.common.registry import Registry\n\n\nLOSS_REGISTRY = Regi"
  },
  {
    "path": "optim/optimizer/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "optim/optimizer/optim.py",
    "chars": 468,
    "preview": "import torch.optim as optim\n\nfrom fvcore.common.registry import Registry\nOPTIM_REGISTRY = Registry(\"loss\")\n\nfrom common."
  },
  {
    "path": "optim/scheduler.py",
    "chars": 950,
    "preview": "import math\nfrom torch.optim.lr_scheduler import LambdaLR\n\n\ndef warmup_cosine(step, warmup_step, total_step, minimum_rat"
  },
  {
    "path": "optim/utils.py",
    "chars": 606,
    "preview": "def no_decay_param_group(parameters, lr):\n    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']\n    decay_params"
  },
  {
    "path": "preprocess/README.md",
    "chars": 3297,
    "preview": "## Data Processing\n\nWe have released the preprocessing scripts for 3RScan, MultiScan, ARKitScenes and Structured3D. They"
  },
  {
    "path": "preprocess/__init__.py",
    "chars": 115,
    "preview": "from .build import *\nfrom .utils import *\nfrom .rscan import *\nfrom .multiscan import *\nfrom .arkitscenes import *\n"
  },
  {
    "path": "preprocess/arkitscenes.py",
    "chars": 4970,
    "preview": "import json\nfrom glob import glob\nfrom omegaconf import OmegaConf\nfrom joblib import Parallel, delayed, parallel_backend"
  },
  {
    "path": "preprocess/build.py",
    "chars": 1791,
    "preview": "from pathlib import Path\nfrom fvcore.common.registry import Registry\n\nPROCESSOR_REGISTRY = Registry(\"Processor\")\n\n\nclass"
  },
  {
    "path": "preprocess/multiscan.py",
    "chars": 5536,
    "preview": "import re\nimport json\nfrom glob import glob\nfrom omegaconf import OmegaConf\nfrom joblib import Parallel, delayed, parall"
  },
  {
    "path": "preprocess/rscan.py",
    "chars": 6007,
    "preview": "import json\nfrom glob import glob\nfrom omegaconf import OmegaConf\nfrom joblib import Parallel, delayed, parallel_backend"
  },
  {
    "path": "preprocess/sceneverse2hmsemantic.py",
    "chars": 2237,
    "preview": "import os\nimport json\nfrom joblib import Parallel, delayed, parallel_backend\nfrom glob import glob\nfrom tqdm import tqdm"
  },
  {
    "path": "preprocess/ssg/README.md",
    "chars": 685,
    "preview": "## Scene Graph Generation\n\nWe have released the scripts to generate 3D scene graphs for the datasets released in SceneVe"
  },
  {
    "path": "preprocess/ssg/relationships/camera.py",
    "chars": 2210,
    "preview": "import numpy as np\nimport ssg_utils as utils\n\n\ndef getLinearEquation(p1x, p1y, p2x, p2y):\n    sign = 1\n    a = p2y - p1y"
  },
  {
    "path": "preprocess/ssg/relationships/hanging.py",
    "chars": 1752,
    "preview": "import ssg_utils as utils\n\n\ndef cal_above_below_relationships(ObjNode_dict, src, scene_high):\n\n    above_below_relations"
  },
  {
    "path": "preprocess/ssg/relationships/init.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "preprocess/ssg/relationships/multi_objs.py",
    "chars": 4042,
    "preview": "import numpy as np\nimport networkx as nx\nimport itertools\n\nimport ssg_utils as utils\n\n\ndef are_furniture_aligned(furnitu"
  },
  {
    "path": "preprocess/ssg/relationships/proximity.py",
    "chars": 3974,
    "preview": "import numpy as np\nimport itertools\nimport ssg_utils as utils\n\ndef get_direction(src_obj, tgt_obj):\n\n    sx, sy = src_ob"
  },
  {
    "path": "preprocess/ssg/relationships/support.py",
    "chars": 3137,
    "preview": "from ssg_data.dictionary import always_supported, hanging\nimport ssg_utils as utils\n\ndef is_supported(target_obj, obj, c"
  },
  {
    "path": "preprocess/ssg/ssg_data/dictionary.py",
    "chars": 1499,
    "preview": "hanging = ['window', 'curtain', 'curtains', 'shower curtain', 'curtain rod', 'shower curtain rod']\n\nalways_supported = ["
  },
  {
    "path": "preprocess/ssg/ssg_data/script/ObjNode.py",
    "chars": 4647,
    "preview": "import networkx as nx\nimport trimesh\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pyvista as pv\n\n\nclass Obj"
  },
  {
    "path": "preprocess/ssg/ssg_data/ssg_visualize.py",
    "chars": 2674,
    "preview": "import open3d as o3d\nimport numpy as np\nimport torch\n\n\ndef vis_dataset(ObjNode_dict, relation, scene_path, scan_id, scen"
  },
  {
    "path": "preprocess/ssg/ssg_main.py",
    "chars": 13610,
    "preview": "import json\nimport pickle\nfrom tqdm import tqdm\nfrom pathlib import Path\nfrom omegaconf import OmegaConf\n\nimport torch\ni"
  },
  {
    "path": "preprocess/ssg/ssg_utils.py",
    "chars": 11102,
    "preview": "import trimesh\nimport math\nfrom shapely import geometry\nimport os\nimport numpy as np\nimport pyvista as pv\nfrom ssg_data."
  },
  {
    "path": "preprocess/structured3d.py",
    "chars": 4466,
    "preview": "import pickle\nfrom glob import glob\nfrom omegaconf import OmegaConf\nfrom joblib import Parallel, delayed, parallel_backe"
  },
  {
    "path": "preprocess/utils/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "preprocess/utils/align_utils.py",
    "chars": 3247,
    "preview": "import numpy as np\nimport math\n\ndef compute_box_3d(size, center, rotmat):\n    \"\"\"Compute corners of a single box from ro"
  },
  {
    "path": "preprocess/utils/constant.py",
    "chars": 17572,
    "preview": "from enum import Enum\nimport numpy as np\nimport matplotlib.pyplot as plt\n\n### ScanNet200 Benchmark constants ###\nVALID_C"
  },
  {
    "path": "preprocess/utils/label_convert.py",
    "chars": 62598,
    "preview": "ARKITSCENE_SCANNET= {\n'bed': 'bed',\n'cabinet': 'cabinet',\n'refrigerator': 'refrigerator',\n'table': 'table',\n'chair': 'ch"
  },
  {
    "path": "requirements.txt",
    "chars": 2510,
    "preview": "git+https://github.com/openai/CLIP.git\ngit+https://github.com/facebookresearch/fvcore\naccelerate==0.28.0\naddict==2.4.0\na"
  },
  {
    "path": "run.py",
    "chars": 2004,
    "preview": "from pathlib import Path\nimport hydra\nfrom datetime import datetime\nfrom omegaconf import OmegaConf, open_dict\nimport wa"
  },
  {
    "path": "trainer/__init__.py",
    "chars": 127,
    "preview": "from .default_trainer import *\nfrom .openvocab_trainer import *\nfrom .objpretrain_trainer import *\nfrom .debug_trainer i"
  },
  {
    "path": "trainer/build.py",
    "chars": 8068,
    "preview": "import copy as cp\nimport glob\nfrom datetime import timedelta\nfrom pathlib import Path\nfrom omegaconf import OmegaConf\nfr"
  },
  {
    "path": "trainer/debug_trainer.py",
    "chars": 2384,
    "preview": "import copy\nfrom tqdm import tqdm\n\nimport torch\nfrom trainer.build import TRAINER_REGISTRY\nfrom trainer.build import Bas"
  },
  {
    "path": "trainer/default_trainer.py",
    "chars": 3972,
    "preview": "import copy\nfrom tqdm import tqdm\n\nimport torch\nfrom trainer.build import TRAINER_REGISTRY\nfrom trainer.build import Bas"
  },
  {
    "path": "trainer/objpretrain_trainer.py",
    "chars": 4319,
    "preview": "import copy\nfrom tqdm import tqdm\n\nimport torch\nfrom trainer.build import TRAINER_REGISTRY\nfrom trainer.build import Bas"
  },
  {
    "path": "trainer/openvocab_trainer.py",
    "chars": 4694,
    "preview": "import copy\nfrom tqdm import tqdm\n\nimport torch\nfrom trainer.build import TRAINER_REGISTRY\nfrom trainer.build import Bas"
  },
  {
    "path": "visualize_data.py",
    "chars": 4836,
    "preview": "import argparse\nimport random\nimport json\nfrom pathlib import Path\n\nimport numpy as np\nimport torch\nimport open3d as o3d"
  }
]

About this extraction

This page contains the full source code of the scene-verse/SceneVerse GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 161 files (848.7 KB), approximately 244.7k tokens, and a symbol index with 606 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo