Showing preview only (280K chars total). Download the full file or copy to clipboard to get everything.
Repository: jamycheung/DELIVER
Branch: main
Commit: dd6a5aacff3f
Files: 58
Total size: 262.6 KB
Directory structure:
gitextract_aaso4r_f/
├── .gitignore
├── LICENSE
├── README.md
├── configs/
│ ├── deliver_rgbdel.yaml
│ ├── kitti360_rgbdel.yaml
│ ├── mcubes_rgbadn.yaml
│ ├── mfnet_rgbt.yaml
│ ├── nyu_rgbd.yaml
│ └── urbanlf.yaml
├── environment.yaml
├── requirements.txt
├── semseg/
│ ├── __init__.py
│ ├── augmentations.py
│ ├── augmentations_mm.py
│ ├── datasets/
│ │ ├── __init__.py
│ │ ├── deliver.py
│ │ ├── kitti360.py
│ │ ├── mcubes.py
│ │ ├── mfnet.py
│ │ ├── nyu.py
│ │ ├── unzip.py
│ │ └── urbanlf.py
│ ├── losses.py
│ ├── metrics.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── backbones/
│ │ │ ├── __init__.py
│ │ │ ├── cmnext.py
│ │ │ └── cmx.py
│ │ ├── base.py
│ │ ├── cmnext.py
│ │ ├── cmx.py
│ │ ├── heads/
│ │ │ ├── __init__.py
│ │ │ ├── condnet.py
│ │ │ ├── fapn.py
│ │ │ ├── fcn.py
│ │ │ ├── fpn.py
│ │ │ ├── hem.py
│ │ │ ├── lawin.py
│ │ │ ├── segformer.py
│ │ │ ├── sfnet.py
│ │ │ └── upernet.py
│ │ ├── layers/
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ └── initialize.py
│ │ └── modules/
│ │ ├── __init__.py
│ │ ├── crossatt.py
│ │ ├── ffm.py
│ │ ├── mspa.py
│ │ ├── ppm.py
│ │ └── psa.py
│ ├── optimizers.py
│ ├── schedulers.py
│ └── utils/
│ ├── __init__.py
│ ├── utils.py
│ └── visualize.py
└── tools/
├── infer_mm.py
├── train_mm.py
└── val_mm.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Repo-specific GitIgnore ----------------------------------------------------------------------------------------------
*.jpg
*.jpeg
*.png
*.bmp
*.tif
*.tiff
*.heic
*.JPG
*.JPEG
*.PNG
*.BMP
*.TIF
*.TIFF
*.HEIC
*.mp4
*.mov
*.MOV
*.avi
*.data
*.json
*.pth
*.cfg
!cfg/yolov3*.cfg
storage.googleapis.com
runs/*
data/*
!data/images/zidane.jpg
!data/images/bus.jpg
!data/coco.names
!data/coco_paper.names
!data/coco.data
!data/coco_*.data
!data/coco_*.txt
!data/trainvalno5k.shapes
!data/*.sh
test.py
test_imgs/
pycocotools/*
results*.txt
gcp_test*.sh
checkpoints/
# output/
# output*/
*events*
assests/*/
# Datasets -------------------------------------------------------------------------------------------------------------
coco/
coco128/
VOC/
# MATLAB GitIgnore -----------------------------------------------------------------------------------------------------
*.m~
*.mat
!targets*.mat
# Neural Network weights -----------------------------------------------------------------------------------------------
*.weights
*.pt
*.onnx
*.mlmodel
*.torchscript
darknet53.conv.74
yolov3-tiny.conv.15
# GitHub Python GitIgnore ----------------------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
wandb/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
# *.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# dotenv
.env
# virtualenv
.venv*
venv*/
ENV*/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# https://github.com/github/gitignore/blob/master/Global/macOS.gitignore -----------------------------------------------
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
Icon?
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
# https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff:
.idea/*
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/dictionaries
.html # Bokeh Plots
.pg # TensorFlow Frozen Graphs
.avi # videos
# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
# Gradle:
.idea/**/gradle.xml
.idea/**/libraries
# CMake
cmake-build-debug/
cmake-build-release/
# Mongo Explorer plugin:
.idea/**/mongoSettings.xml
## File-based project format:
*.iws
## Plugin-specific files:
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
output/
data/
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [2023] [Jiaming Zhang]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
<div align="center">
## Delivering Arbitrary-Modal Semantic Segmentation (CVPR 2023)
</div>
<p align="center">
<a href="https://arxiv.org/pdf/2303.01480.pdf">
<img src="https://img.shields.io/badge/arXiv-2303.01480-red" /></a>
<a href="https://jamycheung.github.io/DELIVER.html">
<img src="https://img.shields.io/badge/Project-page-green" /></a>
<a href="https://www.youtube.com/watch?v=X-VeSLsEToA">
<img src="https://img.shields.io/badge/Video-YouTube-%23FF0000.svg" /></a>
<a href="https://pytorch.org/">
<img src="https://img.shields.io/badge/Framework-PyTorch-orange.svg" /></a>
<a href="https://github.com/jamycheung/DELIVER/blob/main/LICENSE">
<img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg" /></a>
</p>
[](https://paperswithcode.com/sota/semantic-segmentation-on-deliver?p=delivering-arbitrary-modal-semantic)
[](https://paperswithcode.com/sota/semantic-segmentation-on-kitti-360?p=delivering-arbitrary-modal-semantic)
[](https://paperswithcode.com/sota/semantic-segmentation-on-nyu-depth-v2?p=delivering-arbitrary-modal-semantic)
[](https://paperswithcode.com/sota/thermal-image-segmentation-on-mfn-dataset?p=delivering-arbitrary-modal-semantic)
[](https://paperswithcode.com/sota/semantic-segmentation-on-mcubes?p=delivering-arbitrary-modal-semantic)
[](https://paperswithcode.com/sota/semantic-segmentation-on-mcubes-p?p=delivering-arbitrary-modal-semantic)
[](https://paperswithcode.com/sota/semantic-segmentation-on-urbanlf?p=delivering-arbitrary-modal-semantic)
[](https://paperswithcode.com/sota/thermal-image-segmentation-on-noisy-rs-rgb-t?p=delivering-arbitrary-modal-semantic)
[](https://paperswithcode.com/sota/semantic-segmentation-on-ddd17?p=delivering-arbitrary-modal-semantic)
[](https://paperswithcode.com/sota/semantic-segmentation-on-dsec?p=delivering-arbitrary-modal-semantic)
[](https://paperswithcode.com/sota/semantic-segmentation-on-bjroad?p=delivering-arbitrary-modal-semantic)
[](https://paperswithcode.com/sota/semantic-segmentation-on-tlcgis?p=delivering-arbitrary-modal-semantic)
[](https://paperswithcode.com/sota/semantic-segmentation-on-porto?p=delivering-arbitrary-modal-semantic)
## Introduction
To conduct arbitrary-modal semantic segmentation, we create the **DeLiVER** benchmark, covering **De**pth, **Li**DAR, multiple **V**iews, **E**vents, and **R**GB. It has four *severe weather conditions* as well as five *sensor failure cases* to exploit modal complementarity and resolve partial outages. Besides, we present the arbitrary cross-modal segmentation model **CMNeXt**, which scales from 1 to 81 modalities on the DeLiVER, KITTI-360, MFNet, NYU Depth V2, UrbanLF, and MCubeS datasets.
For more details, please check our [arXiv](https://arxiv.org/pdf/2303.01480.pdf) paper.
## Updates
- [x] 03/2023, init repository.
- [x] 04/2023, release front-view DeLiVER. Download from [**GoogleDrive**](https://drive.google.com/file/d/1P-glCmr-iFSYrzCfNawgVI9qKWfP94pm/view?usp=share_link).
- [x] 04/2023, release CMNeXt model weights. Download from [**GoogleDrive**](https://drive.google.com/drive/folders/1MZaaZ5_rEVSjns3TBM0UDt6IW4X-HPIN?usp=share_link).
## DeLiVER dataset


The DeLiVER multimodal dataset includes (a) four adverse conditions out of five conditions (**cloudy, foggy, night-time, rainy and sunny**). Apart from normal cases, each condition has five corner cases (**MB: Motion Blur; OE: Over-Exposure; UE: Under-Exposure; LJ: LiDAR-Jitter; and EL: Event Low-resolution**). Each sample has six views. Each view has four modalities and two labels (semantic and instance). (b) shows the data statistics. (c) shows the data distribution of the 25 semantic classes.
### DELIVER splitting

### Data folder structure
Download the DELIVER dataset from [**GoogleDrive**](https://drive.google.com/file/d/1P-glCmr-iFSYrzCfNawgVI9qKWfP94pm/view?usp=share_link) (~12.2 GB).
The `data/DELIVER` folder is structured as:
```text
DELIVER
├── depth
│ ├── cloud
│ │ ├── test
│ │ │ ├── MAP_10_point102
│ │ │ │ ├── 045050_depth_front.png
│ │ │ │ ├── ...
│ │ ├── train
│ │ └── val
│ ├── fog
│ ├── night
│ ├── rain
│ └── sun
├── event
├── hha
├── img
├── lidar
└── semantic
```
## CMNeXt model

The CMNeXt architecture follows the Hub2Fuse paradigm with asymmetric branches, having, e.g., Multi-Head Self-Attention (MHSA) blocks in the RGB branch and our Parallel Pooling Mixer (PPX) blocks in the accompanying branch. At the hub step, the Self-Query Hub selects informative features from the supplementary modalities. At the fusion step, the feature rectification module (FRM) and feature fusion module (FFM) are used for feature fusion. Between stages, features of each modality are restored via adding the fused feature. The four-stage fused features are forwarded to the segmentation head for the final prediction.
## Environment
```bash
conda env create -f environment.yaml
conda activate cmnext
# Optional: install apex by following https://github.com/NVIDIA/apex
```
## Data preparation
Prepare six datasets:
- [DELIVER](https://github.com/jamycheung/DELIVER), for RGB-Depth-Event-LiDAR semantic segmentation.
- [KITTI-360](https://www.cvlibs.net/datasets/kitti-360/), for RGB-Depth-Event-LiDAR semantic segmentation.
- [NYU Depth V2](https://cs.nyu.edu/~silberman/datasets/nyu_depth_v2.html), for RGB-Depth semantic segmentation.
- [MFNet](https://github.com/haqishen/MFNet-pytorch), for RGB-Thermal semantic segmentation.
- [UrbanLF](https://github.com/HAWKEYE-Group/UrbanLF), for light-field segmentation based on sub-aperture images.
- [MCubeS](https://github.com/kyotovision-public/multimodal-material-segmentation), for multimodal material segmentation with RGB-A-D-N modalities.
Then, all datasets are structured as:
```
data/
├── DELIVER
│ ├── img
│ ├── hha
│ ├── event
│ ├── lidar
│ └── semantic
├── KITTI-360
│ ├── data_2d_raw
│ ├── data_2d_hha
│ ├── data_2d_event
│ ├── data_2d_lidar
│ └── data_2d_semantics
├── NYUDepthv2
│ ├── RGB
│ ├── HHA
│ └── Label
├── MFNet
│ ├── rgb
│ ├── ther
│ └── labels
├── UrbanLF
│ ├── Syn
│ └── real
├── MCubeS
│ ├── polL_color
│ ├── polL_aolp
│ ├── polL_dolp
│ ├── NIR_warped
│ └── SS
```
*For RGB-Depth, the [HHA format](https://github.com/charlesCXK/Depth2HHA-python) is generated from depth image.*
## Model Zoo
### DELIVER dataset
| Model-Modal | #Params(M) | GFLOPs | mIoU | weight |
| :--------------- | :---------- | :----- | :----- | :------ |
| CMNeXt-RGB | 25.79 | 38.93 | 57.20 | [GoogleDrive](https://drive.google.com/drive/folders/1OWteEOrjfrC3VNg3sxJFZPHz9urkZ3lm?usp=share_link) |
| CMNeXt-RGB-E | 58.69 | 62.94 | 57.48 | [GoogleDrive](https://drive.google.com/drive/folders/1OWteEOrjfrC3VNg3sxJFZPHz9urkZ3lm?usp=share_link) |
| CMNeXt-RGB-L | 58.69 | 62.94 | 58.04 | [GoogleDrive](https://drive.google.com/drive/folders/1OWteEOrjfrC3VNg3sxJFZPHz9urkZ3lm?usp=share_link) |
| CMNeXt-RGB-D | 58.69 | 62.94 | 63.58 | [GoogleDrive](https://drive.google.com/drive/folders/1OWteEOrjfrC3VNg3sxJFZPHz9urkZ3lm?usp=share_link) |
| CMNeXt-RGB-D-E | 58.72 | 64.19 | 64.44 | [GoogleDrive](https://drive.google.com/drive/folders/1OWteEOrjfrC3VNg3sxJFZPHz9urkZ3lm?usp=share_link) |
| CMNeXt-RGB-D-L | 58.72 | 64.19 | 65.50 | [GoogleDrive](https://drive.google.com/drive/folders/1OWteEOrjfrC3VNg3sxJFZPHz9urkZ3lm?usp=share_link) |
| CMNeXt-RGB-D-E-L | 58.73 | 65.42 | 66.30 | [GoogleDrive](https://drive.google.com/drive/folders/1OWteEOrjfrC3VNg3sxJFZPHz9urkZ3lm?usp=share_link) |
### KITTI360 dataset
| Model-Modal | mIoU | weight |
| :--------------- | :----- | :------ |
| CMNeXt-RGB | 67.04 | [GoogleDrive](https://drive.google.com/drive/folders/1OXfBNNcwMCQWGmvOqvkAHd8xViKJ6djf?usp=share_link) |
| CMNeXt-RGB-E | 66.13 | [GoogleDrive](https://drive.google.com/drive/folders/1OXfBNNcwMCQWGmvOqvkAHd8xViKJ6djf?usp=share_link) |
| CMNeXt-RGB-L | 65.26 | [GoogleDrive](https://drive.google.com/drive/folders/1OXfBNNcwMCQWGmvOqvkAHd8xViKJ6djf?usp=share_link) |
| CMNeXt-RGB-D | 65.09 | [GoogleDrive](https://drive.google.com/drive/folders/1OXfBNNcwMCQWGmvOqvkAHd8xViKJ6djf?usp=share_link) |
| CMNeXt-RGB-D-E | 67.73 | [GoogleDrive](https://drive.google.com/drive/folders/1OXfBNNcwMCQWGmvOqvkAHd8xViKJ6djf?usp=share_link) |
| CMNeXt-RGB-D-L | 66.55 | [GoogleDrive](https://drive.google.com/drive/folders/1OXfBNNcwMCQWGmvOqvkAHd8xViKJ6djf?usp=share_link) |
| CMNeXt-RGB-D-E-L | 67.84 | [GoogleDrive](https://drive.google.com/drive/folders/1OXfBNNcwMCQWGmvOqvkAHd8xViKJ6djf?usp=share_link) |
### NYU Depth V2
| Model-Modal | mIoU | weight |
| :--------------- | :----- | :------ |
| CMNeXt-RGB-D (MiT-B4) | 56.9 | [GoogleDrive](https://drive.google.com/drive/folders/1OXrMv1Mi6E-vyedlkNpfc5ibJeJDfwqq?usp=share_link) |
### MFNet
| Model-Modal | mIoU | weight |
| :--------------- | :----- | :------ |
| CMNeXt-RGB-T (MiT-B4) | 59.9 | [GoogleDrive](https://drive.google.com/drive/folders/1OaOHMbD5P_HPwTzzXwdYRUFIiGkyFQfP?usp=share_link) |
### UrbanLF
There are **real** and **synthetic** datasets.
| Model-Modal | Real | weight | Syn | weight |
| :-------------- | :----- | :----- | :----- | :----- |
| CMNeXt-RGB | 82.20 | [GoogleDrive](https://drive.google.com/drive/folders/1OfepYOYaM8I0itjuHK4csqmAuu_zHol-?usp=share_link) | 78.53 | [GoogleDrive](https://drive.google.com/drive/folders/1OfepYOYaM8I0itjuHK4csqmAuu_zHol-?usp=share_link) |
| CMNeXt-RGB-LF8 | 83.22 | [GoogleDrive](https://drive.google.com/drive/folders/1OfepYOYaM8I0itjuHK4csqmAuu_zHol-?usp=share_link) | 80.74 | [GoogleDrive](https://drive.google.com/drive/folders/1OfepYOYaM8I0itjuHK4csqmAuu_zHol-?usp=share_link) |
| CMNeXt-RGB-LF33 | 82.62 | [GoogleDrive](https://drive.google.com/drive/folders/1OfepYOYaM8I0itjuHK4csqmAuu_zHol-?usp=share_link) | 80.98 | [GoogleDrive](https://drive.google.com/drive/folders/1OfepYOYaM8I0itjuHK4csqmAuu_zHol-?usp=share_link) |
| CMNeXt-RGB-LF80 | 83.11 | [GoogleDrive](https://drive.google.com/drive/folders/1OfepYOYaM8I0itjuHK4csqmAuu_zHol-?usp=share_link) | 81.02 | [GoogleDrive](https://drive.google.com/drive/folders/1OfepYOYaM8I0itjuHK4csqmAuu_zHol-?usp=share_link) |
### MCubeS
| Model-Modal | mIoU | weight |
| :--------------- | :----- | :----- |
| CMNeXt-RGB | 48.16 | [GoogleDrive](https://drive.google.com/drive/folders/1OgbqpT6TSCPPsoJn0sh2wfXNy5mL5nk9?usp=share_link) |
| CMNeXt-RGB-A | 48.42 | [GoogleDrive](https://drive.google.com/drive/folders/1OgbqpT6TSCPPsoJn0sh2wfXNy5mL5nk9?usp=share_link) |
| CMNeXt-RGB-A-D | 49.48 | [GoogleDrive](https://drive.google.com/drive/folders/1OgbqpT6TSCPPsoJn0sh2wfXNy5mL5nk9?usp=share_link) |
| CMNeXt-RGB-A-D-N | 51.54 | [GoogleDrive](https://drive.google.com/drive/folders/1OgbqpT6TSCPPsoJn0sh2wfXNy5mL5nk9?usp=share_link) |
## Training
Before training, please download the [pre-trained SegFormer](https://drive.google.com/drive/folders/10XgSW8f7ghRs9fJ0dE-EV8G2E_guVsT5?usp=sharing) weights and place them under `checkpoints/pretrained/segformer/`, e.g. `checkpoints/pretrained/segformer/mit_b2.pth`.
```text
checkpoints/pretrained/segformer
├── mit_b2.pth
└── mit_b4.pth
```
To train the CMNeXt model, change the yaml file passed to `--cfg`. Several training examples using 4 A100 GPUs are:
```bash
cd path/to/DELIVER
conda activate cmnext
export PYTHONPATH="path/to/DELIVER"
python -m torch.distributed.launch --nproc_per_node=4 --use_env tools/train_mm.py --cfg configs/deliver_rgbdel.yaml
python -m torch.distributed.launch --nproc_per_node=4 --use_env tools/train_mm.py --cfg configs/kitti360_rgbdel.yaml
python -m torch.distributed.launch --nproc_per_node=4 --use_env tools/train_mm.py --cfg configs/nyu_rgbd.yaml
python -m torch.distributed.launch --nproc_per_node=4 --use_env tools/train_mm.py --cfg configs/mfnet_rgbt.yaml
python -m torch.distributed.launch --nproc_per_node=4 --use_env tools/train_mm.py --cfg configs/mcubes_rgbadn.yaml
python -m torch.distributed.launch --nproc_per_node=4 --use_env tools/train_mm.py --cfg configs/urbanlf.yaml
```
## Evaluation
To evaluate CMNeXt models, please download the respective model weights ([**GoogleDrive**](https://drive.google.com/drive/folders/1MZaaZ5_rEVSjns3TBM0UDt6IW4X-HPIN?usp=share_link)) and arrange them as:
```text
output/
├── DELIVER
│ ├── cmnext_b2_deliver_rgb.pth
│ ├── cmnext_b2_deliver_rgbd.pth
│ ├── cmnext_b2_deliver_rgbde.pth
│ ├── cmnext_b2_deliver_rgbdel.pth
│ ├── cmnext_b2_deliver_rgbdl.pth
│ ├── cmnext_b2_deliver_rgbe.pth
│ └── cmnext_b2_deliver_rgbl.pth
├── KITTI360
│ ├── cmnext_b2_kitti360_rgb.pth
│ ├── cmnext_b2_kitti360_rgbd.pth
│ ├── cmnext_b2_kitti360_rgbde.pth
│ ├── cmnext_b2_kitti360_rgbdel.pth
│ ├── cmnext_b2_kitti360_rgbdl.pth
│ ├── cmnext_b2_kitti360_rgbe.pth
│ └── cmnext_b2_kitti360_rgbl.pth
├── MCubeS
│ ├── cmnext_b2_mcubes_rgb.pth
│ ├── cmnext_b2_mcubes_rgba.pth
│ ├── cmnext_b2_mcubes_rgbad.pth
│ └── cmnext_b2_mcubes_rgbadn.pth
├── MFNet
│ └── cmnext_b4_mfnet_rgbt.pth
├── NYU_Depth_V2
│ └── cmnext_b4_nyu_rgbd.pth
├── UrbanLF
│ ├── cmnext_b4_urbanlf_real_rgblf1.pth
│ ├── cmnext_b4_urbanlf_real_rgblf33.pth
│ ├── cmnext_b4_urbanlf_real_rgblf8.pth
│ ├── cmnext_b4_urbanlf_real_rgblf80.pth
│ ├── cmnext_b4_urbanlf_syn_rgblf1.pth
│ ├── cmnext_b4_urbanlf_syn_rgblf33.pth
│ ├── cmnext_b4_urbanlf_syn_rgblf8.pth
│ └── cmnext_b4_urbanlf_syn_rgblf80.pth
```
Then, set `--cfg` to the respective config file, and run:
```bash
cd path/to/DELIVER
conda activate cmnext
export PYTHONPATH="path/to/DELIVER"
CUDA_VISIBLE_DEVICES=0 python tools/val_mm.py --cfg configs/deliver_rgbdel.yaml
```
On the DeLiVER dataset, there are **validation** and **test** sets. Please check [*val_mm.py*](tools/val_mm.py) to modify the dataset for validation and test sets.
To evaluate the different cases (adverse weather conditions, sensor failures), modify the `cases` list in [*val_mm.py*](tools/val_mm.py), as shown below:
```python
# cases = ['cloud', 'fog', 'night', 'rain', 'sun']
# cases = ['motionblur', 'overexposure', 'underexposure', 'lidarjitter', 'eventlowres']
cases = [None] # all
```
Note that the default value is `[None]` for all cases.
### DELIVER visualization
<img src="figs/DELIVER_vis.png" width="500px">
Visualization results on the DELIVER dataset. From left to right are the *cloudy*, *foggy*, *night* and *rainy* scenes, respectively.
## Acknowledgements
Thanks for the public repositories:
- [RGBX-semantic-segmentation](https://github.com/huaaaliu/RGBX_Semantic_Segmentation)
- [Semantic-segmentation](https://github.com/sithu31296/semantic-segmentation)
## License
This repository is under the Apache-2.0 license. For commercial use, please contact the authors.
## Citations
If you use the DeLiVER dataset and CMNeXt model, please cite the following works:
- **DeLiVER & CMNeXt** [[**PDF**](https://arxiv.org/pdf/2303.01480.pdf)]
```
@inproceedings{zhang2023delivering,
title={Delivering Arbitrary-Modal Semantic Segmentation},
author={Zhang, Jiaming and Liu, Ruiping and Shi, Hao and Yang, Kailun and Rei{\ss}, Simon and Peng, Kunyu and Fu, Haodong and Wang, Kaiwei and Stiefelhagen, Rainer},
booktitle={CVPR},
year={2023}
}
```
- **CMX** [[**PDF**](https://arxiv.org/pdf/2203.04838.pdf)]
```
@article{zhang2023cmx,
title={CMX: Cross-modal fusion for RGB-X semantic segmentation with transformers},
author={Zhang, Jiaming and Liu, Huayao and Yang, Kailun and Hu, Xinxin and Liu, Ruiping and Stiefelhagen, Rainer},
journal={IEEE Transactions on Intelligent Transportation Systems},
year={2023}
}
```
================================================
FILE: configs/deliver_rgbdel.yaml
================================================
DEVICE : cuda # device used for training and evaluation (cpu, cuda, cuda0, cuda1, ...)
SAVE_DIR : 'output' # output folder name used for saving the model, logs and inference results
MODEL:
NAME : CMNeXt # name of the model you are using
BACKBONE : CMNeXt-B2 # model variant
PRETRAINED : 'checkpoints/pretrained/segformer/mit_b2.pth' # backbone model's weight
RESUME : '' # checkpoint file
DATASET:
NAME : DELIVER # dataset name to be trained with (camvid, cityscapes, ade20k)
ROOT : 'data/DELIVER' # dataset root path
IGNORE_LABEL : 255
# MODALS : ['img']
# MODALS : ['img', 'depth']
# MODALS : ['img', 'event']
# MODALS : ['img', 'lidar']
# MODALS : ['img', 'depth', 'event']
# MODALS : ['img', 'depth', 'lidar']
MODALS : ['img', 'depth', 'event', 'lidar']
TRAIN:
IMAGE_SIZE : [1024, 1024] # training image size in (h, w)
BATCH_SIZE : 2 # batch size used to train
EPOCHS : 200 # number of epochs to train
EVAL_START : 100 # evaluation interval start
EVAL_INTERVAL : 1 # evaluation interval during training
AMP : false # use AMP in training
DDP : true # use DDP training
LOSS:
NAME : OhemCrossEntropy # loss function name
CLS_WEIGHTS : false # use class weights in loss calculation
OPTIMIZER:
NAME : adamw # optimizer name
LR : 0.00006 # initial learning rate used in optimizer
WEIGHT_DECAY : 0.01 # decay rate used in optimizer
SCHEDULER:
NAME : warmuppolylr # scheduler name
POWER : 0.9 # scheduler power
WARMUP : 10 # warmup epochs used in scheduler
WARMUP_RATIO : 0.1 # warmup ratio
EVAL:
# MODEL_PATH : 'output/DELIVER/cmnext_b2_deliver_rgb.pth'
# MODEL_PATH : 'output/DELIVER/cmnext_b2_deliver_rgbd.pth'
# MODEL_PATH : 'output/DELIVER/cmnext_b2_deliver_rgbe.pth'
# MODEL_PATH : 'output/DELIVER/cmnext_b2_deliver_rgbl.pth'
# MODEL_PATH : 'output/DELIVER/cmnext_b2_deliver_rgbde.pth'
# MODEL_PATH : 'output/DELIVER/cmnext_b2_deliver_rgbdl.pth'
MODEL_PATH : 'output/DELIVER/cmnext_b2_deliver_rgbdel.pth'
IMAGE_SIZE : [1024, 1024] # evaluation image size in (h, w)
BATCH_SIZE : 4 # batch size used for evaluation
MSF:
ENABLE : false # multi-scale and flip evaluation
FLIP : true # use flip in evaluation
SCALES : [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] # scales used in MSF evaluation
TEST:
MODEL_PATH : 'output/DELIVER/cmnext_b2_deliver_rgbdel.pth' # trained model file path
FILE : 'data/DELIVER' # filename or foldername
IMAGE_SIZE : [1024, 1024] # inference image size in (h, w)
OVERLAY : false # save the overlay result (image_alpha+label_alpha)
================================================
FILE: configs/kitti360_rgbdel.yaml
================================================
DEVICE : cuda # device used for training and evaluation (cpu, cuda, cuda0, cuda1, ...)
SAVE_DIR : 'output' # output folder name used for saving the model, logs and inference results
MODEL:
NAME : CMNeXt # name of the model you are using
BACKBONE : CMNeXt-B2 # model variant
PRETRAINED : 'checkpoints/pretrained/segformer/mit_b2.pth' # backbone model's weight
RESUME : '' # checkpoint file
DATASET:
NAME : KITTI360 # dataset name to be trained with (camvid, cityscapes, ade20k)
ROOT : 'data/KITTI360' # dataset root path
IGNORE_LABEL : 255
# MODALS : ['img']
# MODALS : ['img', 'depth']
# MODALS : ['img', 'event']
# MODALS : ['img', 'lidar']
# MODALS : ['img', 'depth', 'event']
# MODALS : ['img', 'depth', 'lidar']
MODALS : ['img', 'depth', 'event', 'lidar']
TRAIN:
IMAGE_SIZE : [376, 1408] # training image size in (h, w)
BATCH_SIZE : 4 # batch size used to train --- KD
EPOCHS : 40 # number of epochs to train
EVAL_START : 10 # evaluation interval start
EVAL_INTERVAL : 1 # evaluation interval during training
AMP : false # use AMP in training
DDP : true # use DDP training
LOSS:
NAME : OhemCrossEntropy # loss function name
CLS_WEIGHTS : false # use class weights in loss calculation
OPTIMIZER:
NAME : adamw # optimizer name
LR : 0.00006 # initial learning rate used in optimizer
WEIGHT_DECAY : 0.01 # decay rate used in optimizer
SCHEDULER:
NAME : warmuppolylr # scheduler name
POWER : 0.9 # scheduler power
WARMUP : 10 # warmup epochs used in scheduler
WARMUP_RATIO : 0.1 # warmup ratio
EVAL:
# MODEL_PATH : 'output/KITTI360/cmnext_b2_kitti360_rgb.pth'
# MODEL_PATH : 'output/KITTI360/cmnext_b2_kitti360_rgbd.pth'
# MODEL_PATH : 'output/KITTI360/cmnext_b2_kitti360_rgbe.pth'
# MODEL_PATH : 'output/KITTI360/cmnext_b2_kitti360_rgbl.pth'
# MODEL_PATH : 'output/KITTI360/cmnext_b2_kitti360_rgbde.pth'
# MODEL_PATH : 'output/KITTI360/cmnext_b2_kitti360_rgbdl.pth'
MODEL_PATH : 'output/KITTI360/cmnext_b2_kitti360_rgbdel.pth'
IMAGE_SIZE : [376, 1408] # evaluation image size in (h, w)
BATCH_SIZE : 4 # batch size used for evaluation
MSF:
ENABLE : false # multi-scale and flip evaluation
FLIP : true # use flip in evaluation
SCALES : [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] # scales used in MSF evaluation
TEST:
MODEL_PATH : 'output/KITTI360/cmnext_b2_kitti360_rgbdel.pth' # trained model file path
FILE : 'data/KITTI360' # filename or foldername
IMAGE_SIZE : [376, 1408] # inference image size in (h, w)
OVERLAY : false # save the overlay result (image_alpha+label_alpha)
================================================
FILE: configs/mcubes_rgbadn.yaml
================================================
DEVICE : cuda # device used for training and evaluation (cpu, cuda, cuda0, cuda1, ...)
SAVE_DIR : 'output' # output folder name used for saving the model, logs and inference results
MODEL:
NAME : CMNeXt # name of the model you are using
BACKBONE : CMNeXt-B2 # model variant
PRETRAINED : 'checkpoints/pretrained/segformer/mit_b4.pth' # backbone model's weight (NOTE: BACKBONE is CMNeXt-B2 while these are mit_b4 weights -- confirm this pairing is intended)
RESUME : '' # checkpoint file
DATASET:
NAME : MCubeS # dataset name to be trained with (camvid, cityscapes, ade20k)
ROOT : 'data/MCubeS/multimodal_dataset' # dataset root path
IGNORE_LABEL : 255
# MODALS : ['image'] #
# MODALS : ['image', 'aolp']
# MODALS : ['image', 'aolp', 'dolp']
MODALS : ['image', 'aolp', 'dolp', 'nir']
TRAIN:
IMAGE_SIZE : [512, 512] # training image size in (h, w) === Fixed in dataloader, following MCubeSNet
BATCH_SIZE : 4 # batch size used to train
EPOCHS : 500 # number of epochs to train
EVAL_START : 400 # evaluation interval start
EVAL_INTERVAL : 1 # evaluation interval during training
AMP : false # use AMP in training
DDP : true # use DDP training
LOSS:
NAME : OhemCrossEntropy # loss function name
CLS_WEIGHTS : false # use class weights in loss calculation
OPTIMIZER:
NAME : adamw # optimizer name
LR : 0.00006 # initial learning rate used in optimizer
WEIGHT_DECAY : 0.01 # decay rate used in optimizer
SCHEDULER:
NAME : warmuppolylr # scheduler name
POWER : 0.9 # scheduler power
WARMUP : 10 # warmup epochs used in scheduler
WARMUP_RATIO : 0.1 # warmup ratio
EVAL:
# MODEL_PATH : 'output/MCubeS/cmnext_b2_mcubes_rgb.pth'
# MODEL_PATH : 'output/MCubeS/cmnext_b2_mcubes_rgba.pth'
# MODEL_PATH : 'output/MCubeS/cmnext_b2_mcubes_rgbad.pth'
MODEL_PATH : 'output/MCubeS/cmnext_b2_mcubes_rgbadn.pth'
IMAGE_SIZE : [1024, 1024] # evaluation image size in (h, w)
BATCH_SIZE : 2 # batch size used for evaluation
MSF:
ENABLE : false # multi-scale and flip evaluation
FLIP : true # use flip in evaluation
SCALES : [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] # scales used in MSF evaluation
================================================
FILE: configs/mfnet_rgbt.yaml
================================================
DEVICE : cuda # device used for training and evaluation (cpu, cuda, cuda0, cuda1, ...)
SAVE_DIR : 'output' # output folder name used for saving the model, logs and inference results
MODEL:
NAME : CMNeXt # name of the model you are using
BACKBONE : CMNeXt-B4 # model variant
PRETRAINED : 'checkpoints/pretrained/segformer/mit_b4.pth' # backbone model's weight
RESUME : '' # checkpoint file
DATASET:
NAME : MFNet # dataset name to be trained with (camvid, cityscapes, ade20k)
ROOT : 'data/MFNet' # dataset root path
IGNORE_LABEL : 255
# MODALS : ['img']
MODALS : ['img', 'thermal']
TRAIN:
IMAGE_SIZE : [480, 640] # training image size in (h, w)
BATCH_SIZE : 4 # batch size used to train
EPOCHS : 500 # number of epochs to train
EVAL_START : 300 # evaluation interval start
EVAL_INTERVAL : 1 # evaluation interval during training
AMP : false # use AMP in training
DDP : true # use DDP training
LOSS:
NAME : CrossEntropy # loss function name (ohemce, ce, dice)
CLS_WEIGHTS : false # use class weights in loss calculation
OPTIMIZER:
NAME : adamw # optimizer name
LR : 0.00006 # initial learning rate used in optimizer
WEIGHT_DECAY : 0.01 # decay rate used in optimizer
SCHEDULER:
NAME : warmuppolylr # scheduler name
POWER : 0.9 # scheduler power
WARMUP : 10 # warmup epochs used in scheduler
WARMUP_RATIO : 0.1 # warmup ratio
EVAL:
MODEL_PATH : 'output/MFNet/cmnext_b4_mfnet_rgbt.pth'
IMAGE_SIZE : [480, 640] # evaluation image size in (h, w)
BATCH_SIZE : 2 # batch size used for evaluation
MSF:
ENABLE : false # multi-scale and flip evaluation
FLIP : true # use flip in evaluation
SCALES : [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] # scales used in MSF evaluation
================================================
FILE: configs/nyu_rgbd.yaml
================================================
DEVICE : cuda # device used for training and evaluation (cpu, cuda, cuda0, cuda1, ...)
SAVE_DIR : 'output' # output folder name used for saving the model, logs and inference results
MODEL:
NAME : CMNeXt # name of the model you are using
BACKBONE : CMNeXt-B4 # model variant
PRETRAINED : 'checkpoints/pretrained/segformer/mit_b4.pth' # backbone model's weight
RESUME : '' # checkpoint file
DATASET:
NAME : NYU # dataset name to be trained with (camvid, cityscapes, ade20k)
ROOT : 'data/NYUDepthv2' # dataset root path
IGNORE_LABEL : 255
# MODALS : ['img']
MODALS : ['img', 'depth']
TRAIN:
IMAGE_SIZE : [480, 640] # training image size in (h, w)
BATCH_SIZE : 4 # batch size used to train
EPOCHS : 500 # number of epochs to train
EVAL_START : 300 # evaluation interval start
EVAL_INTERVAL : 1 # evaluation interval during training
AMP : false # use AMP in training
DDP : true # use DDP training
LOSS:
NAME : CrossEntropy # loss function name
CLS_WEIGHTS : false # use class weights in loss calculation
OPTIMIZER:
NAME : adamw # optimizer name
LR : 0.00006 # initial learning rate used in optimizer
WEIGHT_DECAY : 0.01 # decay rate used in optimizer
SCHEDULER:
NAME : warmuppolylr # scheduler name
POWER : 0.9 # scheduler power
WARMUP : 10 # warmup epochs used in scheduler
WARMUP_RATIO : 0.1 # warmup ratio
EVAL:
MODEL_PATH : 'output/NYU_Depth_V2/cmnext_b4_nyu_rgbd.pth'
IMAGE_SIZE : [480, 640] # evaluation image size in (h, w)
BATCH_SIZE : 2 # batch size used for evaluation
MSF:
ENABLE : true # multi-scale and flip evaluation
FLIP : true # use flip in evaluation
SCALES : [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] # scales used in MSF evaluation
================================================
FILE: configs/urbanlf.yaml
================================================
DEVICE : cuda # device used for training and evaluation (cpu, cuda, cuda0, cuda1, ...)
SAVE_DIR : 'output' # output folder name used for saving the model, logs and inference results
MODEL:
NAME : CMNeXt # name of the model you are using
BACKBONE : CMNeXt-B4 # model variant
PRETRAINED : 'checkpoints/pretrained/segformer/mit_b4.pth' # backbone model's weight
RESUME : '' # checkpoint file
DATASET:
NAME : UrbanLF # dataset name to be trained with (camvid, cityscapes, ade20k)
# ROOT : 'data/UrBanLF/real' # dataset root path, for real dataset
ROOT : 'data/UrBanLF/Syn' # dataset root path, for synthetic dataset
IGNORE_LABEL : 255
# MODALS : ['img']
# MODALS : ['img', '5_1', '5_2', '5_3', '5_4', '5_6', '5_7', '5_8', '5_9']
# MODALS : ['img', '1_1', '1_5', '1_9', '2_2', '2_5', '2_8', '3_3', '3_5', '3_7', '4_4', '4_5', '4_6', '5_1', '5_2', '5_3', '5_4', '5_6', '5_7', '5_8', '5_9', '6_4', '6_5', '6_6', '7_3', '7_5', '7_7', '8_2', '8_5', '8_8', '9_1', '9_5', '9_9']
MODALS : ['img', '1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9', '2_1', '2_2', '2_3', '2_4', '2_5', '2_6', '2_7', '2_8', '2_9', '3_1', '3_2', '3_3', '3_4', '3_5', '3_6', '3_7', '3_8', '3_9', '4_1', '4_2', '4_3', '4_4', '4_5', '4_6', '4_7', '4_8', '4_9', '5_1', '5_2', '5_3', '5_4', '5_6', '5_7', '5_8', '5_9', '6_1', '6_2', '6_3', '6_4', '6_5', '6_6', '6_7', '6_8', '6_9', '7_1', '7_2', '7_3', '7_4', '7_5', '7_6', '7_7', '7_8', '7_9', '8_1', '8_2', '8_3', '8_4', '8_5', '8_6', '8_7', '8_8', '8_9', '9_1', '9_2', '9_3', '9_4', '9_5', '9_6', '9_7', '9_8', '9_9']
TRAIN:
IMAGE_SIZE : [480, 640] # training image size in (h, w)
BATCH_SIZE : 2 # batch size used to train
EPOCHS : 500 # number of epochs to train
EVAL_START : 300 # evaluation interval start
EVAL_INTERVAL : 1 # evaluation interval during training
AMP : false # use AMP in training
DDP : true # use DDP training
LOSS:
NAME : OhemCrossEntropy # loss function name
CLS_WEIGHTS : false # use class weights in loss calculation
OPTIMIZER:
NAME : adamw # optimizer name
LR : 0.00006 # initial learning rate used in optimizer
WEIGHT_DECAY : 0.01 # decay rate used in optimizer
SCHEDULER:
NAME : warmuppolylr # scheduler name
POWER : 0.9 # scheduler power
WARMUP : 10 # warmup epochs used in scheduler
WARMUP_RATIO : 0.1 # warmup ratio
EVAL:
# MODEL_PATH : 'output/UrbanLF/cmnext_b4_urbanlf_real_rgblf1.pth'
# MODEL_PATH : 'output/UrbanLF/cmnext_b4_urbanlf_real_rgblf8.pth'
# MODEL_PATH : 'output/UrbanLF/cmnext_b4_urbanlf_real_rgblf33.pth'
# MODEL_PATH : 'output/UrbanLF/cmnext_b4_urbanlf_real_rgblf80.pth'
# MODEL_PATH : 'output/UrbanLF/cmnext_b4_urbanlf_syn_rgblf1.pth'
# MODEL_PATH : 'output/UrbanLF/cmnext_b4_urbanlf_syn_rgblf8.pth'
# MODEL_PATH : 'output/UrbanLF/cmnext_b4_urbanlf_syn_rgblf33.pth'
MODEL_PATH : 'output/UrbanLF/cmnext_b4_urbanlf_syn_rgblf80.pth'
IMAGE_SIZE : [480, 640] # eval image size in (h, w)
BATCH_SIZE : 2 # batch size used for evaluation
MSF:
ENABLE : false # multi-scale and flip evaluation
FLIP : true # use flip in evaluation
SCALES : [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] # scales used in MSF evaluation
================================================
FILE: environment.yaml
================================================
name: cmnext
channels:
- pytorch
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- _pytorch_select=0.1=cpu_0
- blas=1.0=mkl
- bzip2=1.0.8=h7f98852_4
- ca-certificates=2021.10.26=h06a4308_2
- certifi=2021.10.8=py38h06a4308_2
- cffi=1.14.6=py38ha65f79e_0
- cudatoolkit=11.3.1=h2bc3f7f_2
- cudnn=8.2.1.32=h86fa8c9_0
- ffmpeg=4.3=hf484d3e_0
- freetype=2.10.4=h5ab3b9f_0
- future=0.18.2=py38h578d9bd_4
- gmp=6.2.1=h58526e2_0
- gnutls=3.6.13=h85f3911_1
- intel-openmp=2021.3.0=h06a4308_3350
- jpeg=9d=h7f8727e_0
- lame=3.100=h7f98852_1001
- lcms2=2.12=h3be6417_0
- ld_impl_linux-64=2.35.1=h7274673_9
- libblas=3.9.0=11_linux64_mkl
- libffi=3.3=he6710b0_2
- libgcc-ng=9.3.0=h5101ec6_17
- libgomp=9.3.0=h5101ec6_17
- libiconv=1.16=h516909a_0
- liblapack=3.9.0=11_linux64_mkl
- libpng=1.6.37=hbc83047_0
- libprotobuf=3.16.0=h780b84a_0
- libstdcxx-ng=9.3.0=hd4cf53a_17
- libtiff=4.2.0=h85742a9_0
- libuv=1.40.0=h7b6447c_0
- libwebp-base=1.2.0=h27cfd23_0
- lz4-c=1.9.3=h295c915_1
- magma=2.5.4=h6103c52_2
- mkl=2021.3.0=h06a4308_520
- mkl-service=2.4.0=py38h7f8727e_0
- mkl_fft=1.3.0=py38h42c9631_2
- mkl_random=1.2.2=py38h51133e4_0
- nccl=2.11.4.1=hdc17891_0
- ncurses=6.2=he6710b0_1
- nettle=3.6=he412f7d_0
- ninja=1.10.2=hff7bd54_1
- numpy=1.21.2=py38h20f2e39_0
- numpy-base=1.21.2=py38h79a1101_0
- olefile=0.46=pyhd3eb1b0_0
- openh264=2.1.1=h780b84a_0
- openjpeg=2.4.0=h3ad879b_0
- openssl=1.1.1m=h7f8727e_0
- pillow=8.3.1=py38h2c7a002_0
- pycparser=2.21=pyhd8ed1ab_0
- python=3.8.12=h12debd9_0
- python_abi=3.8=2_cp38
- pytorch=1.9.0=cuda112py38h3d13190_1
- pytorch-gpu=1.9.0=cuda112py38h0bbbad9_1
- pytorch-mutex=1.0=cuda
- readline=8.1=h27cfd23_0
- six=1.16.0=pyhd3eb1b0_0
- sleef=3.5.1=h7f98852_1
- sqlite=3.36.0=hc218d9a_0
- tk=8.6.11=h1ccaba5_0
- torchaudio=0.9.0=py38
- torchvision=0.10.0=py38cuda112h04b465a_0_cuda
- typing_extensions=3.10.0.2=pyh06a4308_0
- xz=5.2.5=h7b6447c_0
- yaml=0.2.5=h7b6447c_0
- zlib=1.2.11=h7b6447c_3
- zstd=1.4.9=haebb681_0
- pip:
- absl-py==1.2.0
- addict==2.4.0
- argon2-cffi==21.3.0
- argon2-cffi-bindings==21.2.0
- asttokens==2.0.5
- attrs==21.4.0
- backcall==0.2.0
- bleach==4.1.0
- cachetools==5.0.0
- charset-normalizer==2.1.1
- cycler==0.11.0
- dataclasses==0.6
- debugpy==1.5.1
- decorator==5.1.1
- defusedxml==0.7.1
- descartes==1.1.0
- easydict==1.9
- einops==0.4.1
- entrypoints==0.4
- executing==0.8.3
- fire==0.4.0
- fvcore==0.1.5.post20220512
- google-auth==2.11.0
- google-auth-oauthlib==0.4.6
- grpcio==1.48.1
- idna==3.3
- importlib-metadata==4.12.0
- importlib-resources==5.4.0
- iopath==0.1.10
- ipykernel==6.9.1
- ipython==8.1.0
- ipython-genutils==0.2.0
- ipywidgets==7.6.5
- jedi==0.18.1
- jinja2==3.0.3
- joblib==1.1.0
- jsonschema==4.4.0
- jupyter==1.0.0
- jupyter-client==7.1.2
- jupyter-console==6.4.0
- jupyter-core==4.9.2
- jupyterlab-pygments==0.1.2
- jupyterlab-widgets==1.0.2
- kiwisolver==1.3.2
- markdown==3.4.1
- markupsafe==2.1.1
- matplotlib==3.4.3
- matplotlib-inline==0.1.3
- mistune==0.8.4
- mmcv-full==1.6.1
- nbclient==0.5.11
- nbconvert==6.4.2
- nbformat==5.1.3
- nest-asyncio==1.5.4
- notebook==6.4.8
- nuscenes-devkit==1.1.9
- oauthlib==3.2.1
- opencv-python==4.5.3.56
- packaging==21.3
- pandocfilters==1.5.0
- parso==0.8.3
- pexpect==4.8.0
- pickleshare==0.7.5
- pip==22.0.3
- plyfile==0.7.4
- portalocker==2.5.1
- prometheus-client==0.13.1
- prompt-toolkit==3.0.28
- protobuf==3.18.1
- ptyprocess==0.7.0
- pure-eval==0.2.2
- pyasn1==0.4.8
- pyasn1-modules==0.2.8
- pycocotools==2.0.4
- pygments==2.11.2
- pyparsing==3.0.6
- pyquaternion==0.9.9
- pyrsistent==0.18.1
- python-dateutil==2.8.2
- pyyaml==6.0
- pyzmq==22.3.0
- qtconsole==5.2.2
- qtpy==2.0.1
- requests==2.28.1
- requests-oauthlib==1.3.1
- rsa==4.9
- scikit-learn==1.0.2
- scipy==1.7.1
- send2trash==1.8.0
- setuptools==59.5.0
- shapely==1.8.1.post1
- stack-data==0.2.0
- tabulate==0.8.10
- tensorboard==2.10.0
- tensorboard-data-server==0.6.1
- tensorboard-plugin-wit==1.8.1
- tensorboardx==2.4
- termcolor==1.1.0
- terminado==0.13.1
- testpath==0.6.0
- threadpoolctl==3.1.0
- timm==0.4.12
- tornado==6.1
- tqdm==4.62.3
- traitlets==5.1.1
- urllib3==1.26.12
- wcwidth==0.2.5
- webencodings==0.5.1
- werkzeug==2.2.2
- wheel==0.37.1
- widgetsnbextension==3.5.2
- yacs==0.1.8
- yapf==0.32.0
- zipp==3.7.0
================================================
FILE: requirements.txt
================================================
absl-py==1.2.0
addict==2.4.0
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.0.5
attrs==21.4.0
backcall==0.2.0
bleach==4.1.0
cachetools==5.0.0
charset-normalizer==2.1.1
cycler==0.11.0
dataclasses==0.6
debugpy==1.5.1
decorator==5.1.1
defusedxml==0.7.1
descartes==1.1.0
easydict==1.9
einops==0.4.1
entrypoints==0.4
executing==0.8.3
fire==0.4.0
fvcore==0.1.5.post20220512
google-auth==2.11.0
google-auth-oauthlib==0.4.6
grpcio==1.48.1
idna==3.3
importlib-metadata==4.12.0
importlib-resources==5.4.0
iopath==0.1.10
ipykernel==6.9.1
ipython==8.1.0
ipython-genutils==0.2.0
ipywidgets==7.6.5
jedi==0.18.1
jinja2==3.0.3
joblib==1.1.0
jsonschema==4.4.0
jupyter==1.0.0
jupyter-client==7.1.2
jupyter-console==6.4.0
jupyter-core==4.9.2
jupyterlab-pygments==0.1.2
jupyterlab-widgets==1.0.2
kiwisolver==1.3.2
markdown==3.4.1
markupsafe==2.1.1
matplotlib==3.4.3
matplotlib-inline==0.1.3
mistune==0.8.4
mmcv-full==1.6.1
nbclient==0.5.11
nbconvert==6.4.2
nbformat==5.1.3
nest-asyncio==1.5.4
notebook==6.4.8
nuscenes-devkit==1.1.9
oauthlib==3.2.1
opencv-python==4.5.3.56
packaging==21.3
pandocfilters==1.5.0
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
pip==22.0.3
plyfile==0.7.4
portalocker==2.5.1
prometheus-client==0.13.1
prompt-toolkit==3.0.28
protobuf==3.18.1
ptyprocess==0.7.0
pure-eval==0.2.2
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycocotools==2.0.4
pygments==2.11.2
pyparsing==3.0.6
pyquaternion==0.9.9
pyrsistent==0.18.1
python-dateutil==2.8.2
pyyaml==6.0
pyzmq==22.3.0
qtconsole==5.2.2
qtpy==2.0.1
requests==2.28.1
requests-oauthlib==1.3.1
rsa==4.9
scikit-learn==1.0.2
scipy==1.7.1
send2trash==1.8.0
setuptools==59.5.0
shapely==1.8.1.post1
stack-data==0.2.0
tabulate==0.8.10
tensorboard==2.10.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
tensorboardx==2.4
termcolor==1.1.0
terminado==0.13.1
testpath==0.6.0
threadpoolctl==3.1.0
timm==0.4.12
tornado==6.1
tqdm==4.62.3
traitlets==5.1.1
urllib3==1.26.12
wcwidth==0.2.5
webencodings==0.5.1
werkzeug==2.2.2
wheel==0.37.1
widgetsnbextension==3.5.2
yacs==0.1.8
yapf==0.32.0
zipp==3.7.0
================================================
FILE: semseg/__init__.py
================================================
from tabulate import tabulate
from semseg import models
from semseg import datasets
from semseg.models import backbones, heads
def show_models():
    """Print a numbered table of the names listed in ``models.__all__``."""
    names = models.__all__
    table = {
        'No.': [index for index, _ in enumerate(names, start=1)],
        'Model Names': names,
    }
    print(tabulate(table, headers='keys'))
def show_backbones():
    """Print a table of the available backbones and their variants.

    For every name in ``backbones.__all__`` the attribute
    ``<name.lower()>_settings`` is looked up on the ``backbones`` package and
    its keys are listed as the variants. ``'-'`` is shown when no such
    settings mapping is available.
    """
    backbone_names = backbones.__all__
    variants = []
    for name in backbone_names:
        # A plain attribute lookup replaces eval() on a constructed string.
        settings = getattr(backbones, f"{name.lower()}_settings", None)
        try:
            variants.append(list(settings.keys()))
        except AttributeError:
            # Missing settings (None) or an object without .keys(); only this
            # error is handled so unrelated failures are not silently hidden.
            variants.append('-')
    print(tabulate({'Backbone Names': backbone_names, 'Variants': variants}, headers='keys'))
def show_heads():
    """Print a numbered table of the names listed in ``heads.__all__``."""
    names = heads.__all__
    table = {
        'No.': [index for index, _ in enumerate(names, start=1)],
        'Heads': names,
    }
    print(tabulate(table, headers='keys'))
def show_datasets():
    """Print a numbered table of the names listed in ``datasets.__all__``."""
    names = datasets.__all__
    table = {
        'No.': [index for index, _ in enumerate(names, start=1)],
        'Datasets': names,
    }
    print(tabulate(table, headers='keys'))
================================================
FILE: semseg/augmentations.py
================================================
import torchvision.transforms.functional as TF
import random
import math
import torch
from torch import Tensor
from typing import Tuple, List, Union, Tuple, Optional
class Compose:
    """Chain several paired (image, mask) transforms into a single callable."""

    def __init__(self, transforms: list) -> None:
        """
        Args:
            transforms: callables taking ``(img, mask)`` and returning ``(img, mask)``.
        """
        self.transforms = transforms

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        """Check that image and mask share spatial dims, then apply each transform in order."""
        # A 2-D mask is compared as a whole; otherwise its leading dim is skipped.
        mask_hw = mask.shape if mask.ndim == 2 else mask.shape[1:]
        assert img.shape[1:] == mask_hw

        for step in self.transforms:
            img, mask = step(img, mask)
        return img, mask
class Normalize:
    """Convert the image to float, divide by 255 and standardize it per channel."""

    def __init__(self, mean: list = (0.485, 0.456, 0.406), std: list = (0.229, 0.224, 0.225)):
        """
        Args:
            mean: per-channel mean passed to ``TF.normalize``.
            std: per-channel standard deviation passed to ``TF.normalize``.
        """
        self.mean = mean
        self.std = std

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        """Return the normalized image together with the untouched mask."""
        # Divide out of place: ``img.float()`` hands back the very same tensor
        # when the input is already float32, so an in-place ``/=`` would
        # silently rescale the caller's data.
        img = img.float() / 255
        img = TF.normalize(img, self.mean, self.std)
        return img, mask
class ColorJitter:
    """Apply fixed brightness/contrast/saturation/hue adjustments to the image.

    Each stored factor is passed unchanged to the matching ``TF.adjust_*``
    function; a factor that is not greater than zero disables that step.
    """

    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0) -> None:
        self.brightness = brightness
        self.contrast = contrast
        self.saturation = saturation
        self.hue = hue

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        """Run the enabled adjustments in brightness, contrast, saturation, hue order."""
        steps = (
            (self.brightness, 'adjust_brightness'),
            (self.contrast, 'adjust_contrast'),
            (self.saturation, 'adjust_saturation'),
            (self.hue, 'adjust_hue'),
        )
        for factor, op_name in steps:
            if factor > 0:
                img = getattr(TF, op_name)(img, factor)
        return img, mask
class AdjustGamma:
    """Gamma-correct the image with a fixed gamma and gain; the mask is untouched."""

    def __init__(self, gamma: float, gain: float = 1) -> None:
        """
        Args:
            gamma: Non-negative real number. Values above 1 darken the shadows,
                values below 1 lighten dark regions.
            gain: constant multiplier
        """
        self.gamma, self.gain = gamma, gain

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        corrected = TF.adjust_gamma(img, self.gamma, self.gain)
        return corrected, mask
class RandomAdjustSharpness:
    """With probability ``p``, adjust the image sharpness by a fixed factor."""

    def __init__(self, sharpness_factor: float, p: float = 0.5) -> None:
        self.sharpness = sharpness_factor
        self.p = p

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        apply = random.random() < self.p
        if not apply:
            return img, mask
        return TF.adjust_sharpness(img, self.sharpness), mask
class RandomAutoContrast:
    """With probability ``p``, apply ``TF.autocontrast`` to the image."""

    def __init__(self, p: float = 0.5) -> None:
        self.p = p

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        apply = random.random() < self.p
        if not apply:
            return img, mask
        return TF.autocontrast(img), mask
class RandomGaussianBlur:
    """With probability ``p``, blur the image with a Gaussian kernel of fixed size."""

    def __init__(self, kernel_size: int = 3, p: float = 0.5) -> None:
        self.kernel_size = kernel_size
        self.p = p

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        apply = random.random() < self.p
        if not apply:
            return img, mask
        return TF.gaussian_blur(img, self.kernel_size), mask
class RandomHorizontalFlip:
    """With probability ``p``, flip image and mask horizontally together."""

    def __init__(self, p: float = 0.5) -> None:
        self.p = p

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        flip = random.random() < self.p
        if not flip:
            return img, mask
        flipped_img = TF.hflip(img)
        flipped_mask = TF.hflip(mask)
        return flipped_img, flipped_mask
class RandomVerticalFlip:
    """With probability ``p``, flip image and mask vertically together."""

    def __init__(self, p: float = 0.5) -> None:
        self.p = p

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        flip = random.random() < self.p
        if not flip:
            return img, mask
        flipped_img = TF.vflip(img)
        flipped_mask = TF.vflip(mask)
        return flipped_img, flipped_mask
class RandomGrayscale:
    """With probability ``p``, convert the image to 3-channel grayscale."""

    def __init__(self, p: float = 0.5) -> None:
        self.p = p

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        apply = random.random() < self.p
        if not apply:
            return img, mask
        # Three output channels are requested from rgb_to_grayscale.
        return TF.rgb_to_grayscale(img, 3), mask
class Equalize:
    """Apply ``TF.equalize`` to the image; the label is returned unchanged."""

    def __call__(self, image, label):
        equalized = TF.equalize(image)
        return equalized, label
class Posterize:
    """Posterize the image to a fixed number of bits; the label is returned unchanged."""

    def __init__(self, bits=2):
        # Expected range is 0-8.
        self.bits = bits

    def __call__(self, image, label):
        posterized = TF.posterize(image, self.bits)
        return posterized, label
class Affine:
    """Apply one fixed affine transform to an image and its label map.

    The image is resampled bilinearly with fill value 0, the label with
    nearest-neighbour interpolation and fill value ``seg_fill``.
    """

    def __init__(self, angle=0, translate=[0, 0], scale=1.0, shear=[0, 0], seg_fill=0):
        """
        Args:
            angle: rotation angle forwarded to ``TF.affine``.
            translate: translation forwarded to ``TF.affine``.
            scale: scale factor forwarded to ``TF.affine``.
            shear: shear forwarded to ``TF.affine``.
            seg_fill: fill value used for label pixels outside the transformed area.
        """
        self.angle = angle
        # Copy list arguments so that instances never share the default lists
        # declared in the signature (mutating one instance's attribute would
        # otherwise change every default-constructed instance).
        self.translate = list(translate) if isinstance(translate, list) else translate
        self.scale = scale
        self.shear = list(shear) if isinstance(shear, list) else shear
        self.seg_fill = seg_fill

    def __call__(self, img, label):
        warped_img = TF.affine(
            img, self.angle, self.translate, self.scale, self.shear,
            TF.InterpolationMode.BILINEAR, 0,
        )
        warped_label = TF.affine(
            label, self.angle, self.translate, self.scale, self.shear,
            TF.InterpolationMode.NEAREST, self.seg_fill,
        )
        return warped_img, warped_label
class RandomRotation:
    """Randomly rotate image and mask by the same angle."""

    def __init__(self, degrees: float = 10.0, p: float = 0.2, seg_fill: int = 0, expand: bool = False) -> None:
        """Rotate by a random angle drawn from [-degrees, degrees] with probability p.

        Args:
            degrees: maximum absolute rotation angle in degrees, counter-clockwise.
            p: probability of applying the rotation.
            seg_fill: fill value for mask pixels uncovered by the rotation.
            expand: Optional expansion flag.
                If true, expands the output image to make it large enough to hold the entire rotated image.
                If false or omitted, make the output image the same size as the input image.
                Note that the expand flag assumes rotation around the center and no translation.
        """
        self.angle = degrees
        self.p = p
        self.seg_fill = seg_fill
        self.expand = expand

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        # The angle is drawn before the probability check, so two random
        # numbers are consumed on every call.
        draw = random.random()
        theta = draw * 2 * self.angle - self.angle
        if not random.random() < self.p:
            return img, mask
        rotated_img = TF.rotate(img, theta, TF.InterpolationMode.BILINEAR, self.expand, fill=0)
        rotated_mask = TF.rotate(mask, theta, TF.InterpolationMode.NEAREST, self.expand, fill=self.seg_fill)
        return rotated_img, rotated_mask
class CenterCrop:
    """Crop image and mask around their center."""

    def __init__(self, size: Union[int, List[int], Tuple[int]]) -> None:
        """Crops the image at the center

        Args:
            size: height and width of the crop box. If int, this size is used for both directions.
        """
        if isinstance(size, int):
            size = (size, size)
        self.size = size

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        cropped_img = TF.center_crop(img, self.size)
        cropped_mask = TF.center_crop(mask, self.size)
        return cropped_img, cropped_mask
class RandomCrop:
    """With probability ``p``, cut the same random window out of image and mask."""

    def __init__(self, size: Union[int, List[int], Tuple[int]], p: float = 0.5) -> None:
        """Randomly Crops the image.

        Args:
            size: height and width of the crop box. If int, this size is used for both directions.
            p: probability of applying the crop.
        """
        self.size = (size, size) if isinstance(size, int) else size
        self.p = p

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        """Return the (possibly cropped) image and mask.

        Both tensors are sliced as ``[:, y1:y2, x1:x2]``, so the mask needs a
        leading dimension just like the image. Inputs smaller than the crop
        box are returned at their own size along that axis.
        """
        H, W = img.shape[1:]
        tH, tW = self.size

        if random.random() < self.p:
            # random.randint includes both endpoints, so the largest valid
            # offset is the margin itself; using margin + 1 could start the
            # window one pixel too far and yield a crop one pixel short.
            y1 = random.randint(0, max(H - tH, 0))
            x1 = random.randint(0, max(W - tW, 0))
            y2, x2 = y1 + tH, x1 + tW
            img = img[:, y1:y2, x1:x2]
            mask = mask[:, y1:y2, x1:x2]
        return img, mask
class Pad:
    """Grow image and mask to a target size by padding."""

    def __init__(self, size: Union[List[int], Tuple[int], int], seg_fill: int = 0) -> None:
        """Pad the given image up to the requested size.

        Args:
            size: expected output image size (h, w)
            seg_fill: fill value used when padding the mask.
        """
        self.size = size
        self.seg_fill = seg_fill

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        target_h, target_w = self.size[0], self.size[1]
        extra_w = target_w - img.shape[2]
        extra_h = target_h - img.shape[1]
        # The first two entries are zero; only the last two carry the missing
        # width and height.
        padding = (0, 0, extra_w, extra_h)
        padded_img = TF.pad(img, padding)
        padded_mask = TF.pad(mask, padding, self.seg_fill)
        return padded_img, padded_mask
class ResizePad:
    """Rescale image and mask by a common factor, then pad them to the target size."""

    def __init__(self, size: Union[int, Tuple[int], List[int]], seg_fill: int = 0) -> None:
        """Resize the input image to the given size.

        Args:
            size: Desired output size; unpacked as ``(h, w)`` when called.
            seg_fill: fill value used when padding the mask.
        """
        self.size = size
        self.seg_fill = seg_fill

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        src_h, src_w = img.shape[1:]
        dst_h, dst_w = self.size

        # Landscape inputs use the smaller of the two ratios, all other
        # inputs the larger one.
        ratio_h, ratio_w = dst_h / src_h, dst_w / src_w
        if src_w > src_h:
            factor = min(ratio_h, ratio_w)
        else:
            factor = max(ratio_h, ratio_w)

        new_h, new_w = round(src_h * factor), round(src_w * factor)
        img = TF.resize(img, (new_h, new_w), TF.InterpolationMode.BILINEAR)
        mask = TF.resize(mask, (new_h, new_w), TF.InterpolationMode.NEAREST)

        # Fill the remaining width/height up to the target size.
        pad_spec = [0, 0, dst_w - new_w, dst_h - new_h]
        img = TF.pad(img, pad_spec, fill=0)
        mask = TF.pad(mask, pad_spec, fill=self.seg_fill)
        return img, mask
class Resize:
    """Rescale image and mask so the shorter edge matches ``size[0]``, then align to 32."""

    def __init__(self, size: Union[int, Tuple[int], List[int]]) -> None:
        """Resize the input image to the given size.

        Args:
            size: Desired output size; only its first entry is read, as the
                target length of the shorter image edge.
        """
        self.size = size

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        src_h, src_w = img.shape[1:]

        # First pass: aspect-preserving rescale of the shorter edge.
        factor = self.size[0] / min(src_h, src_w)
        scaled = (round(src_h * factor), round(src_w * factor))
        img = TF.resize(img, scaled, TF.InterpolationMode.BILINEAR)
        mask = TF.resize(mask, scaled, TF.InterpolationMode.NEAREST)

        # Second pass: round both edges up to the next multiple of 32 so the
        # result is divisible by the stride.
        aligned = tuple(int(math.ceil(edge / 32)) * 32 for edge in scaled)
        img = TF.resize(img, aligned, TF.InterpolationMode.BILINEAR)
        mask = TF.resize(mask, aligned, TF.InterpolationMode.NEAREST)
        return img, mask
class RandomResizedCrop:
    """Randomly rescale image and mask, crop a random window and pad to ``size``."""

    def __init__(self, size: Union[int, Tuple[int], List[int]], scale: Tuple[float, float] = (0.5, 2.0), seg_fill: int = 0) -> None:
        """Resize the input image to the given size.

        Args:
            size: target output size; unpacked as ``(h, w)`` when called.
            scale: lower and upper bound of the random scale ratio.
            seg_fill: fill value used when padding the mask.
        """
        self.size = size
        self.scale = scale
        self.seg_fill = seg_fill

    @staticmethod
    def _sample_origin(height: int, width: int, tH: int, tW: int) -> Tuple[int, int]:
        """Draw the top-left corner ``(y1, x1)`` of a ``tH x tW`` window inside ``height x width``."""
        # random.randint includes both endpoints, so the upper bound must be
        # the margin itself; margin + 1 let the window start one pixel too
        # far, which produced a short crop that was then padded back.
        y1 = random.randint(0, max(height - tH, 0))
        x1 = random.randint(0, max(width - tW, 0))
        return y1, x1

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        H, W = img.shape[1:]
        tH, tW = self.size

        # get the scale
        ratio = random.random() * (self.scale[1] - self.scale[0]) + self.scale[0]
        # NOTE(review): the width bound uses 4 * tW; the rationale is not
        # visible in this file -- kept exactly as in the original.
        scale = int(tH*ratio), int(tW*4*ratio)

        # scale the image, keeping its aspect ratio
        scale_factor = min(max(scale)/max(H, W), min(scale)/min(H, W))
        nH, nW = int(H * scale_factor + 0.5), int(W * scale_factor + 0.5)
        img = TF.resize(img, (nH, nW), TF.InterpolationMode.BILINEAR)
        mask = TF.resize(mask, (nH, nW), TF.InterpolationMode.NEAREST)

        # random crop
        y1, x1 = self._sample_origin(img.shape[1], img.shape[2], tH, tW)
        y2, x2 = y1 + tH, x1 + tW
        img = img[:, y1:y2, x1:x2]
        mask = mask[:, y1:y2, x1:x2]

        # pad the image when the resized input was smaller than the target;
        # compare as tuples so a list-valued ``size`` does not always mismatch
        if tuple(img.shape[1:]) != (tH, tW):
            padding = [0, 0, tW - img.shape[2], tH - img.shape[1]]
            img = TF.pad(img, padding, fill=0)
            mask = TF.pad(mask, padding, fill=self.seg_fill)
        return img, mask
def get_train_augmentation(size: Union[int, Tuple[int], List[int]], seg_fill: int = 0):
    """Build the training pipeline: horizontal flip, random resized crop, normalization.

    Args:
        size: crop size forwarded to ``RandomResizedCrop``.
        seg_fill: mask padding value forwarded to ``RandomResizedCrop``.

    Returns:
        A ``Compose`` wrapping the three active transforms.
    """
    # Currently disabled candidates: ColorJitter, RandomAdjustSharpness, RandomAutoContrast,
    # RandomVerticalFlip, RandomGaussianBlur, RandomGrayscale, RandomRotation.
    pipeline = [
        RandomHorizontalFlip(p=0.5),
        RandomResizedCrop(size, scale=(0.5, 2.0), seg_fill=seg_fill),
        # mean/std constants — presumably the usual ImageNet statistics
        Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]
    return Compose(pipeline)
def get_val_augmentation(size: Union[int, Tuple[int], List[int]]):
    """Build the validation pipeline: deterministic resize followed by normalization.

    Args:
        size: target size forwarded to ``Resize``.

    Returns:
        A ``Compose`` of ``Resize`` and ``Normalize``.
    """
    resize = Resize(size)
    normalize = Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    return Compose([resize, normalize])
if __name__ == '__main__':
    # Smoke test: push a random 230x420 image/mask pair through RandomResizedCrop
    # and print the resulting shapes.
    h, w = 230, 420
    img = torch.randn(3, h, w)
    mask = torch.randn(1, h, w)
    # Other candidates to try here: RandomCrop((512, 512), p=1.0), Pad((512, 512))
    transforms_under_test = [RandomResizedCrop((512, 512))]
    aug = Compose(transforms_under_test)
    img, mask = aug(img, mask)
    print(img.shape, mask.shape)
================================================
FILE: semseg/augmentations_mm.py
================================================
import torchvision.transforms.functional as TF
import random
import math
import torch
from torch import Tensor
from typing import Tuple, List, Union, Tuple, Optional
class Compose:
    """Apply a sequence of transforms to a multi-modal sample.

    Despite the ``list`` annotations, ``sample`` is indexed by string keys
    ('img', 'mask', ...), i.e. it is used as a dict.
    """

    def __init__(self, transforms: list) -> None:
        # callables, each taking a sample and returning a sample
        self.transforms = transforms

    def __call__(self, sample: list) -> list:
        """Check that image and mask agree spatially, then run every transform in order."""
        image, target = sample['img'], sample['mask']
        # a 2-D mask is (H, W); otherwise the leading dimension is dropped before comparing
        target_hw = target.shape if target.ndim == 2 else target.shape[1:]
        assert image.shape[1:] == target_hw
        for op in self.transforms:
            sample = op(sample)
        return sample
class Normalize:
    """Scale every non-mask entry of a sample to float / 255; additionally standardize 'img'."""

    def __init__(self, mean: list = (0.485, 0.456, 0.406), std: list = (0.229, 0.224, 0.225)):
        self.mean = mean
        self.std = std

    def __call__(self, sample: list) -> list:
        """Normalize the sample in place and return it.

        'mask' is left untouched; 'img' is divided by 255 and then passed through
        ``TF.normalize`` with ``mean``/``std``; every other key is only divided by 255.
        """
        for key in sample:
            if key == 'mask':
                continue
            scaled = sample[key].float()
            scaled /= 255  # in-place division, as in a plain ``/=`` on the stored tensor
            if key == 'img':
                scaled = TF.normalize(scaled, self.mean, self.std)
            sample[key] = scaled
        return sample
class RandomColorJitter:
    """With probability ``p``, jitter brightness, contrast and saturation of sample['img']."""

    def __init__(self, p=0.5) -> None:
        self.p = p

    def __call__(self, sample: list) -> list:
        """Apply the three adjustments in a fixed order; other entries are untouched.

        Each factor is drawn from ``random.uniform(0.5, 1.5)`` and also stored on the
        instance as ``brightness`` / ``contrast`` / ``saturation``.
        """
        if random.random() >= self.p:
            return sample
        steps = (
            ('brightness', TF.adjust_brightness),
            ('contrast', TF.adjust_contrast),
            ('saturation', TF.adjust_saturation),
        )
        for attr, adjust in steps:
            factor = random.uniform(0.5, 1.5)
            setattr(self, attr, factor)
            sample['img'] = adjust(sample['img'], factor)
        return sample
class AdjustGamma:
    """Gamma-correct the image; the mask is passed through unchanged.

    Note: this transform takes ``(img, mask)`` positionally rather than a sample dict.
    """

    def __init__(self, gamma: float, gain: float = 1) -> None:
        """
        Args:
            gamma: Non-negative real number. Values above 1 darken shadows, values below 1 lighten dark regions.
            gain: constant multiplier
        """
        self.gamma, self.gain = gamma, gain

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        corrected = TF.adjust_gamma(img, self.gamma, self.gain)
        return corrected, mask
class RandomAdjustSharpness:
    """With probability ``p``, adjust the sharpness of sample['img'] by a fixed factor."""

    def __init__(self, sharpness_factor: float, p: float = 0.5) -> None:
        self.sharpness = sharpness_factor
        self.p = p

    def __call__(self, sample: list) -> list:
        """Return the sample, with 'img' sharpened when the random draw is below ``p``."""
        if random.random() >= self.p:
            return sample
        sample['img'] = TF.adjust_sharpness(sample['img'], self.sharpness)
        return sample
class RandomAutoContrast:
    """With probability ``p``, apply ``TF.autocontrast`` to sample['img']."""

    def __init__(self, p: float = 0.5) -> None:
        self.p = p

    def __call__(self, sample: list) -> list:
        """Return the sample; only the 'img' entry may be modified."""
        apply = random.random() < self.p
        if apply:
            sample['img'] = TF.autocontrast(sample['img'])
        return sample
class RandomGaussianBlur:
    """With probability ``p``, blur sample['img'] with a Gaussian kernel of ``kernel_size``."""

    def __init__(self, kernel_size: int = 3, p: float = 0.5) -> None:
        self.kernel_size = kernel_size
        self.p = p

    def __call__(self, sample: list) -> list:
        """Return the sample; only the 'img' entry is blurred, other entries are untouched."""
        if random.random() >= self.p:
            return sample
        sample['img'] = TF.gaussian_blur(sample['img'], self.kernel_size)
        return sample
class RandomHorizontalFlip:
    """With probability ``p``, horizontally flip every entry of the sample (mask included)."""

    def __init__(self, p: float = 0.5) -> None:
        self.p = p

    def __call__(self, sample: list) -> list:
        """Return the sample, with all entries flipped together or none at all."""
        if random.random() >= self.p:
            return sample
        for key in sample:
            sample[key] = TF.hflip(sample[key])
        return sample
class RandomVerticalFlip:
    """With probability ``p``, vertically flip image and mask together.

    Note: this transform takes ``(img, mask)`` positionally rather than a sample dict.
    """

    def __init__(self, p: float = 0.5) -> None:
        self.p = p

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        if random.random() >= self.p:
            return img, mask
        return TF.vflip(img), TF.vflip(mask)
class RandomGrayscale:
    """With probability ``p``, convert the image to 3-channel grayscale; the mask is untouched.

    Note: this transform takes ``(img, mask)`` positionally rather than a sample dict.
    """

    def __init__(self, p: float = 0.5) -> None:
        self.p = p

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        convert = random.random() < self.p
        # 3 output channels keep the tensor layout of the RGB input
        result = TF.rgb_to_grayscale(img, 3) if convert else img
        return result, mask
class Equalize:
    """Apply ``TF.equalize`` to the image; the label is returned unchanged.

    Note: this transform takes ``(image, label)`` positionally rather than a sample dict.
    """

    def __call__(self, image, label):
        equalized = TF.equalize(image)
        return equalized, label
class Posterize:
    """Posterize the image with ``TF.posterize``; the label is returned unchanged.

    Note: this transform takes ``(image, label)`` positionally rather than a sample dict.
    """

    def __init__(self, bits=2):
        # number of bits passed to TF.posterize; expected range 0-8
        self.bits = bits

    def __call__(self, image, label):
        posterized = TF.posterize(image, self.bits)
        return posterized, label
class Affine:
    """Apply one fixed affine transform to an image and its label.

    Note: this transform takes ``(img, label)`` positionally rather than a sample dict.
    """

    def __init__(self, angle=0, translate=None, scale=1.0, shear=None, seg_fill=0):
        """
        Args:
            angle: rotation angle forwarded to ``TF.affine``.
            translate: translation forwarded to ``TF.affine``; defaults to ``[0, 0]``.
            scale: scale factor forwarded to ``TF.affine``.
            shear: shear forwarded to ``TF.affine``; defaults to ``[0, 0]``.
            seg_fill: fill value for label areas outside the transformed input.
        """
        self.angle = angle
        # Build a fresh list per instance: a list literal as the default argument would be
        # one object shared (and mutable) across every Affine created without arguments.
        self.translate = [0, 0] if translate is None else translate
        self.scale = scale
        self.shear = [0, 0] if shear is None else shear
        self.seg_fill = seg_fill

    def __call__(self, img, label):
        """Return ``(img, label)``: image bilinear with fill 0, label nearest with ``seg_fill``."""
        warped_img = TF.affine(img, self.angle, self.translate, self.scale, self.shear, TF.InterpolationMode.BILINEAR, 0)
        warped_label = TF.affine(label, self.angle, self.translate, self.scale, self.shear, TF.InterpolationMode.NEAREST, self.seg_fill)
        return warped_img, warped_label
class RandomRotation:
    """With probability ``p``, rotate every entry of the sample by one shared random angle."""

    def __init__(self, degrees: float = 10.0, p: float = 0.2, seg_fill: int = 0, expand: bool = False) -> None:
        """
        Args:
            degrees: maximum absolute rotation; the angle is drawn from [-degrees, degrees).
            p: probability of applying the rotation.
            seg_fill: fill value for the 'mask' entry outside the rotated input.
            expand: forwarded to ``TF.rotate``. If true, the output is enlarged to hold the
                whole rotated image; if false, the output keeps the input size.
        """
        self.p = p
        self.angle = degrees
        self.expand = expand
        self.seg_fill = seg_fill

    def __call__(self, sample: list) -> list:
        """Rotate all entries: 'mask' with nearest/``seg_fill``, the rest with bilinear/0."""
        # The angle is drawn before the probability check, so two random numbers are
        # consumed on every call whether or not the rotation is applied.
        theta = random.random() * 2 * self.angle - self.angle
        if random.random() >= self.p:
            return sample
        for key in sample:
            if key == 'mask':
                mode, fill = TF.InterpolationMode.NEAREST, self.seg_fill
            else:
                mode, fill = TF.InterpolationMode.BILINEAR, 0
            sample[key] = TF.rotate(sample[key], theta, mode, self.expand, fill=fill)
        return sample
class CenterCrop:
    """Crop image and mask at the center.

    Note: this transform takes ``(img, mask)`` positionally rather than a sample dict.
    """

    def __init__(self, size: Union[int, List[int], Tuple[int]]) -> None:
        """
        Args:
            size: height and width of the crop box. If int, this size is used for both directions.
        """
        if isinstance(size, int):
            size = (size, size)
        self.size = size

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        cropped_img = TF.center_crop(img, self.size)
        cropped_mask = TF.center_crop(mask, self.size)
        return cropped_img, cropped_mask
class RandomCrop:
    """With probability ``p``, crop image and mask at the same random location.

    Note: this transform takes ``(img, mask)`` positionally rather than a sample dict.
    """

    def __init__(self, size: Union[int, List[int], Tuple[int]], p: float = 0.5) -> None:
        """
        Args:
            size: height and width of the crop box. If int, this size is used for both directions.
            p: probability of cropping; otherwise the inputs are returned unchanged.
        """
        self.size = (size, size) if isinstance(size, int) else size
        self.p = p

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        """Return the (possibly cropped) ``(img, mask)`` pair.

        When the input is at least as large as ``size`` the output is exactly ``size``;
        a smaller input dimension is returned at its full extent (no padding is done here).
        """
        H, W = img.shape[1:]
        tH, tW = self.size

        if random.random() < self.p:
            margin_h = max(H - tH, 0)
            margin_w = max(W - tW, 0)
            # random.randint includes BOTH bounds, so the largest valid offset is the
            # margin itself; margin + 1 would cut the crop one pixel short.
            y1 = random.randint(0, margin_h)
            x1 = random.randint(0, margin_w)
            y2 = y1 + tH
            x2 = x1 + tW
            img = img[:, y1:y2, x1:x2]
            mask = mask[:, y1:y2, x1:x2]
        return img, mask
class Pad:
    """Pad image and mask on the right and bottom up to a target size.

    Note: this transform takes ``(img, mask)`` positionally rather than a sample dict.
    """

    def __init__(self, size: Union[List[int], Tuple[int], int], seg_fill: int = 0) -> None:
        """
        Args:
            size: expected output image size (h, w)
            seg_fill: fill value used for the mask padding; the image uses TF.pad's default fill.
        """
        self.size = size
        self.seg_fill = seg_fill

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        # padding order is (left, top, right, bottom): only right and bottom are extended
        extra_w = self.size[1] - img.shape[2]
        extra_h = self.size[0] - img.shape[1]
        padding = (0, 0, extra_w, extra_h)
        padded_img = TF.pad(img, padding)
        padded_mask = TF.pad(mask, padding, self.seg_fill)
        return padded_img, padded_mask
class ResizePad:
    """Resize image and mask with a shared scale factor, then pad right/bottom to ``size``.

    Note: this transform takes ``(img, mask)`` positionally rather than a sample dict.
    """

    def __init__(self, size: Union[int, Tuple[int], List[int]], seg_fill: int = 0) -> None:
        """
        Args:
            size: desired output size; ``__call__`` unpacks it as (height, width).
            seg_fill: fill value used when padding the mask.
        """
        self.size = size
        self.seg_fill = seg_fill

    def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
        src_h, src_w = img.shape[1:]
        dst_h, dst_w = self.size

        # scale the image: landscape inputs use the smaller of the two ratios,
        # all other inputs use the larger one
        ratio_h, ratio_w = dst_h / src_h, dst_w / src_w
        if src_w > src_h:
            factor = min(ratio_h, ratio_w)
        else:
            factor = max(ratio_h, ratio_w)
        new_h, new_w = round(src_h * factor), round(src_w * factor)
        img = TF.resize(img, (new_h, new_w), TF.InterpolationMode.BILINEAR)
        mask = TF.resize(mask, (new_h, new_w), TF.InterpolationMode.NEAREST)

        # pad the image on the right and bottom up to the target size
        # NOTE(review): with the ``max`` branch a resized side can exceed the target,
        # which makes the corresponding padding negative — confirm this is intended.
        border = [0, 0, dst_w - new_w, dst_h - new_h]
        img = TF.pad(img, border, fill=0)
        mask = TF.pad(mask, border, fill=self.seg_fill)
        return img, mask
class Resize:
    """Resize every entry of a sample so the short edge matches a target, aligned to 32."""

    def __init__(self, size: Union[int, Tuple[int], List[int]]) -> None:
        """
        Args:
            size: Desired output size. Only the first entry is used: the smaller edge of the
                image is matched to it while keeping the aspect ratio. An int is expanded to
                ``(size, size)`` so it is accepted as well (it used to fail on ``size[0]``).
        """
        self.size = (size, size) if isinstance(size, int) else size

    def __call__(self, sample: dict) -> dict:
        """Resize all entries in place and return the sample.

        'mask' uses nearest-neighbour interpolation; every other entry uses bilinear.
        """
        H, W = sample['img'].shape[1:]
        # scale the image so that its shorter edge equals size[0]
        scale_factor = self.size[0] / min(H, W)
        nH, nW = round(H*scale_factor), round(W*scale_factor)
        self._resize_all(sample, (nH, nW))
        # make the image divisible by stride: round each side up to the next multiple of 32
        alignH, alignW = int(math.ceil(nH / 32)) * 32, int(math.ceil(nW / 32)) * 32
        self._resize_all(sample, (alignH, alignW))
        return sample

    @staticmethod
    def _resize_all(sample: dict, hw: Tuple[int, int]) -> None:
        """Resize every entry of ``sample`` to ``hw`` (nearest for 'mask', bilinear otherwise)."""
        for k, v in sample.items():
            mode = TF.InterpolationMode.NEAREST if k == 'mask' else TF.InterpolationMode.BILINEAR
            sample[k] = TF.resize(v, hw, mode)
class RandomResizedCrop:
    """Randomly rescale all entries of a sample, then crop (and, if needed, pad) to a fixed size."""

    def __init__(self, size: Union[int, Tuple[int], List[int]], scale: Tuple[float, float] = (0.5, 2.0), seg_fill: int = 0) -> None:
        """
        Args:
            size: output size as (height, width); an int is expanded to a square (size, size).
            scale: (low, high) bounds of the random scale ratio drawn on every call.
            seg_fill: value used to pad the 'mask' entry when the rescaled sample is smaller than ``size``.
        """
        self.size = (size, size) if isinstance(size, int) else size
        self.scale = scale
        self.seg_fill = seg_fill

    def __call__(self, sample: dict) -> dict:
        """Transform the sample in place and return it.

        All entries share the same scale factor, crop window and padding; 'mask' is resized
        with nearest-neighbour interpolation and padded with ``seg_fill``, every other entry
        is resized bilinearly and padded with 0.
        """
        H, W = sample['img'].shape[1:]
        tH, tW = self.size

        # get the scale: ratio is drawn uniformly from [scale[0], scale[1])
        ratio = random.random() * (self.scale[1] - self.scale[0]) + self.scale[0]
        # size budget for the resize; the second entry is 4x the target width
        scale = int(tH*ratio), int(tW*4*ratio)

        # scale the image, keeping the aspect ratio: the long edge fits max(scale)
        # and the short edge fits min(scale)
        scale_factor = min(max(scale)/max(H, W), min(scale)/min(H, W))
        nH, nW = int(H * scale_factor + 0.5), int(W * scale_factor + 0.5)
        for k, v in sample.items():
            mode = TF.InterpolationMode.NEAREST if k == 'mask' else TF.InterpolationMode.BILINEAR
            sample[k] = TF.resize(v, (nH, nW), mode)

        # random crop; random.randint includes BOTH bounds, so the upper bound is the
        # margin itself (margin + 1 would start the window one pixel too far)
        margin_h = max(sample['img'].shape[1] - tH, 0)
        margin_w = max(sample['img'].shape[2] - tW, 0)
        y1 = random.randint(0, margin_h)
        x1 = random.randint(0, margin_w)
        for k, v in sample.items():
            sample[k] = v[:, y1:y1 + tH, x1:x1 + tW]

        # pad the bottom/right only when the crop is really smaller than the target;
        # comparing sizes numerically also works when ``self.size`` is a list
        pad_h = tH - sample['img'].shape[1]
        pad_w = tW - sample['img'].shape[2]
        if pad_h > 0 or pad_w > 0:
            padding = [0, 0, pad_w, pad_h]
            for k, v in sample.items():
                fill = self.seg_fill if k == 'mask' else 0
                sample[k] = TF.pad(v, padding, fill=fill)
        return sample
def get_train_augmentation(size: Union[int, Tuple[int], List[int]], seg_fill: int = 0):
    """Build the multi-modal training pipeline.

    Order: color jitter (p=0.2), horizontal flip (p=0.5), Gaussian blur with a 3x3 kernel
    (p=0.2), random resized crop to ``size``, then normalization.

    Args:
        size: crop size forwarded to ``RandomResizedCrop``.
        seg_fill: mask padding value forwarded to ``RandomResizedCrop``.

    Returns:
        A ``Compose`` wrapping the five transforms.
    """
    steps = [
        RandomColorJitter(p=0.2),
        RandomHorizontalFlip(p=0.5),
        RandomGaussianBlur((3, 3), p=0.2),
        RandomResizedCrop(size, scale=(0.5, 2.0), seg_fill=seg_fill),
        Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]
    return Compose(steps)
def get_val_augmentation(size: Union[int, Tuple[int], List[int]]):
    """Build the multi-modal validation pipeline: deterministic resize, then normalization.

    Args:
        size: target size forwarded to ``Resize``.

    Returns:
        A ``Compose`` of ``Resize`` and ``Normalize``.
    """
    resize = Resize(size)
    normalize = Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    return Compose([resize, normalize])
if __name__ == '__main__':
    # Smoke test: run a random four-modality sample plus mask through a small pipeline
    # and print the resulting shape of every entry.
    h, w = 230, 420
    sample = {name: torch.randn(3, h, w) for name in ('img', 'depth', 'lidar', 'event')}
    sample['mask'] = torch.randn(1, h, w)
    pipeline = [
        RandomHorizontalFlip(p=0.5),
        RandomResizedCrop((512, 512)),
        Resize((224, 224)),
        Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]
    aug = Compose(pipeline)
    sample = aug(sample)
    for name in sample:
        print(name, sample[name].shape)
================================================
FILE: semseg/datasets/__init__.py
================================================
from .deliver import DELIVER
from .kitti360 import KITTI360
from .nyu import NYU
from .mfnet import MFNet
from .urbanlf import UrbanLF
from .mcubes import MCubeS
# Dataset classes imported above that make up this package's public API
# (the names exported by a star-import of the package).
__all__ = [
    'DELIVER',
    'KITTI360',
    'NYU',
    'MFNet',
    'UrbanLF',
    'MCubeS'
]
================================================
FILE: semseg/datasets/deliver.py
================================================
import os
import torch
import numpy as np
from torch import Tensor
from torch.utils.data import Dataset
import torchvision.transforms.functional as TF
from torchvision import io
from pathlib import Path
from typing import Tuple
import glob
import einops
from torch.utils.data import DataLoader
from torch.utils.data import DistributedSampler, RandomSampler
from semseg.augmentations_mm import get_train_augmentation
class DELIVER(Dataset):
    """DELIVER multi-modal semantic segmentation dataset.

    num_classes: 25

    Files are discovered under ``<root>/img/*/<split>/*/*.png``; the paths of the other
    modalities and of the label are derived from each RGB path by string replacement.
    """
    CLASSES = ["Building", "Fence", "Other", "Pedestrian", "Pole", "RoadLine", "Road", "SideWalk", "Vegetation",
               "Cars", "Wall", "TrafficSign", "Sky", "Ground", "Bridge", "RailTrack", "GroundRail",
               "TrafficLight", "Static", "Dynamic", "Water", "Terrain", "TwoWheeler", "Bus", "Truck"]

    PALETTE = torch.tensor([[70, 70, 70],
                            [100, 40, 40],
                            [55, 90, 80],
                            [220, 20, 60],
                            [153, 153, 153],
                            [157, 234, 50],
                            [128, 64, 128],
                            [244, 35, 232],
                            [107, 142, 35],
                            [0, 0, 142],
                            [102, 102, 156],
                            [220, 220, 0],
                            [70, 130, 180],
                            [81, 0, 81],
                            [150, 100, 100],
                            [230, 150, 140],
                            [180, 165, 180],
                            [250, 170, 30],
                            [110, 190, 160],
                            [170, 120, 50],
                            [45, 60, 150],
                            [145, 170, 100],
                            [0, 0, 230],
                            [0, 60, 100],
                            [0, 0, 70],
                            ])

    def __init__(self, root: str = 'data/DELIVER', split: str = 'train', transform = None, modals = ['img'], case = None) -> None:
        """
        Args:
            root: dataset root directory.
            split: one of 'train', 'val', 'test'.
            transform: optional callable applied to the sample dict (which includes 'mask').
            modals: keys returned by ``__getitem__``, in this order.
            case: optional case name; only files whose path contains it are kept.

        Raises:
            FileNotFoundError: if no image matches the split (and case).
        """
        super().__init__()
        assert split in ['train', 'val', 'test']
        self.transform = transform
        self.n_classes = len(self.CLASSES)
        self.ignore_label = 255
        self.modals = modals
        img_path = os.path.join(root, 'img')
        self.files = sorted(glob.glob(os.path.join(img_path, '*', split, '*', '*.png')))
        # --- split as case
        if case is not None:
            assert case in ['cloud', 'fog', 'night', 'rain', 'sun', 'motionblur', 'overexposure', 'underexposure', 'lidarjitter', 'eventlowres'], "Case name not available."
            # plain substring match against the whole file path
            self.files = [f for f in self.files if case in f]
        if not self.files:
            # ``img_path`` used to be undefined here, so this path raised NameError
            # instead of reporting the missing data.
            raise FileNotFoundError(f"No images found in {img_path}")
        print(f"Found {len(self.files)} {split} {case} images.")

    def __len__(self) -> int:
        return len(self.files)

    def __getitem__(self, index: int) -> Tuple[list, Tensor]:
        """Load one sample.

        Returns:
            ``(sample, label)`` where ``sample`` is a list with one entry per name in
            ``self.modals`` and ``label`` is the label map converted to ``long``.
        """
        rgb = str(self.files[index])
        # NOTE: str.replace rewrites every occurrence of the pattern in the path.
        x1 = rgb.replace('/img', '/hha').replace('_rgb', '_depth')
        x2 = rgb.replace('/img', '/lidar').replace('_rgb', '_lidar')
        x3 = rgb.replace('/img', '/event').replace('_rgb', '_event')
        lbl_path = rgb.replace('/img', '/semantic').replace('_rgb', '_semantic')

        sample = {}
        sample['img'] = io.read_image(rgb)[:3, ...]
        H, W = sample['img'].shape[1:]
        if 'depth' in self.modals:
            sample['depth'] = self._open_img(x1)
        if 'lidar' in self.modals:
            sample['lidar'] = self._open_img(x2)
        if 'event' in self.modals:
            eimg = self._open_img(x3)
            # bring the event image to the RGB resolution
            sample['event'] = TF.resize(eimg, (H, W), TF.InterpolationMode.NEAREST)
        label = io.read_image(lbl_path)[0,...].unsqueeze(0)
        # Shift ids down by one. Assumes read_image returns uint8 so that 0 - 1 wraps to 255:
        # both 255 and 0 then end up as 255 (``ignore_label``) — TODO confirm.
        label[label==255] = 0
        label -= 1
        sample['mask'] = label

        if self.transform:
            sample = self.transform(sample)
        label = sample['mask']
        del sample['mask']
        label = self.encode(label.squeeze().numpy()).long()
        sample = [sample[k] for k in self.modals]
        return sample, label

    def _open_img(self, file):
        """Read an image and return 3 channels: drop a 4th channel, or repeat a single one."""
        img = io.read_image(file)
        C, H, W = img.shape
        if C == 4:
            img = img[:3, ...]
        if C == 1:
            img = img.repeat(3, 1, 1)
        return img

    def encode(self, label: np.ndarray) -> Tensor:
        """Convert a numpy label array to a tensor (no id remapping is done here)."""
        return torch.from_numpy(label)
if __name__ == '__main__':
    # Smoke test: for every case, iterate the 'val' split once and print the label ids
    # that occur in each batch.
    cases = ['cloud', 'fog', 'night', 'rain', 'sun', 'motionblur', 'overexposure', 'underexposure', 'lidarjitter', 'eventlowres']
    traintransform = get_train_augmentation((1024, 1024), seg_fill=255)
    loader_kwargs = dict(batch_size=2, num_workers=2, drop_last=False, pin_memory=False)
    for case in cases:
        trainset = DELIVER(transform=traintransform, split='val', case=case)
        trainloader = DataLoader(trainset, **loader_kwargs)
        for sample, lbl in trainloader:
            print(torch.unique(lbl))
================================================
FILE: semseg/datasets/kitti360.py
================================================
import os
import torch
import numpy as np
from torch import Tensor
from torch.utils.data import Dataset
from torchvision import io
from pathlib import Path
from typing import Tuple
import glob
import einops
from torch.utils.data import DataLoader
from torch.utils.data import DistributedSampler, RandomSampler
from semseg.augmentations_mm import get_train_augmentation
class KITTI360(Dataset):
    """KITTI-360 multi-modal semantic segmentation dataset.

    num_classes: 19

    File names are read from ``<root>/<split>.txt``; raw label ids are mapped to train ids
    through ``ID2TRAINID``.
    """
    CLASSES = ['road', 'sidewalk', 'building', 'wall', 'fence', 'pole', 'traffic light', 'traffic sign', 'vegetation',
               'terrain', 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle']

    PALETTE = torch.tensor([[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0], [107, 142, 35],
                            [152, 251, 152], [70, 130, 180], [220, 20, 60], [255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100], [0, 80, 100], [0, 0, 230], [119, 11, 32]])

    ID2TRAINID = {0:255, 1:255, 2:255, 3:255, 4:255, 5:255, 6:255, 7:0, 8:1, 9:255, 10:255, 11:2, 12:3, 13:4, 14:255, 15:255, 16:255, 17:5, 18:255, 19:6,
                  20:7, 21:8, 22:9, 23:10, 24:11, 25:12, 26:13, 27:14, 28:15, 29:255, 30:255, 31:16, 32:17, 33:18, 34:2, 35:4, 36:255, 37:5, 38:255, 39:255, 40:255, 41:255, 42:255, 43:255, 44:255, -1:255}

    def __init__(self, root: str = 'data/KITTI360', split: str = 'train', transform = None, modals = ['img', 'depth', 'event', 'lidar'], case = None) -> None:
        """
        Args:
            root: dataset root directory containing ``<split>.txt``.
            split: 'train' or 'val'.
            transform: optional callable applied to the sample dict (which includes 'mask').
            modals: keys returned by ``__getitem__``, in this order.
            case: accepted but not used by this dataset.

        Raises:
            FileNotFoundError: if the split list is missing or contains no entries.
        """
        super().__init__()
        assert split in ['train', 'val']
        self.root = root
        self.transform = transform
        self.n_classes = len(self.CLASSES)
        self.ignore_label = 255
        self.modals = modals
        # lookup table raw id -> train id; ids without a mapping keep their own value.
        # The -1 key addresses the last slot (255) of the table.
        self.label_map = np.arange(256)
        for raw_id, trainid in self.ID2TRAINID.items():
            self.label_map[raw_id] = trainid

        self.files = self._get_file_names(split)
        if not self.files:
            # the message used to reference an undefined ``img_path`` (NameError)
            list_path = os.path.join(self.root, '{}.txt'.format(split))
            raise FileNotFoundError(f"No images found in {list_path}")
        print(f"Found {len(self.files)} {split} images.")

    def __len__(self) -> int:
        return len(self.files)

    def __getitem__(self, index: int) -> Tuple[list, Tensor]:
        """Load one sample.

        Returns:
            ``(sample, label)`` where ``sample`` is a list with one entry per name in
            ``self.modals`` and ``label`` holds train ids as ``long``.
        """
        item_name = str(self.files[index])
        rgb = os.path.join(self.root, item_name)
        # sibling modality paths are derived from the raw-image path
        x1 = os.path.join(self.root, item_name.replace('data_2d_raw', 'data_2d_hha'))
        x2 = os.path.join(self.root, item_name.replace('data_2d_raw', 'data_2d_lidar'))
        x2 = x2.replace('.png', '_color.png')
        x3 = os.path.join(self.root, item_name.replace('data_2d_raw', 'data_2d_event'))
        x3 = x3.replace('/image_00/data_rect/', '/').replace('.png', '_event_image.png')
        lbl_path = os.path.join(*[self.root, item_name.replace('data_2d_raw', 'data_2d_semantics/train').replace('data_rect', 'semantic')])

        sample = {}
        sample['img'] = io.read_image(rgb)[:3, ...]
        if 'depth' in self.modals:
            sample['depth'] = self._open_img(x1)
        if 'lidar' in self.modals:
            sample['lidar'] = self._open_img(x2)
        if 'event' in self.modals:
            sample['event'] = self._open_img(x3)
        label = io.read_image(lbl_path)[0,...].unsqueeze(0)
        sample['mask'] = label

        if self.transform:
            sample = self.transform(sample)
        label = sample['mask']
        del sample['mask']
        label = self.encode(label.squeeze().numpy()).long()
        sample = [sample[k] for k in self.modals]
        return sample, label

    def _open_img(self, file):
        """Read an image and return 3 channels: drop a 4th channel, or repeat a single one."""
        img = io.read_image(file)
        C, H, W = img.shape
        if C == 4:
            img = img[:3, ...]
        if C == 1:
            img = img.repeat(3, 1, 1)
        return img

    def encode(self, label: np.ndarray) -> Tensor:
        """Map raw label ids to train ids through ``label_map`` and return a tensor."""
        label = self.label_map[label]
        return torch.from_numpy(label)

    def _get_file_names(self, split_name):
        """Return the entries of ``<root>/<split_name>.txt``.

        Each line is stripped; when a line contains spaces only its first field is kept.
        """
        assert split_name in ['train', 'val']
        source = os.path.join(self.root, '{}.txt'.format(split_name))
        file_names = []
        with open(source) as f:
            for item in f:
                file_name = item.strip()
                if ' ' in file_name:
                    # --- KITTI-360
                    file_name = file_name.split(' ')[0]
                file_names.append(file_name)
        return file_names
if __name__ == '__main__':
    # Smoke test: iterate the training split once and print the label ids of every batch.
    traintransform = get_train_augmentation((376, 1408), seg_fill=255)
    trainset = KITTI360(transform=traintransform)
    loader_kwargs = dict(batch_size=2, num_workers=2, drop_last=True, pin_memory=False)
    trainloader = DataLoader(trainset, **loader_kwargs)
    for sample, lbl in trainloader:
        print(torch.unique(lbl))
================================================
FILE: semseg/datasets/mcubes.py
================================================
import os
import torch
import numpy as np
from torch import Tensor
from torch.utils.data import Dataset
from torchvision import io
from torchvision import transforms
from pathlib import Path
from typing import Tuple
import glob
import einops
from torch.utils.data import DataLoader
from torch.utils.data import DistributedSampler, RandomSampler
from semseg.augmentations_mm import get_train_augmentation
import cv2
import random
from PIL import Image, ImageOps, ImageFilter
class MCubeS(Dataset):
    """MCubeS multi-modal material segmentation dataset.

    num_classes: 20

    Item names are read from ``<root>/list_folder/train.txt`` (split 'train') or
    ``<root>/list_folder/test.txt`` (split 'val').
    """
    CLASSES = ['asphalt','concrete','metal','road_marking','fabric','glass','plaster','plastic','rubber','sand',
               'gravel','ceramic','cobblestone','brick','grass','wood','leaf','water','human','sky',]

    PALETTE = torch.tensor([[ 44, 160, 44],
                            [ 31, 119, 180],
                            [255, 127, 14],
                            [214, 39, 40],
                            [140, 86, 75],
                            [127, 127, 127],
                            [188, 189, 34],
                            [255, 152, 150],
                            [ 23, 190, 207],
                            [174, 199, 232],
                            [196, 156, 148],
                            [197, 176, 213],
                            [247, 182, 210],
                            [199, 199, 199],
                            [219, 219, 141],
                            [158, 218, 229],
                            [ 57, 59, 121],
                            [107, 110, 207],
                            [156, 158, 222],
                            [ 99, 121, 57]])

    def __init__(self, root: str = 'data/MCubeS/multimodal_dataset', split: str = 'train', transform = None, modals = ['image', 'aolp', 'dolp', 'nir'], case = None) -> None:
        """
        Args:
            root: dataset root directory.
            split: 'train' or 'val'.
            transform: stored on the instance; ``__getitem__`` uses the built-in
                ``transform_tr`` / ``transform_val`` pipelines instead.
            modals: keys returned by ``__getitem__``, in this order.
            case: accepted but not used by this dataset.

        Raises:
            FileNotFoundError: if the split list is missing or contains no entries.
        """
        super().__init__()
        assert split in ['train', 'val']
        self.split = split
        self.root = root
        self.transform = transform
        self.n_classes = len(self.CLASSES)
        self.ignore_label = 255
        self.modals = modals
        # number of columns cut from the left of every loaded array in __getitem__
        self._left_offset = 192

        # pixel-coordinate maps centred on the image and scaled by the longer side
        self.img_h = 1024
        self.img_w = 1224
        max_dim = max(self.img_h, self.img_w)
        u_vec = (np.arange(self.img_w)-self.img_w/2)/max_dim*2
        v_vec = (np.arange(self.img_h)-self.img_h/2)/max_dim*2
        self.u_map, self.v_map = np.meshgrid(u_vec, v_vec)
        # NOTE(review): this keeps the FIRST ``_left_offset`` columns of u_map, while the
        # images drop them and v_map is not cut at all — confirm this is intended.
        self.u_map = self.u_map[:,:self._left_offset]

        self.base_size = 512
        self.crop_size = 512
        self.files = self._get_file_names(split)
        if not self.files:
            # the message used to reference an undefined ``img_path`` (NameError)
            list_name = 'list_folder/test.txt' if split == 'val' else 'list_folder/train.txt'
            list_path = os.path.join(self.root, list_name)
            raise FileNotFoundError(f"No images found in {list_path}")
        print(f"Found {len(self.files)} {split} images.")

    def __len__(self) -> int:
        return len(self.files)

    def __getitem__(self, index: int) -> Tuple[list, Tensor]:
        """Load one item, apply the split's transform pipeline and return ``(sample, label)``.

        ``sample`` is a list with one entry per name in ``self.modals``; ``label`` is the
        transformed 'label' entry converted to ``long``.
        """
        item_name = str(self.files[index])
        rgb = os.path.join(*[self.root, 'polL_color', item_name+'.png'])
        x1 = os.path.join(*[self.root, 'polL_aolp_sin', item_name+'.npy'])
        x1_1 = os.path.join(*[self.root, 'polL_aolp_cos', item_name+'.npy'])
        x2 = os.path.join(*[self.root, 'polL_dolp', item_name+'.npy'])
        x3 = os.path.join(*[self.root, 'NIR_warped', item_name+'.png'])
        lbl_path = os.path.join(*[self.root, 'GT', item_name+'.png'])
        nir_mask = os.path.join(*[self.root, 'NIR_warped_mask', item_name+'.png'])
        ss_path = os.path.join(*[self.root, 'SS', item_name+'.png'])

        # color image: reverse the channel order, then scale to [0, 1] for 16- or 8-bit files
        _img = cv2.imread(rgb,-1)[:,:,::-1]
        _img = _img.astype(np.float32)/65535 if _img.dtype==np.uint16 else _img.astype(np.float32)/255
        _target = cv2.imread(lbl_path,-1)
        _mask = cv2.imread(ss_path,-1)
        # AoLP is stored as separate sin/cos arrays; stack (sin, cos, sin) into 3 channels
        _aolp_sin = np.load(x1)
        _aolp_cos = np.load(x1_1)
        _aolp = np.stack([_aolp_sin, _aolp_cos, _aolp_sin], axis=2) # H x W x 3
        dolp = np.load(x2)
        _dolp = np.stack([dolp, dolp, dolp], axis=2) # H x W x 3
        nir = cv2.imread(x3,-1)
        nir = nir.astype(np.float32)/65535 if nir.dtype==np.uint16 else nir.astype(np.float32)/255
        _nir = np.stack([nir, nir, nir], axis=2) # H x W x 3
        _nir_mask = cv2.imread(nir_mask,0)

        # drop the first ``_left_offset`` columns of every per-item array
        left = self._left_offset
        _img, _target, _aolp, _dolp = _img[:,left:], _target[:,left:], _aolp[:,left:], _dolp[:,left:]
        _nir, _nir_mask, _mask = _nir[:,left:], _nir_mask[:,left:], _mask[:,left:]

        sample = {'image': _img, 'label': _target, 'aolp': _aolp, 'dolp': _dolp, 'nir': _nir, 'nir_mask': _nir_mask, 'u_map': self.u_map, 'v_map': self.v_map, 'mask':_mask}

        if self.split == "train":
            sample = self.transform_tr(sample)
        elif self.split == 'val':
            sample = self.transform_val(sample)
        elif self.split == 'test':
            sample = self.transform_val(sample)
        else:
            raise NotImplementedError()
        label = sample['label'].long()
        sample = [sample[k] for k in self.modals]
        return sample, label

    def transform_tr(self, sample):
        """Training pipeline: flip, random scale/crop, blur, normalize, to tensor."""
        composed_transforms = transforms.Compose([
            RandomHorizontalFlip(),
            RandomScaleCrop(base_size=self.base_size, crop_size=self.crop_size, fill=255),
            RandomGaussianBlur(),
            Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
            ToTensor()])
        return composed_transforms(sample)

    def transform_val(self, sample):
        """Validation pipeline: fixed-scale center crop (1024), normalize, to tensor."""
        composed_transforms = transforms.Compose([
            FixScaleCrop(crop_size=1024),
            Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
            ToTensor()])
        return composed_transforms(sample)

    def _get_file_names(self, split_name):
        """Return the item names listed for ``split_name``.

        Each line is stripped; when a line contains spaces only its first field is kept.
        """
        assert split_name in ['train', 'val']
        source = os.path.join(self.root, 'list_folder/test.txt') if split_name == 'val' else os.path.join(self.root, 'list_folder/train.txt')
        file_names = []
        with open(source) as f:
            for item in f:
                file_name = item.strip()
                if ' ' in file_name:
                    # --- KITTI-360
                    file_name = file_name.split(' ')[0]
                file_names.append(file_name)
        return file_names
class Normalize(object):
    """Standardize the 'image' entry with a per-channel mean and standard deviation.

    'label' and 'nir' are only converted to float32 arrays; all other entries are
    passed through unchanged.

    Args:
        mean (tuple): means for each channel.
        std (tuple): standard deviations for each channel.
    """

    def __init__(self, mean=(0., 0., 0.), std=(1., 1., 1.)):
        self.mean = mean
        self.std = std

    def __call__(self, sample):
        image = np.array(sample['image']).astype(np.float32)
        label = np.array(sample['label']).astype(np.float32)
        # in-place arithmetic keeps the result float32
        image -= self.mean
        image /= self.std
        # nir is converted but not rescaled
        nir = np.array(sample['nir']).astype(np.float32)

        result = {'image': image, 'label': label}
        result['aolp'] = sample['aolp']
        result['dolp'] = sample['dolp']
        result['nir'] = nir
        for key in ('nir_mask', 'u_map', 'v_map', 'mask'):
            result[key] = sample[key]
        return result
class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        """Return a new dict with every entry converted to a float32 tensor.

        'image', 'aolp', 'dolp' and 'nir' are moved from H x W x C (numpy layout) to
        C x H x W (torch layout); the remaining entries keep their layout.
        """
        def as_chw(array):
            channels_first = np.array(array).astype(np.float32).transpose((2, 0, 1))
            return torch.from_numpy(channels_first).float()

        def as_plane(array):
            return torch.from_numpy(np.array(array).astype(np.float32)).float()

        def as_coord(array):
            # the coordinate maps are cast directly, without an np.array() wrapper
            return torch.from_numpy(array.astype(np.float32)).float()

        return {'image': as_chw(sample['image']),
                'label': as_plane(sample['label']),
                'aolp' : as_chw(sample['aolp']),
                'dolp' : as_chw(sample['dolp']),
                'nir' : as_chw(sample['nir']),
                'nir_mask' : as_plane(sample['nir_mask']),
                'u_map': as_coord(sample['u_map']),
                'v_map': as_coord(sample['v_map']),
                'mask': as_plane(sample['mask'])}
class RandomHorizontalFlip(object):
    """Mirror all spatial entries of the sample left-right with probability 0.5."""

    def __call__(self, sample):
        """Return a new dict; on a flip every entry except 'v_map' has its columns reversed."""
        keys = ('image', 'label', 'aolp', 'dolp', 'nir', 'nir_mask', 'u_map', 'v_map', 'mask')
        result = {key: sample[key] for key in keys}
        if random.random() < 0.5:
            for key in keys:
                # v_map is the only entry the flip leaves as is
                if key != 'v_map':
                    result[key] = result[key][:,::-1]
        return result
class RandomGaussianBlur(object):
    """Blur 'image' and 'nir' with the same random sigma with probability 0.5."""

    def __call__(self, sample):
        """Return a new dict; only 'image' and 'nir' may change, the rest is passed through."""
        image, nir = sample['image'], sample['nir']
        label = sample['label']
        if random.random() < 0.5:
            # sigma in [0, 1); a (0,0) kernel size lets OpenCV derive the kernel from sigma
            sigma = random.random()
            image = cv2.GaussianBlur(image, (0,0), sigma)
            nir = cv2.GaussianBlur(nir, (0,0), sigma)

        result = {'image': image, 'label': label}
        result['aolp'] = sample['aolp']
        result['dolp'] = sample['dolp']
        result['nir'] = nir
        for key in ('nir_mask', 'u_map', 'v_map', 'mask'):
            result[key] = sample[key]
        return result
class RandomScaleCrop(object):
    """Randomly rescale every entry of a sample, pad it if needed, and take a square crop.

    The short side is resized to a random length between 0.5x and 2.0x ``base_size``;
    if that length is below ``crop_size`` the arrays are padded at the bottom/right, and
    finally a ``crop_size`` x ``crop_size`` window is cut at a random offset.
    """
    def __init__(self, base_size, crop_size, fill=255):
        # base_size: reference length for the random short-side size
        # crop_size: side length of the square output crop
        # fill: value used to pad the 'label' entry (all other entries are zero-padded)
        self.base_size = base_size
        self.crop_size = crop_size
        self.fill = fill
    def __call__(self, sample):
        img = sample['image']
        mask = sample['label']
        aolp = sample['aolp']
        dolp = sample['dolp']
        nir = sample['nir']
        nir_mask = sample['nir_mask']
        SS=sample['mask']
        # random scale (short edge); random.randint includes both bounds
        short_size = random.randint(int(self.base_size * 0.5), int(self.base_size * 2.0))
        # w, h = img.size
        h, w = img.shape[:2]
        # resize so that the shorter side becomes short_size, keeping the aspect ratio
        if h > w:
            ow = short_size
            oh = int(1.0 * h * ow / w)
        else:
            oh = short_size
            ow = int(1.0 * w * oh / h)
        # pad crop: amount of padding needed to reach crop_size (only defined when
        # short_size < crop_size, the same condition that guards the padding below)
        if short_size < self.crop_size:
            padh = self.crop_size - oh if oh < self.crop_size else 0
            padw = self.crop_size - ow if ow < self.crop_size else 0
        # random crop crop_size
        # w, h = img.size
        h, w = img.shape[:2]
        # x1 = random.randint(0, w - self.crop_size)
        # y1 = random.randint(0, h - self.crop_size)
        # crop offset drawn from the resized (pre-padding) size; 0 when the resized
        # side is not larger than crop_size
        x1 = random.randint(0, max(0, ow - self.crop_size))
        y1 = random.randint(0, max(0, oh - self.crop_size))
        u_map = sample['u_map']
        v_map = sample['v_map']
        # cv2.resize takes the target size as (width, height). The calls without an
        # explicit ``interpolation`` use OpenCV's default (presumably bilinear).
        # NOTE(review): SS is resized with that default too — if it holds class ids,
        # they would be blended; confirm.
        u_map = cv2.resize(u_map,(ow,oh))
        v_map = cv2.resize(v_map,(ow,oh))
        aolp = cv2.resize(aolp ,(ow,oh))
        dolp = cv2.resize(dolp ,(ow,oh))
        SS = cv2.resize(SS ,(ow,oh))
        img = cv2.resize(img ,(ow,oh), interpolation=cv2.INTER_LINEAR)
        mask = cv2.resize(mask ,(ow,oh), interpolation=cv2.INTER_NEAREST)
        nir = cv2.resize(nir ,(ow,oh), interpolation=cv2.INTER_LINEAR)
        nir_mask = cv2.resize(nir_mask ,(ow,oh), interpolation=cv2.INTER_NEAREST)
        # pad at the bottom/right: each array is copied into the top-left corner of a
        # larger buffer (zeros everywhere, except the label buffer which is ``self.fill``)
        if short_size < self.crop_size:
            u_map_ = np.zeros((oh+padh,ow+padw))
            u_map_[:oh,:ow] = u_map
            u_map = u_map_
            v_map_ = np.zeros((oh+padh,ow+padw))
            v_map_[:oh,:ow] = v_map
            v_map = v_map_
            aolp_ = np.zeros((oh+padh,ow+padw,3))
            aolp_[:oh,:ow] = aolp
            aolp = aolp_
            dolp_ = np.zeros((oh+padh,ow+padw,3))
            dolp_[:oh,:ow] = dolp
            dolp = dolp_
            img_ = np.zeros((oh+padh,ow+padw,3))
            img_[:oh,:ow] = img
            img = img_
            SS_ = np.zeros((oh+padh,ow+padw))
            SS_[:oh,:ow] = SS
            SS = SS_
            mask_ = np.full((oh+padh,ow+padw),self.fill)
            mask_[:oh,:ow] = mask
            mask = mask_
            nir_ = np.zeros((oh+padh,ow+padw,3))
            nir_[:oh,:ow] = nir
            nir = nir_
            nir_mask_ = np.zeros((oh+padh,ow+padw))
            nir_mask_[:oh,:ow] = nir_mask
            nir_mask = nir_mask_
        # cut the same crop_size x crop_size window out of every array
        u_map = u_map[y1:y1+self.crop_size, x1:x1+self.crop_size]
        v_map = v_map[y1:y1+self.crop_size, x1:x1+self.crop_size]
        aolp = aolp[y1:y1+self.crop_size, x1:x1+self.crop_size]
        dolp = dolp[y1:y1+self.crop_size, x1:x1+self.crop_size]
        img = img[y1:y1+self.crop_size, x1:x1+self.crop_size]
        mask = mask[y1:y1+self.crop_size, x1:x1+self.crop_size]
        nir = nir[y1:y1+self.crop_size, x1:x1+self.crop_size]
        SS = SS[y1:y1+self.crop_size, x1:x1+self.crop_size]
        nir_mask = nir_mask[y1:y1+self.crop_size, x1:x1+self.crop_size]
        return {'image': img,
                'label': mask,
                'aolp' : aolp,
                'dolp' : dolp,
                'nir' : nir,
                'nir_mask' : nir_mask,
                'u_map': u_map,
                'v_map': v_map,
                'mask':SS}
class FixScaleCrop(object):
    """Resize so the shorter image side equals ``crop_size``, then centre-crop a square.

    The same geometry is applied to every entry of the sample dict
    ('image', 'label', 'aolp', 'dolp', 'nir', 'nir_mask', 'u_map', 'v_map', 'mask').
    """

    def __init__(self, crop_size):
        self.crop_size = crop_size

    def __call__(self, sample):
        size = self.crop_size
        image = sample['image']
        label = sample['label']
        aolp = sample['aolp']
        dolp = sample['dolp']
        nir = sample['nir']
        nir_mask = sample['nir_mask']
        seg = sample['mask']

        # Scale the shorter side to `size`, keeping the aspect ratio.
        height, width = image.shape[:2]
        if width > height:
            oh = size
            ow = int(1.0 * width * oh / height)
        else:
            ow = size
            oh = int(1.0 * height * ow / width)

        # Top-left corner of the centred crop window.
        x1 = int(round((ow - size) / 2.))
        y1 = int(round((oh - size) / 2.))
        rows = slice(y1, y1 + size)
        cols = slice(x1, x1 + size)

        u_map = sample['u_map']
        v_map = sample['v_map']

        def resized(arr, interpolation=None):
            # Without an explicit mode, cv2.resize falls back to its default interpolation.
            if interpolation is None:
                return cv2.resize(arr, (ow, oh))
            return cv2.resize(arr, (ow, oh), interpolation=interpolation)

        u_map = resized(u_map)
        v_map = resized(v_map)
        aolp = resized(aolp)
        dolp = resized(dolp)
        seg = resized(seg)
        image = resized(image, cv2.INTER_LINEAR)
        # Nearest-neighbour keeps label / mask values discrete.
        label = resized(label, cv2.INTER_NEAREST)
        nir = resized(nir, cv2.INTER_LINEAR)
        nir_mask = resized(nir_mask, cv2.INTER_NEAREST)

        return {'image': image[rows, cols],
                'label': label[rows, cols],
                'aolp' : aolp[rows, cols],
                'dolp' : dolp[rows, cols],
                'nir'  : nir[rows, cols],
                'nir_mask' : nir_mask[rows, cols],
                'u_map': u_map[rows, cols],
                'v_map': v_map[rows, cols],
                'mask': seg[rows, cols]}
if __name__ == '__main__':
    # Smoke test: iterate the validation split once and print the label ids of each batch.
    augment = get_train_augmentation((1024, 1224), seg_fill=255)
    dataset = MCubeS(transform=augment, split='val')
    loader = DataLoader(dataset, batch_size=1, num_workers=0, drop_last=False, pin_memory=False)
    for _, lbl in loader:
        print(torch.unique(lbl))
================================================
FILE: semseg/datasets/mfnet.py
================================================
import os
import torch
import numpy as np
from torch import Tensor
from torch.utils.data import Dataset
from torchvision import io
from pathlib import Path
from typing import Tuple
import glob
import einops
from torch.utils.data import DataLoader
from torch.utils.data import DistributedSampler, RandomSampler
from semseg.augmentations_mm import get_train_augmentation
class MFNet(Dataset):
    """MFNet dataset with RGB images and an optional thermal modality.

    num_classes: 9

    Samples are listed in ``<root>/train.txt`` (split 'train') or
    ``<root>/test.txt`` (split 'val'); images are read from the ``rgb``,
    ``ther`` and ``labels`` sub-directories of ``root``.
    """
    CLASSES = ['unlabeled', 'car', 'person', 'bike', 'curve', 'car_stop', 'guardrail', 'color_cone', 'bump']
    # NOTE(review): PALETTE holds 8 colours while CLASSES has 9 entries -- confirm which class colour is missing.
    PALETTE = torch.tensor([[64,0,128],[64,64,0],[0,128,192],[0,0,192],[128,128,0],[64,64,128],[192,128,128],[192,64,0]])

    def __init__(self, root: str = 'data/MFNet', split: str = 'train', transform = None, modals = ['img', 'thermal'], case = None) -> None:
        """Index the split file.

        Raises:
            FileNotFoundError: if the split file lists no samples.
        """
        super().__init__()
        assert split in ['train', 'val']
        self.root = root
        self.transform = transform
        self.n_classes = len(self.CLASSES)
        self.ignore_label = 255
        # Copy so the shared default list can never be mutated through an instance.
        self.modals = list(modals)
        self.files = self._get_file_names(split)
        if not self.files:
            # The original message referenced an undefined name and raised NameError instead.
            raise FileNotFoundError(f"No images found in {self.root}")
        print(f"Found {len(self.files)} {split} images.")

    def __len__(self) -> int:
        return len(self.files)

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """Return ``([one tensor per entry of self.modals], label)`` for sample ``index``."""
        item_name = str(self.files[index])
        rgb = os.path.join(self.root, 'rgb', item_name + '.jpg')
        x1 = os.path.join(self.root, 'ther', item_name + '.jpg')
        lbl_path = os.path.join(self.root, 'labels', item_name + '.png')

        sample = {}
        sample['img'] = io.read_image(rgb)[:3, ...]
        if 'thermal' in self.modals:
            sample['thermal'] = self._open_img(x1)
        # Only the first channel of the label image is used.
        label = io.read_image(lbl_path)[0, ...].unsqueeze(0)
        sample['mask'] = label

        if self.transform:
            sample = self.transform(sample)
        label = sample['mask']
        del sample['mask']
        label = self.encode(label.squeeze().numpy()).long()
        sample = [sample[k] for k in self.modals]
        return sample, label

    def _open_img(self, file):
        """Read an image and force it to three channels (drop alpha / repeat grey)."""
        img = io.read_image(file)
        C, H, W = img.shape
        if C == 4:
            img = img[:3, ...]
        if C == 1:
            img = img.repeat(3, 1, 1)
        return img

    def encode(self, label: np.ndarray) -> Tensor:
        """Convert a numpy label map to a tensor (label values are used as-is)."""
        return torch.from_numpy(label)

    def _get_file_names(self, split_name):
        """Return the sample names listed in the split file (first whitespace-separated token per line)."""
        assert split_name in ['train', 'val']
        list_name = 'test.txt' if split_name == 'val' else 'train.txt'
        source = os.path.join(self.root, list_name)
        file_names = []
        with open(source) as f:
            for item in f:
                file_name = item.strip()
                if not file_name:
                    # Blank lines would otherwise become empty sample names.
                    continue
                if ' ' in file_name:
                    file_name = file_name.split(' ')[0]
                file_names.append(file_name)
        return file_names
if __name__ == '__main__':
    # Smoke test: load the training split and print the label ids found in each batch.
    augment = get_train_augmentation((480, 640), seg_fill=255)
    dataset = MFNet(transform=augment)
    loader = DataLoader(dataset, batch_size=2, num_workers=2, drop_last=True, pin_memory=False)
    for _, lbl in loader:
        print(torch.unique(lbl))
================================================
FILE: semseg/datasets/nyu.py
================================================
import os
import torch
import numpy as np
from torch import Tensor
from torch.utils.data import Dataset
import torchvision.transforms.functional as TF
from torchvision import io
from pathlib import Path
from typing import Tuple
import glob
import einops
from torch.utils.data import DataLoader
from torch.utils.data import DistributedSampler, RandomSampler
from semseg.augmentations_mm import get_train_augmentation
class NYU(Dataset):
    """NYU Depth v2 dataset with RGB images and an optional HHA-encoded depth modality.

    num_classes: 40

    Samples are listed in ``<root>/train.txt`` (split 'train') or
    ``<root>/test.txt`` (split 'val'); images are read from the ``RGB``,
    ``HHA`` and ``Label`` sub-directories of ``root``.
    """
    CLASSES = ['wall','floor','cabinet','bed','chair','sofa','table','door','window','bookshelf','picture','counter','blinds',
               'desk','shelves','curtain','dresser','pillow','mirror','floor mat','clothes','ceiling','books','refridgerator',
               'television','paper','towel','shower curtain','box','whiteboard','person','night stand','toilet',
               'sink','lamp','bathtub','bag','otherstructure','otherfurniture','otherprop']
    PALETTE = None

    def __init__(self, root: str = 'data/NYUDepthv2', split: str = 'train', transform = None, modals = ['img', 'depth'], case = None) -> None:
        """Index the split file.

        Raises:
            FileNotFoundError: if the split file lists no samples.
        """
        super().__init__()
        assert split in ['train', 'val']
        self.root = root
        self.transform = transform
        self.n_classes = len(self.CLASSES)
        self.ignore_label = 255
        # Copy so the shared default list can never be mutated through an instance.
        self.modals = list(modals)
        self.files = self._get_file_names(split)
        if not self.files:
            # The original message referenced an undefined name and raised NameError instead.
            raise FileNotFoundError(f"No images found in {self.root}")
        print(f"Found {len(self.files)} {split} images.")

    def __len__(self) -> int:
        return len(self.files)

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """Return ``([one tensor per entry of self.modals], label)`` for sample ``index``."""
        item_name = str(self.files[index])
        rgb = os.path.join(self.root, 'RGB', item_name + '.jpg')
        x1 = os.path.join(self.root, 'HHA', item_name + '.jpg')
        lbl_path = os.path.join(self.root, 'Label', item_name + '.png')

        sample = {}
        sample['img'] = io.read_image(rgb)[:3, ...]
        if 'depth' in self.modals:
            sample['depth'] = self._open_img(x1)
        if 'lidar' in self.modals:
            raise NotImplementedError()
        if 'event' in self.modals:
            raise NotImplementedError()
        label = io.read_image(lbl_path)[0, ...].unsqueeze(0)
        # Shift class ids down by one. Presumably the label tensor is uint8, so the
        # former 0 / 255 values wrap around to 255 (the ignore label) -- TODO confirm.
        label[label == 255] = 0
        label -= 1
        sample['mask'] = label

        if self.transform:
            sample = self.transform(sample)
        label = sample['mask']
        del sample['mask']
        label = self.encode(label.squeeze().numpy()).long()
        sample = [sample[k] for k in self.modals]
        return sample, label

    def _open_img(self, file):
        """Read an image and force it to three channels (drop alpha / repeat grey)."""
        img = io.read_image(file)
        C, H, W = img.shape
        if C == 4:
            img = img[:3, ...]
        if C == 1:
            img = img.repeat(3, 1, 1)
        return img

    def encode(self, label: np.ndarray) -> Tensor:
        """Convert a numpy label map to a tensor (label values are used as-is)."""
        return torch.from_numpy(label)

    def _get_file_names(self, split_name):
        """Return the sample names listed in the split file (first whitespace-separated token per line)."""
        assert split_name in ['train', 'val']
        list_name = 'test.txt' if split_name == 'val' else 'train.txt'
        source = os.path.join(self.root, list_name)
        file_names = []
        with open(source) as f:
            for item in f:
                file_name = item.strip()
                if not file_name:
                    # Blank lines would otherwise become empty sample names.
                    continue
                if ' ' in file_name:
                    file_name = file_name.split(' ')[0]
                file_names.append(file_name)
        return file_names
if __name__ == '__main__':
    # Smoke test: load the validation split and print the label ids found in each batch.
    augment = get_train_augmentation((480, 640), seg_fill=255)
    dataset = NYU(transform=augment, split='val')
    loader = DataLoader(dataset, batch_size=2, num_workers=2, drop_last=True, pin_memory=False)
    for _, lbl in loader:
        print(torch.unique(lbl))
================================================
FILE: semseg/datasets/unzip.py
================================================
import zipfile
# Extract the archive member by member so that one corrupt entry is reported
# and skipped instead of aborting the whole extraction.
with zipfile.ZipFile("data/MCubeS/multimodal_dataset.zip", "r") as archive:
    for member in archive.namelist():
        try:
            archive.extract(member, "multimodal_dataset_extracted/")
        except zipfile.BadZipFile as err:
            print(err)
================================================
FILE: semseg/datasets/urbanlf.py
================================================
import os
import torch
import numpy as np
from torch import Tensor
from torch.utils.data import Dataset
import torchvision.transforms.functional as TF
from torchvision import io
from pathlib import Path
from typing import Tuple
import glob
import einops
from torch.utils.data import DataLoader
from torch.utils.data import DistributedSampler, RandomSampler
from semseg.augmentations_mm import get_train_augmentation
class UrbanLF(Dataset):
    """UrbanLF light-field dataset: centre view '5_5' plus optional sub-aperture views.

    num_classes: 14

    Every ``<root>/<split>/*/5_5.png`` is one sample. Entries of ``modals``
    other than 'img' name extra views 'i_j' (i, j in 1..9) stored next to it.
    """
    CLASSES = ['bike','building','fence','others','person','pole','road','sidewalk','traffic sign','vegetation','vehicle','bridge','rider','sky']
    PALETTE = [[168,198,168],[198,0,0],[202,154,198],[0,0,0],[100,198,198],[198,100,0],[52,42,198],[154,52,192],[198,0,168],[0,198,0],[198,186,90],[108,107,161],[156,200,26],[158,179,202]]

    def __init__(self, root: str = 'data/UrBanLF/Syn', split: str = 'train', transform = None, modals = ['img', '5_1', '5_2', '5_3', '5_4', '5_6', '5_7', '5_8', '5_9'], case = None) -> None:
        """Collect the centre-view images of the split.

        Raises:
            FileNotFoundError: if no centre-view image is found.
        """
        super().__init__()
        assert split in ['train', 'val']
        self.root = root
        self.transform = transform
        self.n_classes = len(self.CLASSES)
        self.ignore_label = 255
        # Copy so the shared default list can never be mutated through an instance.
        self.modals = list(modals)
        self.files = sorted(glob.glob(os.path.join(root, split, '*', '5_5.png')))
        if not self.files:
            # The original message referenced an undefined name and raised NameError instead.
            raise FileNotFoundError(f"No images found in {os.path.join(root, split)}")
        print(f"Found {len(self.files)} {split} images.")

    def __len__(self) -> int:
        return len(self.files)

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        """Return ``([centre view, requested extra views...], label)`` for sample ``index``.

        Raises:
            NotImplementedError: if ``root`` contains neither 'real' nor 'Syn'.
        """
        item_name = str(self.files[index])
        rgb = item_name
        rgb_dir_name = os.path.dirname(rgb)

        # Requested views other than the centre one, in row-major (i, j) order.
        grid = (f'{i}_{j}' for i in range(1, 10) for j in range(1, 10))
        lf_names = [name for name in grid if name != '5_5' and name in self.modals]
        lf_paths = [os.path.join(rgb_dir_name, name + '.png') for name in lf_names]

        # The label lives next to the centre view. Build the path from the directory
        # instead of str.replace on the whole path, which would also rewrite a
        # '5_5' occurring in a directory name.
        is_real = 'real' in self.root
        is_syn = 'Syn' in self.root
        if is_real:
            lbl_path = os.path.join(rgb_dir_name, 'label.png')
        elif is_syn:
            lbl_path = os.path.join(rgb_dir_name, '5_5_label.npy')
        else:
            # `raise NotImplemented` is a TypeError: NotImplemented is not an exception.
            raise NotImplementedError(f"Unsupported dataset root: {self.root}")

        sample = {}
        sample['img'] = io.read_image(rgb)[:3, ...]
        if len(self.modals) > 1:
            for lf_name, lf_path in zip(lf_names, lf_paths):
                sample[lf_name] = self._open_img(lf_path)

        if is_real:
            # Real labels are colour images; map colours to class ids.
            label = io.read_image(lbl_path)
            label = self.encode(label.numpy())
        else:
            # Synthetic labels are stored as arrays; 255 is folded into 0 and ids shifted down by one.
            label = np.load(lbl_path)
            label[label == 255] = 0
            label -= 1
            label = torch.tensor(label[None, ...])
        sample['mask'] = label

        if self.transform:
            sample = self.transform(sample)
        label = sample['mask']
        del sample['mask']
        label = label.long().squeeze(0)
        sample_list = [sample['img']]
        sample_list += [sample[k] for k in lf_names]
        return sample_list, label

    def _open_img(self, file):
        """Read an image and force it to three channels (drop alpha / repeat grey)."""
        img = io.read_image(file)
        C, H, W = img.shape
        if C == 4:
            img = img[:3, ...]
        if C == 1:
            img = img.repeat(3, 1, 1)
        return img

    def encode(self, label: np.ndarray) -> Tensor:
        """Map a (C, H, W) colour label image to a (1, H, W) tensor of PALETTE indices.

        Pixels whose colour matches no PALETTE entry keep index 0.
        """
        label = label.transpose(1, 2, 0)  # C, H, W -> H, W, C
        label_mask = np.zeros((label.shape[0], label.shape[1]), dtype=np.int16)
        for ii, lb in enumerate(self.PALETTE):
            label_mask[np.where(np.all(label == lb, axis=-1))[:2]] = ii
        label_mask = label_mask[None, ...].astype(int)
        return torch.from_numpy(label_mask)
if __name__ == '__main__':
    # Smoke test: load the training split with one extra view and print the label ids per batch.
    augment = get_train_augmentation((432, 623), seg_fill=255)
    dataset = UrbanLF(transform=augment, modals=['img', '1_2'])
    loader = DataLoader(dataset, batch_size=2, num_workers=2, drop_last=True, pin_memory=False)
    for _, lbl in loader:
        print(torch.unique(lbl))
================================================
FILE: semseg/losses.py
================================================
import torch
from torch import nn, Tensor
from torch.nn import functional as F
class CrossEntropy(nn.Module):
    """Cross-entropy loss that also accepts a tuple of (main, auxiliary...) predictions.

    When ``preds`` is a tuple, the per-output losses are combined as a weighted
    sum using ``aux_weights`` (paired positionally; extra entries are ignored).
    """

    def __init__(self, ignore_label: int = 255, weight: Tensor = None, aux_weights: list = [1, 0.4, 0.4]) -> None:
        super().__init__()
        self.aux_weights = aux_weights
        self.criterion = nn.CrossEntropyLoss(weight=weight, ignore_index=ignore_label)

    def _forward(self, preds: Tensor, labels: Tensor) -> Tensor:
        # preds: [B, C, H, W]; labels: [B, H, W]
        return self.criterion(preds, labels)

    def forward(self, preds, labels: Tensor) -> Tensor:
        if not isinstance(preds, tuple):
            return self._forward(preds, labels)
        total = 0
        for pred, w in zip(preds, self.aux_weights):
            total = total + w * self._forward(pred, labels)
        return total
class OhemCrossEntropy(nn.Module):
    """Cross-entropy with online hard example mining.

    Only pixels whose loss exceeds ``-log(thresh)`` contribute. If fewer than
    ``n_min`` (1/16 of the non-ignored pixels) qualify, the ``n_min`` largest
    losses are used instead. A tuple of predictions is combined as a weighted
    sum using ``aux_weights``.
    """

    def __init__(self, ignore_label: int = 255, weight: Tensor = None, thresh: float = 0.7, aux_weights: list = [1, 1]) -> None:
        super().__init__()
        self.ignore_label = ignore_label
        self.aux_weights = aux_weights
        # A pixel is "hard" when the probability of its true class is below `thresh`.
        self.thresh = -torch.log(torch.tensor(thresh, dtype=torch.float))
        self.criterion = nn.CrossEntropyLoss(weight=weight, ignore_index=ignore_label, reduction='none')

    def _forward(self, preds: Tensor, labels: Tensor) -> Tensor:
        # preds: [B, C, H, W]; labels: [B, H, W]
        n_min = labels[labels != self.ignore_label].numel() // 16
        loss = self.criterion(preds, labels).view(-1)
        loss_hard = loss[loss > self.thresh]

        if loss_hard.numel() < n_min:
            loss_hard, _ = loss.topk(n_min)

        if loss_hard.numel() == 0:
            # No hard pixel and n_min == 0 (fewer than 16 valid pixels): the mean of an
            # empty tensor is NaN, so return a zero that stays attached to the graph.
            return loss.sum() * 0.0

        return torch.mean(loss_hard)

    def forward(self, preds, labels: Tensor) -> Tensor:
        if isinstance(preds, tuple):
            return sum([w * self._forward(pred, labels) for (pred, w) in zip(preds, self.aux_weights)])
        return self._forward(preds, labels)
class Dice(nn.Module):
    """Dice-style loss, averaged over classes and then over the batch.

    A tuple of predictions is combined as a weighted sum using ``aux_weights``.
    """

    def __init__(self, delta: float = 0.5, aux_weights: list = [1, 0.4, 0.4]):
        """
        delta: Controls weight given to FP and FN. This equals to dice score when delta=0.5
        """
        super().__init__()
        self.delta = delta
        self.aux_weights = aux_weights

    def _forward(self, preds: Tensor, labels: Tensor) -> Tensor:
        # preds: [B, C, H, W]; labels: [B, H, W]
        n_cls = preds.shape[1]
        onehot = F.one_hot(labels, n_cls).permute(0, 3, 1, 2)
        spatial = (2, 3)
        tp = (onehot * preds).sum(dim=spatial)
        fn = (onehot * (1 - preds)).sum(dim=spatial)
        fp = ((1 - onehot) * preds).sum(dim=spatial)

        # The 1e-6 terms keep the ratio defined for classes with no support.
        score = (tp + 1e-6) / (tp + self.delta * fn + (1 - self.delta) * fp + 1e-6)
        per_sample = torch.sum(1 - score, dim=-1) / n_cls
        return per_sample.mean()

    def forward(self, preds, targets: Tensor) -> Tensor:
        if not isinstance(preds, tuple):
            return self._forward(preds, targets)
        total = 0
        for pred, w in zip(preds, self.aux_weights):
            total = total + w * self._forward(pred, targets)
        return total
__all__ = ['CrossEntropy', 'OhemCrossEntropy', 'Dice']


def get_loss(loss_fn_name: str = 'CrossEntropy', ignore_label: int = 255, cls_weights: Tensor = None):
    """Build one of the loss modules listed in ``__all__`` by name.

    ``Dice`` is constructed with its defaults; the other losses receive
    ``ignore_label`` and ``cls_weights``.
    """
    assert loss_fn_name in __all__, f"Unavailable loss function name >> {loss_fn_name}.\nAvailable loss functions: {__all__}"
    if loss_fn_name == 'Dice':
        return Dice()
    # Explicit lookup instead of eval(): the name is never evaluated as code,
    # even when asserts are stripped with -O.
    loss_classes = {
        'CrossEntropy': CrossEntropy,
        'OhemCrossEntropy': OhemCrossEntropy,
    }
    return loss_classes[loss_fn_name](ignore_label, cls_weights)
if __name__ == '__main__':
    # Smoke test: evaluate the Dice loss on random inputs.
    rand_pred = torch.randint(0, 19, (2, 19, 480, 640), dtype=torch.float)
    rand_label = torch.randint(0, 19, (2, 480, 640), dtype=torch.long)
    criterion = Dice()
    print(criterion(rand_pred, rand_label))
================================================
FILE: semseg/metrics.py
================================================
import torch
from torch import Tensor
from typing import Tuple
class Metrics:
    """Accumulates a confusion matrix and derives IoU, F1 and per-class accuracy from it.

    ``hist[t, p]`` counts pixels of target class ``t`` predicted as class ``p``.
    Every ``compute_*`` method returns (per-class percentages, mean percentage),
    both rounded to two decimals; classes whose score is undefined count as 0.
    """

    def __init__(self, num_classes: int, ignore_label: int, device) -> None:
        self.ignore_label = ignore_label
        self.num_classes = num_classes
        self.hist = torch.zeros(num_classes, num_classes).to(device)

    def update(self, pred: Tensor, target: Tensor) -> None:
        """Add one batch; ``pred`` holds class scores along dim 1, ``target`` holds class ids."""
        n = self.num_classes
        pred = pred.argmax(dim=1)
        valid = target != self.ignore_label
        # Encode each (target, prediction) pair as one index into the flattened matrix.
        pair_index = target[valid] * n + pred[valid]
        self.hist += torch.bincount(pair_index, minlength=n**2).view(n, n)

    @staticmethod
    def _report(per_class: Tensor):
        """Zero NaN entries, then return (per-class list, mean) scaled to percent."""
        per_class[per_class.isnan()] = 0.
        mean = per_class.mean().item()
        per_class *= 100
        mean *= 100
        return per_class.cpu().numpy().round(2).tolist(), round(mean, 2)

    def compute_iou(self) -> Tuple[Tensor, Tensor]:
        diag = self.hist.diag()
        union = self.hist.sum(0) + self.hist.sum(1) - diag
        return self._report(diag / union)

    def compute_f1(self) -> Tuple[Tensor, Tensor]:
        support = self.hist.sum(0) + self.hist.sum(1)
        return self._report(2 * self.hist.diag() / support)

    def compute_pixel_acc(self) -> Tuple[Tensor, Tensor]:
        per_target = self.hist.sum(1)
        return self._report(self.hist.diag() / per_target)
================================================
FILE: semseg/models/__init__.py
================================================
# Re-export the model classes from the sibling modules as this package's public API.
from .cmx import CMX
from .cmnext import CMNeXt

__all__ = [
    'CMX',
    'CMNeXt',
]
================================================
FILE: semseg/models/backbones/__init__.py
================================================
# Re-export the backbone classes from the sibling modules as this package's public API.
from .cmx import CMX
from .cmnext import CMNeXt

__all__ = [
    'CMX',
    'CMNeXt',
]
================================================
FILE: semseg/models/backbones/cmnext.py
================================================
import torch
from torch import nn, Tensor
from torch.nn import functional as F
from semseg.models.layers import DropPath
import functools
from functools import partial
from fvcore.nn import flop_count_table, FlopCountAnalysis
from semseg.models.modules.ffm import FeatureFusionModule as FFM
from semseg.models.modules.ffm import FeatureRectifyModule as FRM
from semseg.models.modules.ffm import ChannelEmbed
from semseg.models.modules.mspa import MSPABlock
from semseg.utils.utils import nchw_to_nlc, nlc_to_nchw
class Attention(nn.Module):
    """Multi-head self-attention over a token sequence.

    When ``sr_ratio > 1`` the keys/values are computed from a spatially reduced
    copy of the input (strided convolution followed by LayerNorm).
    """

    def __init__(self, dim, head, sr_ratio):
        super().__init__()
        self.head = head
        self.sr_ratio = sr_ratio
        self.scale = (dim // head) ** -0.5
        self.q = nn.Linear(dim, dim)
        self.kv = nn.Linear(dim, dim*2)
        self.proj = nn.Linear(dim, dim)

        # The reduction layers exist only when a reduction is requested.
        if sr_ratio > 1:
            self.sr = nn.Conv2d(dim, dim, sr_ratio, sr_ratio)
            self.norm = nn.LayerNorm(dim)

    def forward(self, x: Tensor, H, W) -> Tensor:
        B, N, C = x.shape
        head_dim = C // self.head
        q = self.q(x).reshape(B, N, self.head, head_dim).permute(0, 2, 1, 3)

        kv_input = x
        if self.sr_ratio > 1:
            # Tokens -> feature map -> strided conv -> tokens.
            fmap = x.permute(0, 2, 1).reshape(B, C, H, W)
            reduced = self.sr(fmap).reshape(B, C, -1).permute(0, 2, 1)
            kv_input = self.norm(reduced)

        kv = self.kv(kv_input).reshape(B, -1, 2, self.head, head_dim).permute(2, 0, 3, 1, 4)
        k, v = kv[0], kv[1]

        weights = ((q @ k.transpose(-2, -1)) * self.scale).softmax(dim=-1)
        out = (weights @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj(out)
class DWConv(nn.Module):
    """3x3 depthwise convolution applied to a token sequence viewed as an H x W map."""

    def __init__(self, dim):
        super().__init__()
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, groups=dim)

    def forward(self, x: Tensor, H, W) -> Tensor:
        batch, _, channels = x.shape
        # (B, N, C) -> (B, C, H, W), convolve, then back to (B, N, C).
        fmap = x.transpose(1, 2).view(batch, channels, H, W)
        return self.dwconv(fmap).flatten(2).transpose(1, 2)
class MLP(nn.Module):
    """Feed-forward block: Linear -> depthwise conv -> GELU -> Linear."""

    def __init__(self, c1, c2):
        super().__init__()
        self.fc1 = nn.Linear(c1, c2)
        self.dwconv = DWConv(c2)
        self.fc2 = nn.Linear(c2, c1)

    def forward(self, x: Tensor, H, W) -> Tensor:
        hidden = self.fc1(x)
        hidden = self.dwconv(hidden, H, W)
        hidden = F.gelu(hidden)
        return self.fc2(hidden)
class PatchEmbed(nn.Module):
    """Convolutional patch embedding followed by LayerNorm.

    ``forward`` returns ``(tokens, H, W)`` where ``tokens`` is (B, H*W, c2) and
    H, W are the spatial sizes of the convolution output.
    """

    def __init__(self, c1=3, c2=32, patch_size=7, stride=4, padding=0):
        super().__init__()
        self.proj = nn.Conv2d(c1, c2, patch_size, stride, padding)    # padding=(ps[0]//2, ps[1]//2)
        self.norm = nn.LayerNorm(c2)

    def forward(self, x: Tensor) -> Tensor:
        fmap = self.proj(x)
        H, W = fmap.shape[2:]
        tokens = fmap.flatten(2).transpose(1, 2)
        return self.norm(tokens), H, W
class PatchEmbedParallel(nn.Module):
    """Patch embedding for a list of inputs.

    One convolution is shared by all list entries (via ModuleParallel) while
    each entry has its own normalisation layer (via LayerNormParallel).
    ``forward`` returns ``(list of feature maps, H, W)``.
    """

    def __init__(self, c1=3, c2=32, patch_size=7, stride=4, padding=0, num_modals=4):
        super().__init__()
        shared_conv = nn.Conv2d(c1, c2, patch_size, stride, padding)    # padding=(ps[0]//2, ps[1]//2)
        self.proj = ModuleParallel(shared_conv)
        self.norm = LayerNormParallel(c2, num_modals)

    def forward(self, x: list) -> list:
        projected = self.proj(x)
        # Spatial size is taken from the first entry.
        H, W = projected[0].shape[2:]
        return self.norm(projected), H, W
class Block(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual connection.

    With ``is_fan=True`` the feed-forward part is a ChannelProcessing module
    instead of the plain MLP.
    """

    def __init__(self, dim, head, sr_ratio=1, dpr=0., is_fan=False):
        super().__init__()
        hidden_dim = int(dim*4)
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, head, sr_ratio)
        self.drop_path = DropPath(dpr) if dpr > 0. else nn.Identity()
        self.norm2 = nn.LayerNorm(dim)
        if is_fan:
            self.mlp = ChannelProcessing(dim, mlp_hidden_dim=hidden_dim)
        else:
            self.mlp = MLP(dim, hidden_dim)

    def forward(self, x: Tensor, H, W) -> Tensor:
        attended = self.attn(self.norm1(x), H, W)
        x = x + self.drop_path(attended)
        transformed = self.mlp(self.norm2(x), H, W)
        return x + self.drop_path(transformed)
class ChannelProcessing(nn.Module):
    """Channel-wise gating: a sigmoid gate computed from softmax-normalised
    queries and channel-pooled keys rescales MLP-transformed values.

    ``forward`` takes tokens of shape (B, N, C) and returns the same shape.
    The ``atten`` argument is accepted but not used.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., drop_path=0., mlp_hidden_dim=None, norm_layer=nn.LayerNorm):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
        self.dim = dim
        self.num_heads = num_heads

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.mlp_v = MLP(dim, mlp_hidden_dim)
        self.norm_v = norm_layer(dim)

        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.pool = nn.AdaptiveAvgPool2d((None, 1))
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, H, W, atten=None):
        B, N, C = x.shape
        heads = self.num_heads
        head_dim = C // heads

        def split_heads(t):
            # (B, N, C) -> (B, heads, N, head_dim)
            return t.reshape(B, N, heads, head_dim).permute(0, 2, 1, 3)

        v = split_heads(x)
        q = split_heads(self.q(x))
        k = split_heads(x)

        # Softmax over the token axis; queries become (B, heads, head_dim, N).
        q = q.softmax(-2).transpose(-1, -2)
        Ck = k.shape[-1]
        k = k.softmax(-2)
        # Average each key token over its channel axis -> (B, heads, N, 1).
        k = torch.nn.functional.avg_pool2d(k, (1, Ck))

        attn = self.sigmoid(q @ k)

        Bv, Hd, Nv, Cv = v.shape
        v_tokens = v.transpose(1, 2).reshape(Bv, Nv, Hd*Cv)
        v_tokens = self.norm_v(self.mlp_v(v_tokens, H, W))
        v = v_tokens.reshape(Bv, Nv, Hd, Cv).transpose(1, 2)

        gated = attn * v.transpose(-1, -2)
        return gated.permute(0, 3, 1, 2).reshape(B, N, C)
class PredictorConv(nn.Module):
    """Per-modality score predictor.

    Each modality has its own small network (3x3 depthwise conv -> 1x1 conv
    -> sigmoid) that maps a (B, embed_dim, H, W) feature map to a
    (B, 1, H, W) score map with values in [0, 1].
    """

    def __init__(self, embed_dim=384, num_modals=4):
        super().__init__()
        self.num_modals = num_modals
        self.score_nets = nn.ModuleList([nn.Sequential(
            nn.Conv2d(embed_dim, embed_dim, 3, 1, 1, groups=(embed_dim)),
            nn.Conv2d(embed_dim, 1, 1),
            nn.Sigmoid()
        ) for _ in range(num_modals)])

    def forward(self, x):
        """Return one score map per modality; ``x`` is a list with ``num_modals`` feature maps."""
        # Build the result directly: the previous version first allocated CPU zero
        # tensors for every modality and then overwrote each of them.
        return [self.score_nets[i](x[i]) for i in range(self.num_modals)]
class ModuleParallel(nn.Module):
    """Apply one shared module to every element of a list of inputs."""

    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x_parallel):
        outputs = []
        for branch_input in x_parallel:
            outputs.append(self.module(branch_input))
        return outputs
class ConvLayerNorm(nn.Module):
    """Channel-first layer norm: normalises over dim 1 of a (B, C, H, W) tensor
    and applies a learnable per-channel scale and shift.
    """

    def __init__(self, normalized_shape, eps=1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps

    def forward(self, x: Tensor) -> Tensor:
        mean = x.mean(1, keepdim=True)
        var = (x - mean).pow(2).mean(1, keepdim=True)
        normed = (x - mean) / torch.sqrt(var + self.eps)
        # Broadcast the per-channel parameters over the spatial dimensions.
        scale = self.weight[:, None, None]
        shift = self.bias[:, None, None]
        return scale * normed + shift
class LayerNormParallel(nn.Module):
    """One independent ConvLayerNorm per list position, stored as attributes ``ln_0``, ``ln_1``, ..."""

    def __init__(self, num_features, num_modals=4):
        super().__init__()
        for idx in range(num_modals):
            setattr(self, f'ln_{idx}', ConvLayerNorm(num_features, eps=1e-6))

    def forward(self, x_parallel):
        normed = []
        for idx, feat in enumerate(x_parallel):
            layer = getattr(self, f'ln_{idx}')
            normed.append(layer(feat))
        return normed
# Available backbone variants.
# Each entry maps a variant name to [embedding dims of the four stages, number of blocks per stage].
cmnext_settings = {
    # 'B0': [[32, 64, 160, 256], [2, 2, 2, 2]],
    # 'B1': [[64, 128, 320, 512], [2, 2, 2, 2]],
    'B2': [[64, 128, 320, 512], [3, 4, 6, 3]],
    # 'B3': [[64, 128, 320, 512], [3, 4, 18, 3]],
    'B4': [[64, 128, 320, 512], [3, 8, 27, 3]],
    'B5': [[64, 128, 320, 512], [3, 6, 40, 3]]
}
class CMNeXt(nn.Module):
    """Four-stage multi-modal backbone.

    The first input goes through transformer stages (PatchEmbed + Block). All
    remaining inputs go through a parallel branch (PatchEmbedParallel, optional
    score-based selection across modalities, MSPABlock). After every stage the
    two branches are rectified (FRM) and fused (FFM); the fused map is that
    stage's output.

    Args:
        model_name: key of ``cmnext_settings``. The default is 'B2' because the
            previous default 'B0' is not an available setting and always failed.
        modals: modality names; the first one is the primary input, the rest
            form the extra branch.
    """

    def __init__(self, model_name: str = 'B2', modals: list = ['rgb', 'depth', 'event', 'lidar']):
        super().__init__()
        assert model_name in cmnext_settings.keys(), f"Model name should be in {list(cmnext_settings.keys())}"
        embed_dims, depths = cmnext_settings[model_name]
        extra_depths = depths
        self.modals = modals[1:] if len(modals)>1 else []
        self.num_modals = len(self.modals)
        drop_path_rate = 0.1
        self.channels = embed_dims
        norm_cfg = dict(type='BN', requires_grad=True)

        # patch_embed
        self.patch_embed1 = PatchEmbed(3, embed_dims[0], 7, 4, 7//2)
        self.patch_embed2 = PatchEmbed(embed_dims[0], embed_dims[1], 3, 2, 3//2)
        self.patch_embed3 = PatchEmbed(embed_dims[1], embed_dims[2], 3, 2, 3//2)
        self.patch_embed4 = PatchEmbed(embed_dims[2], embed_dims[3], 3, 2, 3//2)

        if self.num_modals > 0:
            self.extra_downsample_layers = nn.ModuleList([
                PatchEmbedParallel(3, embed_dims[0], 7, 4, 7//2, self.num_modals),
                *[PatchEmbedParallel(embed_dims[i], embed_dims[i+1], 3, 2, 3//2, self.num_modals) for i in range(3)]
            ])
        if self.num_modals > 1:
            # Score predictors are only needed when several extra modalities must be merged.
            self.extra_score_predictor = nn.ModuleList([PredictorConv(embed_dims[i], self.num_modals) for i in range(len(depths))])

        # Drop-path rate grows linearly with block index across all stages.
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

        cur = 0
        self.block1 = nn.ModuleList([Block(embed_dims[0], 1, 8, dpr[cur+i]) for i in range(depths[0])])
        self.norm1 = nn.LayerNorm(embed_dims[0])
        if self.num_modals > 0:
            self.extra_block1 = nn.ModuleList([MSPABlock(embed_dims[0], mlp_ratio=8, drop_path=dpr[cur+i], norm_cfg=norm_cfg) for i in range(extra_depths[0])]) # --- MSPABlock
            self.extra_norm1 = ConvLayerNorm(embed_dims[0])

        cur += depths[0]
        self.block2 = nn.ModuleList([Block(embed_dims[1], 2, 4, dpr[cur+i]) for i in range(depths[1])])
        self.norm2 = nn.LayerNorm(embed_dims[1])
        if self.num_modals > 0:
            self.extra_block2 = nn.ModuleList([MSPABlock(embed_dims[1], mlp_ratio=8, drop_path=dpr[cur+i], norm_cfg=norm_cfg) for i in range(extra_depths[1])])
            self.extra_norm2 = ConvLayerNorm(embed_dims[1])

        cur += depths[1]
        self.block3 = nn.ModuleList([Block(embed_dims[2], 5, 2, dpr[cur+i]) for i in range(depths[2])])
        self.norm3 = nn.LayerNorm(embed_dims[2])
        if self.num_modals > 0:
            self.extra_block3 = nn.ModuleList([MSPABlock(embed_dims[2], mlp_ratio=4, drop_path=dpr[cur+i], norm_cfg=norm_cfg) for i in range(extra_depths[2])])
            self.extra_norm3 = ConvLayerNorm(embed_dims[2])

        cur += depths[2]
        self.block4 = nn.ModuleList([Block(embed_dims[3], 8, 1, dpr[cur+i]) for i in range(depths[3])])
        self.norm4 = nn.LayerNorm(embed_dims[3])
        if self.num_modals > 0:
            self.extra_block4 = nn.ModuleList([MSPABlock(embed_dims[3], mlp_ratio=4, drop_path=dpr[cur+i], norm_cfg=norm_cfg) for i in range(extra_depths[3])])
            self.extra_norm4 = ConvLayerNorm(embed_dims[3])

        if self.num_modals > 0:
            num_heads = [1,2,5,8]
            self.FRMs = nn.ModuleList([
                FRM(dim=embed_dims[0], reduction=1),
                FRM(dim=embed_dims[1], reduction=1),
                FRM(dim=embed_dims[2], reduction=1),
                FRM(dim=embed_dims[3], reduction=1)])
            self.FFMs = nn.ModuleList([
                FFM(dim=embed_dims[0], reduction=1, num_heads=num_heads[0], norm_layer=nn.BatchNorm2d),
                FFM(dim=embed_dims[1], reduction=1, num_heads=num_heads[1], norm_layer=nn.BatchNorm2d),
                FFM(dim=embed_dims[2], reduction=1, num_heads=num_heads[2], norm_layer=nn.BatchNorm2d),
                FFM(dim=embed_dims[3], reduction=1, num_heads=num_heads[3], norm_layer=nn.BatchNorm2d)])

    def tokenselect(self, x_ext, module):
        """Merge the extra-modality features into one map.

        Each entry of ``x_ext`` is re-weighted in place as ``score * x + x`` with
        the scores predicted by ``module``; the result is the element-wise
        maximum over the re-weighted entries. The in-place update is deliberate:
        ``forward`` reuses the re-weighted list afterwards.
        """
        x_scores = module(x_ext)
        for i in range(len(x_ext)):
            x_ext[i] = x_scores[i] * x_ext[i] + x_ext[i]
        x_f = functools.reduce(torch.max, x_ext)
        return x_f

    def forward(self, x: list) -> list:
        """Run all four stages.

        Args:
            x: list of input tensors; ``x[0]`` is the primary input, ``x[1:]``
                are the extra modalities (used only when ``num_modals > 0``).

        Returns:
            List with one feature map per stage: the fused map when extra
            modalities exist, otherwise the primary-branch map.
        """
        x_cam = x[0]
        if self.num_modals > 0:
            x_ext = x[1:]
        B = x_cam.shape[0]
        outs = []
        num_stages = 4

        for stage in range(num_stages):
            idx = stage + 1  # stage modules are registered with 1-based suffixes

            # Primary branch: patch embedding, transformer blocks, norm, tokens -> feature map.
            x_cam, H, W = getattr(self, f'patch_embed{idx}')(x_cam)
            for blk in getattr(self, f'block{idx}'):
                x_cam = blk(x_cam, H, W)
            x_cam = getattr(self, f'norm{idx}')(x_cam).reshape(B, H, W, -1).permute(0, 3, 1, 2)

            if self.num_modals == 0:
                outs.append(x_cam)
                continue

            # Extra branch: downsample every modality, merge them, refine the merged map.
            x_ext, _, _ = self.extra_downsample_layers[stage](x_ext)
            x_f = self.tokenselect(x_ext, self.extra_score_predictor[stage]) if self.num_modals > 1 else x_ext[0]
            for blk in getattr(self, f'extra_block{idx}'):
                x_f = blk(x_f)
            x_f = getattr(self, f'extra_norm{idx}')(x_f)

            # Rectify both branches against each other, then fuse. The rectified
            # primary map is what feeds the next stage.
            x_cam, x_f = self.FRMs[stage](x_cam, x_f)
            outs.append(self.FFMs[stage](x_cam, x_f))

            if stage < num_stages - 1:
                # Inputs of the next extra stage: every modality plus the rectified merged map.
                x_ext = [x_.reshape(B, H, W, -1).permute(0, 3, 1, 2) + x_f for x_ in x_ext] if self.num_modals > 1 else [x_f]

        return outs
if __name__ == '__main__':
    # Smoke test: run a B2 backbone on four constant inputs and print the stage output shapes.
    modals = ['img', 'depth', 'event', 'lidar']
    shape = (1, 3, 1024, 1024)
    inputs = [torch.zeros(*shape)] + [torch.ones(*shape) * value for value in (1, 2, 3)]
    backbone = CMNeXt('B2', modals)
    for feature in backbone(inputs):
        print(feature.shape)
================================================
FILE: semseg/models/backbones/cmx.py
================================================
import torch
from torch import nn, Tensor
from torch.nn import functional as F
import einops
from semseg.models.layers import DropPath
from semseg.models.modules.ffm import FeatureFusionModule as FFM
from semseg.models.modules.ffm import FeatureRectifyModule as FRM
class Attention(nn.Module):
    """Multi-head self-attention whose keys/values may come from a spatially reduced input.

    The reduction (strided conv + LayerNorm) is only created and used when
    ``sr_ratio > 1``.
    """

    def __init__(self, dim, head, sr_ratio):
        super().__init__()
        self.head = head
        self.sr_ratio = sr_ratio
        self.scale = (dim // head) ** -0.5
        self.q = nn.Linear(dim, dim)
        self.kv = nn.Linear(dim, dim*2)
        self.proj = nn.Linear(dim, dim)

        if sr_ratio > 1:
            self.sr = nn.Conv2d(dim, dim, sr_ratio, sr_ratio)
            self.norm = nn.LayerNorm(dim)

    def _reduce(self, tokens: Tensor, H, W) -> Tensor:
        """Shrink the token grid with the strided conv and normalise the result."""
        B, _, C = tokens.shape
        fmap = tokens.permute(0, 2, 1).reshape(B, C, H, W)
        return self.norm(self.sr(fmap).reshape(B, C, -1).permute(0, 2, 1))

    def forward(self, x: Tensor, H, W) -> Tensor:
        B, N, C = x.shape
        per_head = C // self.head
        queries = self.q(x).reshape(B, N, self.head, per_head).permute(0, 2, 1, 3)

        source = self._reduce(x, H, W) if self.sr_ratio > 1 else x
        keys, values = self.kv(source).reshape(B, -1, 2, self.head, per_head).permute(2, 0, 3, 1, 4)

        scores = (queries @ keys.transpose(-2, -1)) * self.scale
        scores = scores.softmax(dim=-1)
        mixed = (scores @ values).transpose(1, 2).reshape(B, N, C)
        return self.proj(mixed)
class DWConv(nn.Module):
    """Depthwise 3x3 convolution over tokens laid out as an H x W grid."""

    def __init__(self, dim):
        super().__init__()
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, groups=dim)

    def forward(self, x: Tensor, H, W) -> Tensor:
        n_batch, n_channels = x.shape[0], x.shape[2]
        grid = x.transpose(1, 2).view(n_batch, n_channels, H, W)
        grid = self.dwconv(grid)
        # Back to the (B, N, C) token layout.
        return grid.flatten(2).transpose(1, 2)
class MLP(nn.Module):
    """Token feed-forward block: linear -> depthwise conv -> GELU -> linear."""

    def __init__(self, c1, c2):
        super().__init__()
        self.fc1 = nn.Linear(c1, c2)
        self.dwconv = DWConv(c2)
        self.fc2 = nn.Linear(c2, c1)

    def forward(self, x: Tensor, H, W) -> Tensor:
        """Expand to ``c2`` channels, mix spatially, then project back to ``c1``."""
        hidden = self.fc1(x)
        hidden = self.dwconv(hidden, H, W)
        hidden = F.gelu(hidden)
        return self.fc2(hidden)
class PatchEmbed(nn.Module):
    """Strided-convolution patch embedding followed by LayerNorm."""

    def __init__(self, c1=3, c2=32, patch_size=7, stride=4):
        super().__init__()
        # Padding of patch_size // 2 on each side.
        self.proj = nn.Conv2d(c1, c2, kernel_size=patch_size, stride=stride, padding=patch_size // 2)
        self.norm = nn.LayerNorm(c2)

    def forward(self, x: Tensor) -> Tensor:
        """Embed a (B, c1, H_in, W_in) map.

        Returns a 3-tuple ``(tokens, H, W)`` where ``tokens`` is (B, H*W, c2)
        and ``H``/``W`` are the spatial size after the convolution.
        """
        fmap = self.proj(x)
        H, W = fmap.shape[-2:]
        tokens = self.norm(fmap.flatten(2).transpose(1, 2))
        return tokens, H, W
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual connection."""

    def __init__(self, dim, head, sr_ratio=1, dpr=0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, head, sr_ratio)
        # A zero rate uses a plain identity instead of DropPath.
        if dpr > 0.:
            self.drop_path = DropPath(dpr)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = MLP(dim, int(dim * 4))

    def forward(self, x: Tensor, H, W) -> Tensor:
        """Apply both residual branches to (B, H*W, dim) tokens."""
        attended = self.attn(self.norm1(x), H, W)
        x = x + self.drop_path(attended)
        mixed = self.mlp(self.norm2(x), H, W)
        return x + self.drop_path(mixed)
# class PredictorConv(nn.Module):
# """ Image to modality score, in spatial selection
# b, h, w, c -> b, h, w, 1
# """
# def __init__(self, embed_dim=384, num_modals=4):
# super().__init__()
# self.num_modals = num_modals
# self.dwconv = ModuleParallel(nn.Conv2d(embed_dim, embed_dim, 7, 1, 3, groups=embed_dim))
# self.score_nets = nn.ModuleList([nn.Sequential(
# nn.LayerNorm(embed_dim, eps=1e-6),
# # nn.Linear(embed_dim, embed_dim),
# # nn.GELU(),
# # nn.Linear(embed_dim, embed_dim // 2),
# # nn.GELU(),
# # nn.Linear(embed_dim // 2, embed_dim // 4),
# # nn.GELU(),
# # nn.Linear(embed_dim // 4, 1),
# # nn.Linear(embed_dim, embed_dim // 4),
# # nn.GELU(),
# nn.Linear(embed_dim, 1),
# # nn.Sigmoid()
# # nn.LogSoftmax(dim=-1)
# nn.Softmax(dim=-1)
# ) for _ in range(num_modals)])
# def forward(self, x):
# x = self.dwconv(x)
# x = [x_.permute(0, 2, 3, 1) for x_ in x] # NCHW to NHWC
# x = [self.score_nets[i](x[i]) for i in range(self.num_modals)]
# x = [x_.permute(0, 3, 1, 2) for x_ in x] # NHWC to NCHW
# # for i, (xi, rat) in enumerate(zip(x, [1, 1, 0.5, 0.2])):
# # x[i] = xi * rat
# return x
class PredictorLG(nn.Module):
    """Per-modality token scoring heads (the original note credits DynamicViT).

    One small network per modality maps every token to a single score.

    NOTE(review): the last layer is ``Softmax(dim=-1)`` over an axis of
    size 1 (the output of ``Linear(embed_dim // 4, 1)``), so every score
    this module returns is exactly 1.0. Confirm whether that is intended
    before relying on these scores to rank modalities.
    """

    def __init__(self, embed_dim=384, num_modals=4):
        super().__init__()
        self.num_modals = num_modals
        heads = []
        for _ in range(num_modals):
            heads.append(nn.Sequential(
                nn.LayerNorm(embed_dim),
                nn.Linear(embed_dim, embed_dim // 4),
                nn.GELU(),
                nn.Linear(embed_dim // 4, 1),
                nn.Softmax(dim=-1),
            ))
        self.score_nets = nn.ModuleList(heads)

    def forward(self, x):
        """Score ``x[i]`` with the i-th head for the first ``num_modals`` inputs."""
        scores = []
        for idx in range(self.num_modals):
            scores.append(self.score_nets[idx](x[idx]))
        return scores
# Backbone variant table: maps a model name to [embed_dims, depths], each a
# list with one entry per stage (four stages). CMX.__init__ looks the
# requested model_name up here.
mit_settings = {
    'B0': [[32, 64, 160, 256], [2, 2, 2, 2]], # [embed_dims, depths]
    'B1': [[64, 128, 320, 512], [2, 2, 2, 2]],
    'B2': [[64, 128, 320, 512], [3, 4, 6, 3]],
    'B3': [[64, 128, 320, 512], [3, 4, 18, 3]],
    'B4': [[64, 128, 320, 512], [3, 8, 27, 3]],
    'B5': [[64, 128, 320, 512], [3, 6, 40, 3]]
}
class CMX(nn.Module):
    """Four-stage dual-branch backbone.

    The first entry of ``modals`` feeds the camera branch; the remaining
    entries feed a second ("extra") branch. With more than one extra
    modality, a single token sequence is picked per position by
    ``tokenselect`` before the extra blocks run. Each stage's two branch
    outputs go through an FRM and an FFM, and the fused map is emitted.
    With a single modality only the camera branch exists.
    """

    def __init__(self, model_name: str = 'B0', modals: list = ['rgb', 'depth', 'event', 'lidar']):
        super().__init__()
        assert model_name in mit_settings.keys(), f"Model name should be in {list(mit_settings.keys())}"
        embed_dims, depths = mit_settings[model_name]
        self.modals = modals[1:] if len(modals)>1 else []  # everything after the first (camera) entry
        self.num_modals = len(self.modals)
        drop_path_rate = 0.1
        self.channels = embed_dims
        has_extra = self.num_modals > 0

        # Per-stage hyper-parameters.
        in_dims = [3] + list(embed_dims[:-1])
        patch_cfg = [(7, 4), (3, 2), (3, 2), (3, 2)]   # (patch_size, stride)
        num_heads = [1, 2, 5, 8]
        sr_ratios = [8, 4, 2, 1]

        # Patch embeddings: camera branch first, then the extra branch.
        for idx, (patch, stride) in enumerate(patch_cfg):
            setattr(self, f'patch_embed{idx + 1}', PatchEmbed(in_dims[idx], embed_dims[idx], patch, stride))
        if has_extra:
            for idx, (patch, stride) in enumerate(patch_cfg):
                setattr(self, f'extra_patch_embed{idx + 1}', PatchEmbed(in_dims[idx], embed_dims[idx], patch, stride))

        if self.num_modals > 1:
            self.extra_score_predictor = nn.ModuleList(
                [PredictorLG(embed_dims[i], self.num_modals) for i in range(len(depths))])

        # Drop-path rate grows linearly over all blocks; both branches use the same schedule.
        dpr = torch.linspace(0, drop_path_rate, sum(depths)).tolist()
        offset = 0
        for idx in range(4):
            dim, depth = embed_dims[idx], depths[idx]
            rates = dpr[offset:offset + depth]
            setattr(self, f'block{idx + 1}', nn.ModuleList(
                [Block(dim, num_heads[idx], sr_ratios[idx], rate) for rate in rates]))
            setattr(self, f'norm{idx + 1}', nn.LayerNorm(dim))
            if has_extra:
                setattr(self, f'extra_block{idx + 1}', nn.ModuleList(
                    [Block(dim, num_heads[idx], sr_ratios[idx], rate) for rate in rates]))
                setattr(self, f'extra_norm{idx + 1}', nn.LayerNorm(dim))
            offset += depth

        if has_extra:
            self.FRMs = nn.ModuleList([FRM(dim=dim, reduction=1) for dim in embed_dims])
            self.FFMs = nn.ModuleList([
                FFM(dim=dim, reduction=1, num_heads=heads, norm_layer=nn.BatchNorm2d)
                for dim, heads in zip(embed_dims, num_heads)])

    # ---Hard selection
    def tokenselect(self, x_ext, module):
        """Pick, per token position, the modality with the highest score.

        ``x_ext`` is a list of (B, N, C) token tensors and ``module`` returns
        one score tensor per modality. The result is a single (B, N, C) tensor.
        """
        scores = module(x_ext)
        tokens = torch.stack(x_ext, dim=-1)        # B, N, C, N_modals
        _, _, C, _ = tokens.shape
        scores = torch.stack(scores, dim=-1)       # B, N, 1, N_modals
        winner = torch.argmax(scores, dim=-1, keepdim=True)              # B, N, 1, 1
        winner = einops.repeat(winner, 'b n 1 m -> b n c m', c=C)        # B, N, C, 1
        return tokens.gather(-1, winner).squeeze(-1)                     # B, N, C

    def forward(self, x: list) -> list:
        """Run all four stages and return one (B, C_i, H_i, W_i) map per stage."""
        x_cam = x[0]
        multimodal = self.num_modals > 0
        if multimodal:
            x_ext = x[1:]
        B = x_cam.shape[0]
        outs = []

        for idx in range(4):
            stage = idx + 1

            # Camera branch.
            x_cam, H, W = getattr(self, f'patch_embed{stage}')(x_cam)
            for blk in getattr(self, f'block{stage}'):
                x_cam = blk(x_cam, H, W)
            x_cam = getattr(self, f'norm{stage}')(x_cam).reshape(B, H, W, -1).permute(0, 3, 1, 2)

            if not multimodal:
                outs.append(x_cam)
                continue

            # Extra branch: embed every modality, then keep one token sequence.
            embed = getattr(self, f'extra_patch_embed{stage}')
            x_ext = [embed(x_ext_)[0] for x_ext_ in x_ext]
            if self.num_modals > 1:
                x_f = self.tokenselect(x_ext, self.extra_score_predictor[idx])
            else:
                x_f = x_ext[0]
            for blk in getattr(self, f'extra_block{stage}'):
                x_f = blk(x_f, H, W)
            x_f = getattr(self, f'extra_norm{stage}')(x_f).reshape(B, H, W, -1).permute(0, 3, 1, 2)

            # --- FFM
            x_cam, x_f = self.FRMs[idx](x_cam, x_f)
            outs.append(self.FFMs[idx](x_cam, x_f))

            # Inputs of the next stage's extra branch (nothing follows the last stage).
            if stage < 4:
                if self.num_modals > 1:
                    x_ext = [tok.reshape(B, H, W, -1).permute(0, 3, 1, 2) + x_f for tok in x_ext]
                else:
                    x_ext = [x_f]

        return outs
if __name__ == '__main__':
    # Smoke test with a single modality: build the B2 backbone, run one
    # zero image through it, then print the model and each output shape.
    # For a multi-modal run, extend `modals` (e.g. ['img', 'depth']) and
    # supply one 1x3x1024x1024 tensor per entry in `x`.
    modals = ['img']
    x = [torch.zeros(1, 3, 1024, 1024)]
    model = CMX('B2', modals)
    outs = model(x)
    print(model)
    for feat in outs:
        print(feat.shape)
    # print(flop_count_table(FlopCountAnalysis(model, x)))
================================================
FILE: semseg/models/base.py
================================================
import torch
import math
from torch import nn
from semseg.models.backbones import *
from semseg.models.layers import trunc_normal_
from collections import OrderedDict
def load_dualpath_model(model, model_file):
    """Load the backbone-related entries of a checkpoint into ``model``.

    ``model_file`` is either a path (loaded on CPU, unwrapping a top-level
    ``'model'`` entry if present) or an already-loaded state dict. Only keys
    containing ``patch_embed``, ``block`` or ``norm`` are kept; they are
    loaded non-strictly and the result of ``load_state_dict`` is printed.
    """
    if isinstance(model_file, str):
        checkpoint = torch.load(model_file, map_location=torch.device('cpu'))
        raw_state_dict = checkpoint['model'] if 'model' in checkpoint.keys() else checkpoint
    else:
        raw_state_dict = model_file

    wanted = ('patch_embed', 'block', 'norm')
    state_dict = {
        key: value
        for key, value in raw_state_dict.items()
        if any(tag in key for tag in wanted)
    }

    msg = model.load_state_dict(state_dict, strict=False)
    print(msg)
class BaseModel(nn.Module):
    """Common base for segmentation models: builds the backbone and provides
    weight initialisation and pretrained-checkpoint loading.

    Args:
        backbone: ``'<ClassName>-<variant>'``; the class name is resolved
            among the names imported into this module.
        num_classes: accepted for signature compatibility; not used here.
        modals: modality names, forwarded to the backbone constructor.
    """

    def __init__(self, backbone: str = 'MiT-B0', num_classes: int = 19, modals: list = ['rgb', 'depth', 'event', 'lidar']) -> None:
        super().__init__()
        backbone, variant = backbone.split('-')
        # NOTE(security): eval() executes whatever precedes the '-' in the
        # backbone string; only pass trusted configuration values here.
        self.backbone = eval(backbone)(variant, modals)
        self.modals = modals

    def _init_weights(self, m: nn.Module) -> None:
        """Initialise one module; intended for use with ``nn.Module.apply``.

        * ``nn.Linear``: truncated normal (std 0.02) weights, zero bias.
        * ``nn.Conv2d``: normal weights with std ``sqrt(2 / fan_out)``, where
          ``fan_out = kh * kw * out_channels // groups``; zero bias.
        * ``nn.LayerNorm`` / ``nn.BatchNorm2d``: weight 1, bias 0.
        """
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            # The original evaluated `fan_out // m.groups` and discarded the
            # result, so grouped/depthwise convs ignored their group count.
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d)):
            # Norm layers built without affine parameters have weight/bias set to None.
            if m.weight is not None:
                nn.init.ones_(m.weight)
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def init_pretrained(self, pretrained: str = None) -> None:
        """Load backbone weights from the checkpoint at ``pretrained``.

        Does nothing when ``pretrained`` is falsy. With more than one
        modality the work is delegated to ``load_dualpath_model``; otherwise
        the checkpoint is loaded on CPU, unwrapped from a ``'state_dict'``
        and then a ``'model'`` entry if present, loaded non-strictly, and
        the result of ``load_state_dict`` is printed.
        """
        if not pretrained:
            return
        if len(self.modals) > 1:
            load_dualpath_model(self.backbone, pretrained)
            return

        checkpoint = torch.load(pretrained, map_location='cpu')
        for wrapper_key in ('state_dict', 'model'):   # 'model' is used by HorNet checkpoints
            if wrapper_key in checkpoint.keys():
                checkpoint = checkpoint[wrapper_key]
        msg = self.backbone.load_state_dict(checkpoint, strict=False)
        print(msg)
================================================
FILE: semseg/models/cmnext.py
================================================
import torch
from torch import Tensor
from torch.nn import functional as F
from semseg.models.base import BaseModel
from semseg.models.heads import SegFormerHead
from semseg.models.heads import LightHamHead
from semseg.models.heads import UPerHead
from fvcore.nn import flop_count_table, FlopCountAnalysis
class CMNeXt(BaseModel):
    """Segmentation model: the backbone built by ``BaseModel`` plus a SegFormer head."""

    def __init__(self, backbone: str = 'CMNeXt-B0', num_classes: int = 25, modals: list = ['img', 'depth', 'event', 'lidar']) -> None:
        super().__init__(backbone, num_classes, modals)
        # Backbone strings containing 'B0' or 'B1' get a 256-wide head, all others 512.
        head_dim = 256 if 'B0' in backbone or 'B1' in backbone else 512
        self.decode_head = SegFormerHead(self.backbone.channels, head_dim, num_classes)
        self.apply(self._init_weights)

    def forward(self, x: list) -> list:
        """Segment the inputs in ``x`` (``x[0]`` sets the output size).

        Despite the annotation, the value returned is the output of
        ``F.interpolate``: the head output resized bilinearly to
        ``x[0].shape[2:]``.
        """
        feats = self.backbone(x)
        logits = self.decode_head(feats)
        return F.interpolate(logits, size=x[0].shape[2:], mode='bilinear', align_corners=False)

    def init_pretrained(self, pretrained: str = None) -> None:
        """Load backbone weights from ``pretrained``; does nothing when it is falsy.

        If the backbone has extra modalities, ``load_dualpath_model`` does
        the loading. Otherwise the checkpoint is read on CPU, unwrapped from
        ``'state_dict'`` and then ``'model'`` if present, loaded
        non-strictly, and the load result is printed.
        """
        if not pretrained:
            return
        if self.backbone.num_modals > 0:
            load_dualpath_model(self.backbone, pretrained)
            return

        checkpoint = torch.load(pretrained, map_location='cpu')
        for wrapper_key in ('state_dict', 'model'):
            if wrapper_key in checkpoint.keys():
                checkpoint = checkpoint[wrapper_key]
        msg = self.backbone.load_state_dict(checkpoint, strict=False)
        print(msg)
def load_dualpath_model(model, model_file, extra_pretrained=None):
    """Load pretrained backbone weights into a dual-branch ``model``.

    Args:
        model: module exposing ``load_state_dict``; ``model.num_modals`` is
            read only when ``extra_pretrained`` is given.
        model_file: checkpoint path (loaded on CPU, unwrapping a top-level
            ``'model'`` entry if present) or an already-loaded state dict.
            Keys containing ``patch_embed``, ``block`` or ``norm`` are
            copied unchanged.
        extra_pretrained: optional second checkpoint (path, unwrapping a
            ``'state_dict'`` entry if present, or a state dict) whose keys
            are renamed onto the extra branch. Defaults to ``None``, which
            matches the previously hard-coded behaviour of loading only
            ``model_file``.

    The collected entries are loaded with ``strict=False``.
    """
    if isinstance(model_file, str):
        raw_state_dict = torch.load(model_file, map_location=torch.device('cpu'))
        if 'model' in raw_state_dict.keys():
            raw_state_dict = raw_state_dict['model']
    else:
        raw_state_dict = model_file

    wanted = ('patch_embed', 'block', 'norm')
    state_dict = {k: v for k, v in raw_state_dict.items() if any(tag in k for tag in wanted)}

    if extra_pretrained is not None:
        if isinstance(extra_pretrained, str):
            raw_state_dict_ext = torch.load(extra_pretrained, map_location=torch.device('cpu'))
            if 'state_dict' in raw_state_dict_ext.keys():
                raw_state_dict_ext = raw_state_dict_ext['state_dict']
        else:
            raw_state_dict_ext = extra_pretrained

        for k, v in raw_state_dict_ext.items():
            if 'patch_embed' in k:
                # Patch-embedding keys are handled only here. In the original
                # chain they could also fall through to the generic 'norm'
                # rename and yield unused 'patch_embedN.extra_norm.*' keys.
                for stage in range(4):
                    proj_key = f'patch_embed{stage + 1}.proj'
                    norm_key = f'patch_embed{stage + 1}.norm'
                    target = f'extra_downsample_layers.{stage}'
                    if proj_key in k:
                        state_dict[k.replace(proj_key, f'{target}.proj.module')] = v
                    if norm_key in k:
                        # The same norm weights are copied once per extra modality.
                        for i in range(model.num_modals):
                            state_dict[k.replace(norm_key, f'{target}.norm.ln_{i}')] = v
            elif 'block' in k:
                state_dict[k.replace('block', 'extra_block')] = v
            elif 'norm' in k:
                state_dict[k.replace('norm', 'extra_norm')] = v

    model.load_state_dict(state_dict, strict=False)
if __name__ == '__main__':
    # Smoke test: build the four-modality B2 model, load the pretrained
    # weights and print the shape of the prediction for constant inputs.
    modals = ['img', 'depth', 'event', 'lidar']
    model = CMNeXt('CMNeXt-B2', 25, modals)
    model.init_pretrained('checkpoints/pretrained/segformer/mit_b2.pth')
    # One 1x3x1024x1024 tensor per modality, filled with 0, 1, 2 and 3.
    x = [torch.full((1, 3, 1024, 1024), float(fill)) for fill in range(4)]
    print(model(x).shape)
================================================
FILE: semseg/models/cmx.py
================================================
import torch
from torch import Tensor
from torch.nn import functional as F
from semseg.models.base import BaseModel
from semseg.models.heads import SegFormerHead
from fvcore.nn import flop_count_table, FlopCountAnalysis
class CMX(BaseModel):
    """Segmentation model: the backbone built by ``BaseModel`` plus a SegFormer head."""

    def __init__(self, backbone: str = 'CMX-B0', num_classes: int = 25, modals: list = ['img', 'depth', 'event', 'lidar']) -> None:
        super().__init__(backbone, num_classes, modals)
        # Backbone strings containing 'B0' or 'B1' get a 256-wide head, all others 512.
        if 'B0' in backbone or 'B1' in backbone:
            head_dim = 256
        else:
            head_dim = 512
        self.decode_head = SegFormerHead(self.backbone.channels, head_dim, num_classes)
        self.apply(self._init_weights)

    def forward(self, x: list) -> list:
        """Segment the inputs in ``x``.

        Despite the annotation, the value returned is the output of
        ``F.interpolate``: the head output resized bilinearly to the
        spatial size of ``x[0]``.
        """
        out = self.decode_head(self.backbone(x))
        # Back to the original image shape.
        out = F.interpolate(out, size=x[0].shape[2:], mode='bilinear', align_corners=False)
        return out

    def init_pretrained(self, pretrained: str = None) -> None:
        """Load backbone weights from ``pretrained``; does nothing when it is falsy.

        If the backbone has extra modalities, ``load_dualpath_model`` does
        the loading. Otherwise the checkpoint is read on CPU, unwrapped from
        ``'state_dict'`` and then ``'model'`` if present, loaded
        non-strictly, and the load result is printed.
        """
        if not pretrained:
            return
        if self.backbone.num_modals > 0:
            load_dualpath_model(self.backbone, pretrained)
            return

        checkpoint = torch.load(pretrained, map_location='cpu')
        for wrapper_key in ('state_dict', 'model'):
            if wrapper_key in checkpoint.keys():
                checkpoint = checkpoint[wrapper_key]
        msg = self.backbone.load_state_dict(checkpoint, strict=False)
        print(msg)
def load_dualpath_model(model, model_file):
    """Copy the backbone-related checkpoint entries into ``model``.

    ``model_file`` may be a path (loaded on CPU; a top-level ``'model'``
    entry is unwrapped if present) or a state dict. Entries whose key
    contains ``patch_embed``, ``block`` or ``norm`` are loaded with
    ``strict=False``; everything else is dropped.
    """
    if not isinstance(model_file, str):
        raw_state_dict = model_file
    else:
        raw_state_dict = torch.load(model_file, map_location=torch.device('cpu'))
        if 'model' in raw_state_dict.keys():
            raw_state_dict = raw_state_dict['model']

    state_dict = {}
    for key, value in raw_state_dict.items():
        if 'patch_embed' in key or 'block' in key or 'norm' in key:
            state_dict[key] = value

    model.load_state_dict(state_dict, strict=False)
if __name__ == '__main__':
    # Smoke test with a single modality: build the B2 model, load the
    # pretrained weights and print the prediction shape for a zero image.
    # For a multi-modal run use e.g. modals = ['img', 'depth', 'event', 'lidar']
    # and pass one input tensor per modality.
    modals = ['img']
    model = CMX('CMX-B2', 25, modals)
    model.init_pretrained('checkpoints/pretrained/segformer/mit_b2.pth')
    x = [torch.zeros(1, 3, 512, 512)]
    print(model(x).shape)
================================================
FILE: semseg/models/heads/__init__.py
================================================
from .upernet import UPerHead
from .segformer import SegFormerHead
from .sfnet import SFHead
from .fpn import FPNHead
from .fapn import FaPNHead
from .fcn import FCNHead
from .condnet import CondHead
from .lawin import LawinHead
from .hem import LightHamHead
# Only these two names are exported by `from semseg.models.heads import *`;
# the other heads imported above have to be imported by name.
# NOTE(review): confirm whether the remaining heads were left out on purpose.
__all__ = ['UPerHead', 'SegFormerHead']
================================================
FILE: semseg/models/heads/condnet.py
================================================
import torch
from torch import nn, Tensor
from torch.nn import functional as F
from semseg.models.layers import ConvModule
class CondHead(nn.Module):
    """Segmentation head that predicts per-image, per-class 1x1 filters.

    A guidance map pools the features of each class into one vector per
    class; a grouped 1x1 convolution turns these vectors into the weights
    and biases of a per-image classifier that is applied to the features.
    """

    def __init__(self, in_channel: int = 2048, channel: int = 512, num_classes: int = 19):
        super().__init__()
        self.num_classes = num_classes
        self.weight_num = channel * num_classes   # generated filter weights per image
        self.bias_num = num_classes               # generated biases per image

        self.conv = ConvModule(in_channel, channel, 1)
        self.dropout = nn.Dropout2d(0.1)
        self.guidance_project = nn.Conv2d(channel, num_classes, 1)
        self.filter_project = nn.Conv2d(channel*num_classes, self.weight_num + self.bias_num, 1, groups=num_classes)

    def forward(self, features) -> Tensor:
        """Run the head on ``features[-1]``.

        Returns ``(cond_logit, seg_logit)`` in training mode and only
        ``seg_logit`` otherwise.
        """
        feat = self.dropout(self.conv(features[-1]))
        B, C, H, W = feat.shape

        cond_logit = self.guidance_project(feat)
        class_prob = cond_logit.softmax(dim=1).view(*cond_logit.shape[:2], -1)   # B, classes, H*W
        flat_feat = feat.view(B, C, -1).permute(0, 2, 1)                         # B, H*W, C

        # Probability-weighted average feature per class.
        pooled = torch.matmul(class_prob, flat_feat) / (H * W)
        generated = self.filter_project(pooled.view(B, -1, 1, 1)).view(B, -1)
        weight, bias = torch.split(generated, [self.weight_num, self.bias_num], dim=1)
        weight = weight.reshape(B * self.num_classes, -1, 1, 1)
        bias = bias.reshape(B * self.num_classes)

        # Fold the batch into channels so that groups=B applies each image's own filters.
        stacked = feat.view(-1, H, W).unsqueeze(0)
        seg_logit = F.conv2d(stacked, weight, bias, 1, 0, groups=B).view(B, self.num_classes, H, W)

        if self.training:
            return cond_logit, seg_logit
        return seg_logit
if __name__ == '__main__':
    # Smoke test: run the head on ResNetD-50 features and print the shape of
    # every returned map after resizing it to the input resolution.
    from semseg.models.backbones import ResNetD
    backbone = ResNetD('50')
    head = CondHead()
    x = torch.randn(2, 3, 224, 224)
    for logits in head(backbone(x)):
        resized = F.interpolate(logits, size=x.shape[-2:], mode='bilinear', align_corners=False)
        print(resized.shape)
================================================
FILE: semseg/models/heads/fapn.py
================================================
import torch
from torch import nn, Tensor
from torch.nn import functional as F
from torchvision.ops import DeformConv2d
from semseg.models.layers import ConvModule
class DCNv2(nn.Module):
    """Deformable convolution whose offsets and mask are predicted by a regular conv."""

    def __init__(self, c1, c2, k, s, p, g=1):
        super().__init__()
        self.dcn = DeformConv2d(c1, c2, k, s, p, groups=g)
        # Predicts 3 * k * k channels per group; forward() splits them into
        # three equal chunks (two offset chunks and one mask chunk).
        self.offset_mask = nn.Conv2d(c2, g* 3 * k * k, k, s, p)
        self._init_offset()

    def _init_offset(self):
        """Zero the offset/mask predictor's weight and bias."""
        for param in (self.offset_mask.weight, self.offset_mask.bias):
            param.data.zero_()

    def forward(self, x, offset):
        """Apply the deformable conv to ``x`` using predictions made from ``offset``."""
        first, second, mask = self.offset_mask(offset).chunk(3, dim=1)
        offsets = torch.cat((first, second), dim=1)
        return self.dcn(x, offsets, mask.sigmoid())
class FSM(nn.Module):
    """Channel re-weighting followed by a 1x1 projection from ``c1`` to ``c2`` channels."""

    def __init__(self, c1, c2):
        super().__init__()
        self.conv_atten = nn.Conv2d(c1, c1, 1, bias=False)
        self.conv = nn.Conv2d(c1, c2, 1, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        """Return ``conv(x + x * gate)``, where the gate comes from globally pooled ``x``."""
        pooled = F.avg_pool2d(x, x.shape[2:])       # one value per channel
        gate = self.conv_atten(pooled).sigmoid()
        return self.conv(x + torch.mul(x, gate))
class FAM(nn.Module):
    """Aligns a coarser feature map to a finer one with a deformable convolution."""

    def __init__(self, c1, c2):
        super().__init__()
        self.lateral_conv = FSM(c1, c2)
        self.offset = nn.Conv2d(c2*2, c2, 1, bias=False)
        self.dcpack_l2 = DCNv2(c2, c2, 3, 1, 1, 8)

    def forward(self, feat_l, feat_s):
        """Fuse ``feat_l`` with ``feat_s`` after resizing ``feat_s`` to ``feat_l``'s spatial size."""
        if feat_l.shape[2:] == feat_s.shape[2:]:
            upsampled = feat_s
        else:
            upsampled = F.interpolate(feat_s, size=feat_l.shape[2:], mode='bilinear', align_corners=False)

        lateral = self.lateral_conv(feat_l)
        # Input from which DCNv2 predicts its offsets and mask.
        offset_input = self.offset(torch.cat([lateral, upsampled*2], dim=1))
        aligned = F.relu(self.dcpack_l2(upsampled, offset_input))
        return aligned + lateral
class FaPNHead(nn.Module):
    """Top-down decoder that merges feature levels through FAM alignment modules."""

    def __init__(self, in_channels, channel=128, num_classes=19):
        super().__init__()
        # Process levels from the last entry of in_channels to the first.
        last, *earlier = in_channels[::-1]
        self.align_modules = nn.ModuleList([ConvModule(last, channel, 1)])
        self.output_convs = nn.ModuleList([])

        for level_channels in earlier:
            self.align_modules.append(FAM(level_channels, channel))
            self.output_convs.append(ConvModule(channel, channel, 3, 1, 1))

        self.conv_seg = nn.Conv2d(channel, num_classes, 1)
        self.dropout = nn.Dropout2d(0.1)

    def forward(self, features) -> Tensor:
        """Merge ``features`` starting from the last level and return class logits."""
        reversed_feats = features[::-1]
        out = self.align_modules[0](reversed_feats[0])

        remaining = zip(reversed_feats[1:], self.align_modules[1:], self.output_convs)
        for feat, align_module, output_conv in remaining:
            out = output_conv(align_module(feat, out))

        return self.conv_seg(self.dropout(out))
if __name__ == '__main__':
    # Smoke test: ResNet-50 features -> FaPN head -> resize to the input size.
    from semseg.models.backbones import ResNet
    backbone = ResNet('50')
    head = FaPNHead([256, 512, 1024, 2048], 128, 19)
    x = torch.randn(2, 3, 224, 224)
    logits = head(backbone(x))
    logits = F.interpolate(logits, size=x.shape[-2:], mode='bilinear', align_corners=False)
    print(logits.shape)
================================================
FILE: semseg/models/heads/fcn.py
================================================
import torch
from torch import nn, Tensor
from torch.nn import functional as F
from semseg.models.layers import ConvModule
class FCNHead(nn.Module):
    """Minimal head: a 1x1 ConvModule followed by a 1x1 classifier on the last feature map."""

    def __init__(self, c1, c2, num_classes: int = 19):
        super().__init__()
        self.conv = ConvModule(c1, c2, 1)
        self.cls = nn.Conv2d(c2, num_classes, 1)

    def forward(self, features) -> Tensor:
        """Classify ``features[-1]``; all other levels are ignored."""
        last_level = features[-1]
        return self.cls(self.conv(last_level))
if __name__ == '__main__':
    # Smoke test: ResNet-50 features -> FCN head -> resize to the input size.
    from semseg.models.backbones import ResNet
    backbone = ResNet('50')
    head = FCNHead(2048, 256, 19)
    x = torch.randn(2, 3, 224, 224)
    logits = head(backbone(x))
    logits = F.interpolate(logits, size=x.shape[-2:], mode='bilinear', align_corners=False)
    print(logits.shape)
================================================
FILE: semseg/models/heads/fpn.py
================================================
import torch
from torch import nn, Tensor
from torch.nn import functional as F
from semseg.models.layers import ConvModule
class FPNHead(nn.Module):
    """Panoptic Feature Pyramid Networks
    https://arxiv.org/abs/1901.02446

    Top-down pathway: starting from the last feature level, the running map
    is upsampled by 2, added to the lateral projection of the next level and
    smoothed by a 3x3 conv.
    """

    def __init__(self, in_channels, channel=128, num_classes=19):
        super().__init__()
        self.lateral_convs = nn.ModuleList([])
        self.output_convs = nn.ModuleList([])

        # One lateral/output pair per level, ordered from the last level to the first.
        for level_channels in reversed(in_channels):
            self.lateral_convs.append(ConvModule(level_channels, channel, 1))
            self.output_convs.append(ConvModule(channel, channel, 3, 1, 1))

        self.conv_seg = nn.Conv2d(channel, num_classes, 1)
        self.dropout = nn.Dropout2d(0.1)

    def forward(self, features) -> Tensor:
        """Merge ``features`` top-down and return class logits at the first level's resolution."""
        ordered = features[::-1]
        out = self.lateral_convs[0](ordered[0])

        # output_convs[0] is never applied: the first processed level only gets its lateral conv.
        for idx, feat in enumerate(ordered[1:], start=1):
            out = F.interpolate(out, scale_factor=2.0, mode='nearest')
            out = out + self.lateral_convs[idx](feat)
            out = self.output_convs[idx](out)

        return self.conv_seg(self.dropout(out))
if __name__ == '__main__':
    # Smoke test: ResNet-50 features -> FPN head -> resize to the input size.
    from semseg.models.backbones import ResNet
    backbone = ResNet('50')
    head = FPNHead([256, 512, 1024, 2048], 128, 19)
    x = torch.randn(2, 3, 224, 224)
    logits = head(backbone(x))
    logits = F.interpolate(logits, size=x.shape[-2:], mode='bilinear', align_corners=False)
    print(logits.shape)
================================================
FILE: semseg/models/heads/hem.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from fvcore.nn import flop_count_table, FlopCountAnalysis
class _MatrixDecomposition2DBase(nn.Module):
    """Base class for iterative low-rank decomposition of a (B, C, H, W) map.

    The input is reshaped to ``(B * S, D, N)`` and approximated by
    ``bases @ coef^T`` with ``bases`` of shape ``(B * S, D, R)`` and ``coef``
    of shape ``(B * S, N, R)``. Subclasses provide ``_build_bases``,
    ``local_step`` and ``compute_coef``.

    Args:
        args: optional dict of settings (``SPATIAL``, ``MD_S``, ``MD_D``,
            ``MD_R``, ``TRAIN_STEPS``, ``EVAL_STEPS``, ``INV_T``, ``ETA``,
            ``RAND_INIT``). Missing keys are filled in with their defaults
            in the dict that was passed. ``None`` means "all defaults" and
            uses a fresh dict, so no state is shared between instances.
    """

    def __init__(self, args=None):
        super().__init__()
        args = {} if args is None else args

        self.spatial = args.setdefault('SPATIAL', True)
        self.S = args.setdefault('MD_S', 1)
        self.D = args.setdefault('MD_D', 512)
        self.R = args.setdefault('MD_R', 64)
        self.train_steps = args.setdefault('TRAIN_STEPS', 6)
        self.eval_steps = args.setdefault('EVAL_STEPS', 7)
        self.inv_t = args.setdefault('INV_T', 100)
        self.eta = args.setdefault('ETA', 0.9)
        self.rand_init = args.setdefault('RAND_INIT', True)

        for label, value in (
            ('spatial', self.spatial),
            ('S', self.S),
            ('D', self.D),
            ('R', self.R),
            ('train_steps', self.train_steps),
            ('eval_steps', self.eval_steps),
            ('inv_t', self.inv_t),
            ('eta', self.eta),
            ('rand_init', self.rand_init),
        ):
            print(label, value)

    def _build_bases(self, B, S, D, R, cuda=False):
        """Return initial bases of shape (B * S, D, R)."""
        raise NotImplementedError

    def local_step(self, x, bases, coef):
        """Perform one update and return the new ``(bases, coef)``."""
        raise NotImplementedError

    def _bases_like(self, batch, D, ref):
        """Build bases for ``batch`` samples on the same device as ``ref``."""
        bases = self._build_bases(batch, self.S, D, self.R, cuda=ref.is_cuda)
        # The device was previously hard-coded to CUDA, which broke CPU inputs.
        return bases.to(ref.device)

    # @torch.no_grad()
    def local_inference(self, x, bases):
        """Initialise ``coef`` from ``x`` and ``bases``, then run the update steps."""
        # (B * S, D, N)^T @ (B * S, D, R) -> (B * S, N, R)
        coef = torch.bmm(x.transpose(1, 2), bases)
        coef = F.softmax(self.inv_t * coef, dim=-1)

        steps = self.train_steps if self.training else self.eval_steps
        for _ in range(steps):
            bases, coef = self.local_step(x, bases, coef)

        return bases, coef

    def compute_coef(self, x, bases, coef):
        """Return the final coefficients for the converged ``bases``."""
        raise NotImplementedError

    def forward(self, x, return_bases=False):
        """Return the low-rank reconstruction of ``x`` with the same shape as ``x``.

        ``return_bases`` is accepted for signature compatibility and is not used.
        """
        B, C, H, W = x.shape

        # (B, C, H, W) -> (B * S, D, N)
        if self.spatial:
            D = C // self.S
            N = H * W
            x = x.view(B * self.S, D, N)
        else:
            D = H * W
            N = C // self.S
            x = x.view(B * self.S, N, D).transpose(1, 2)

        if not self.rand_init and not hasattr(self, 'bases'):
            # Fixed bases: built once on first use and kept as a buffer.
            self.register_buffer('bases', self._bases_like(1, D, x))

        # (S, D, R) -> (B * S, D, R)
        if self.rand_init:
            bases = self._bases_like(B, D, x)
        else:
            bases = self.bases.repeat(B, 1, 1)

        bases, coef = self.local_inference(x, bases)

        # (B * S, N, R)
        coef = self.compute_coef(x, bases, coef)

        # (B * S, D, R) @ (B * S, N, R)^T -> (B * S, D, N)
        x = torch.bmm(bases, coef.transpose(1, 2))

        # (B * S, D, N) -> (B, C, H, W)
        if self.spatial:
            x = x.view(B, C, H, W)
        else:
            # The transposed tensor is not contiguous, so .view() raises here.
            x = x.transpose(1, 2).reshape(B, C, H, W)

        return x


class NMF2D(_MatrixDecomposition2DBase):
    """Decomposition with multiplicative updates; ``inv_t`` is fixed to 1."""

    def __init__(self, args=None):
        super().__init__(args)
        self.inv_t = 1

    def _build_bases(self, B, S, D, R, cuda=False):
        """Return uniform-random bases of shape (B * S, D, R), L2-normalised along D."""
        bases = torch.rand((B * S, D, R))
        if cuda:
            bases = bases.cuda()
        return F.normalize(bases, dim=1)

    @staticmethod
    def _update_coef(x, bases, coef):
        """One multiplicative update of ``coef`` with ``bases`` held fixed."""
        # (B * S, D, N)^T @ (B * S, D, R) -> (B * S, N, R)
        numerator = torch.bmm(x.transpose(1, 2), bases)
        # (B * S, N, R) @ [(B * S, D, R)^T @ (B * S, D, R)] -> (B * S, N, R)
        denominator = coef.bmm(bases.transpose(1, 2).bmm(bases))
        # The small constant keeps the division away from zero.
        return coef * numerator / (denominator + 1e-6)

    # @torch.no_grad()
    def local_step(self, x, bases, coef):
        """Update ``coef`` first, then ``bases`` using the new ``coef``."""
        coef = self._update_coef(x, bases, coef)

        # (B * S, D, N) @ (B * S, N, R) -> (B * S, D, R)
        numerator = torch.bmm(x, coef)
        # (B * S, D, R) @ [(B * S, N, R)^T @ (B * S, N, R)] -> (B * S, D, R)
        denominator = bases.bmm(coef.transpose(1, 2).bmm(coef))
        bases = bases * numerator / (denominator + 1e-6)

        return bases, coef

    def compute_coef(self, x, bases, coef):
        """Apply one more coefficient update with the final ``bases``."""
        return self._update_coef(x, bases, coef)
class Hamburger(nn.Module):
    """1x1 conv -> ReLU -> NMF2D decomposition -> 1x1 conv, with a residual connection."""

    def __init__(self, ham_channels=512, ham_kwargs=dict(), norm_cfg=None):
        super().__init__()
        # The input projection has no norm or activation; the output one uses norm_cfg.
        self.ham_in = ConvModule(ham_channels, ham_channels, 1, norm_cfg=None, act_cfg=None)
        self.ham = NMF2D(ham_kwargs)
        self.ham_out = ConvModule(ham_channels, ham_channels, 1, norm_cfg=norm_cfg, act_cfg=None)

    def forward(self, x):
        """Return ``relu(x + ham_out(ham(relu(ham_in(x)))))``."""
        hidden = F.relu(self.ham_in(x), inplace=True)
        hidden = self.ham_out(self.ham(hidden))
        return F.relu(x + hidden, inplace=True)
class LightHamHead(nn.Module):
    """Decoder head that concatenates feature levels 1-3 and refines them with a Hamburger module."""

    def __init__(self, in_channels=[64, 128, 320, 512], ham_channels=512, ham_kwargs=dict(), num_classes=25):
        super().__init__()
        # Level 0 is not used by this head.
        self.in_channels = in_channels[1:]
        self.in_index = [1,2,3]
        self.channels = ham_channels
        self.ham_channels = ham_channels
        self.conv_cfg = None
        self.norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
        self.act_cfg = dict(type='ReLU')

        shared_cfg = dict(conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)
        self.squeeze = ConvModule(sum(self.in_channels), self.ham_channels, 1, **shared_cfg)
        self.hamburger = Hamburger(ham_channels, ham_kwargs, self.norm_cfg)
        self.align = ConvModule(self.ham_channels, self.channels, 1, **shared_cfg)
        self.conv_seg = nn.Conv2d(self.channels, num_classes, kernel_size=1)

    def forward(self, inputs):
        """Forward function."""
        selected = [inputs[i] for i in self.in_index]
        # Resize every selected level to the size of the first selected one.
        target_size = selected[0].shape[2:]
        resized = [
            F.interpolate(level, size=target_size, mode='bilinear', align_corners=False)
            for level in selected
        ]
        fused = self.squeeze(torch.cat(resized, dim=1))
        refined = self.align(self.hamburger(fused))
        return self.conv_seg(refined)
if __name__ == '__main__':
    # Smoke test: build the head on the GPU, push four constant feature maps
    # through it, then print the module, the output shapes and a FLOP table.
    model = LightHamHead(num_classes=25).cuda()
    # (channels, spatial size, fill value) of each dummy feature level.
    level_specs = [(64, 256, 0), (128, 128, 1), (320, 64, 2), (512, 32, 3)]
    x = [(torch.ones(1, ch, size, size) * fill).cuda() for ch, size, fill in level_specs]
    outs = model(x)
    print(model)
    for y in outs:
        print(y.shape)
    print(flop_count_table(FlopCountAnalysis(model, x)))
================================================
FILE: semseg/models/heads/lawin.py
================================================
import torch
from torch import nn, Tensor
from torch.nn import functional as F
from einops import rearrange
class MLP(nn.Module):
    """Linear embedding applied independently to every spatial position.

    Args:
        dim: Number of input channels.
        embed_dim: Number of output channels.
    """

    def __init__(self, dim=2048, embed_dim=768):
        super().__init__()
        self.proj = nn.Linear(dim, embed_dim)

    def forward(self, x: Tensor) -> Tensor:
        """Flatten every dim after the channel dim into tokens and project them.

        For a (B, C, H, W) input the result is (B, H * W, embed_dim).
        """
        tokens = torch.flatten(x, start_dim=2).transpose(1, 2)
        return self.proj(tokens)
class PatchEmbed(nn.Module):
    """Downsample a feature map by ``patch_size`` and layer-normalise its channels.

    Args:
        patch_size: Kernel size and stride of the downsampling operator.
        in_ch: Input channels (only used by the ``'conv'`` variant).
        dim: Channel count seen by the LayerNorm and of the returned map.
        type: ``'conv'`` selects a strided grouped convolution; any other
            value selects the average of a max-pool and an avg-pool.
    """

    def __init__(self, patch_size=4, in_ch=3, dim=96, type='pool') -> None:
        super().__init__()
        self.patch_size = patch_size
        self.type = type
        self.dim = dim

        if type == 'conv':
            self.proj = nn.Conv2d(
                in_ch, dim, kernel_size=patch_size, stride=patch_size, groups=patch_size * patch_size
            )
        else:
            # Pooling keeps the channel count, so the input must already
            # have ``dim`` channels for the LayerNorm below to apply.
            self.proj = nn.ModuleList([
                nn.MaxPool2d(patch_size, patch_size),
                nn.AvgPool2d(patch_size, patch_size),
            ])
        self.norm = nn.LayerNorm(dim)

    def forward(self, x: Tensor) -> Tensor:
        """Return the downsampled, channel-normalised map of shape (B, dim, H', W')."""
        step = self.patch_size
        _, _, H, W = x.shape

        # Zero-pad on the right / bottom so both sides are multiples of the patch size.
        pad_w = -W % step
        pad_h = -H % step
        if pad_w or pad_h:
            x = F.pad(x, (0, pad_w, 0, pad_h))

        if self.type == 'conv':
            x = self.proj(x)
        else:
            max_pool, avg_pool = self.proj
            x = 0.5 * (max_pool(x) + avg_pool(x))

        Wh, Ww = x.shape[-2:]
        # LayerNorm works on the last dim, so move channels last, normalise,
        # then restore the (B, C, H', W') layout.
        tokens = self.norm(x.flatten(2).transpose(1, 2))
        return tokens.transpose(1, 2).view(-1, self.dim, Wh, Ww)
class LawinAttn(nn.Module):
    """Multi-head cross-attention from a query patch to a context patch.

    Before attention, each head's slice of context channels is passed
    through its own linear layer acting on the flattened spatial axis, and
    the result is added back to the context.

    Args:
        in_ch: Channels of both query and context.
        head: Number of attention heads (also the number of position-mixing
            layers).
        patch_size: Side length of the context patch; the mixing layers act
            on ``patch_size * patch_size`` positions.
        reduction: Divisor applied to ``in_ch`` to get the inner attention
            width.
    """

    def __init__(self, in_ch=512, head=4, patch_size=8, reduction=2) -> None:
        super().__init__()
        self.head = head

        num_positions = patch_size * patch_size
        self.position_mixing = nn.ModuleList(
            [nn.Linear(num_positions, num_positions) for _ in range(self.head)]
        )

        self.inter_channels = max(in_ch // reduction, 1)
        # 1x1 projections: g -> values, theta -> queries, phi -> keys.
        self.g = nn.Conv2d(in_ch, self.inter_channels, 1)
        self.theta = nn.Conv2d(in_ch, self.inter_channels, 1)
        self.phi = nn.Conv2d(in_ch, self.inter_channels, 1)
        self.conv_out = nn.Sequential(
            nn.Conv2d(self.inter_channels, in_ch, 1, bias=False),
            nn.BatchNorm2d(in_ch),
        )

    def forward(self, query: Tensor, context: Tensor) -> Tensor:
        """Attend from ``query`` to ``context`` and add the result to ``query``.

        Both inputs are 4-D (B, C, H, W) maps. The context must have
        ``patch_size * patch_size`` spatial positions so the mixing layers
        apply. The output has the shape of ``query``.
        """
        B, C, H, W = context.shape
        split_heads = "b (h dim) n -> (b h) dim n"

        # Position mixing: one linear layer per head over the spatial axis.
        flat_context = context.reshape(B, C, -1)
        group = C // self.head
        mixed = torch.cat(
            [
                mixer(flat_context[:, k * group:(k + 1) * group, :])
                for k, mixer in enumerate(self.position_mixing)
            ],
            dim=1,
        )
        context = (flat_context + mixed).reshape(B, C, H, W)

        # Values and queries end up as (B * head, N, dim); keys stay (B * head, dim, N).
        value = self.g(context).view(B, self.inter_channels, -1)
        value = rearrange(value, split_heads, h=self.head).permute(0, 2, 1)

        q = self.theta(query).view(B, self.inter_channels, -1)
        q = rearrange(q, split_heads, h=self.head).permute(0, 2, 1)

        key = self.phi(context).view(B, self.inter_channels, -1)
        key = rearrange(key, split_heads, h=self.head)

        # Scaled dot-product attention over the context positions.
        attn = torch.matmul(q, key)
        attn = attn / q.shape[-1] ** 0.5
        attn = attn.softmax(dim=-1)

        # Merge the heads back and restore the query's spatial layout.
        y = rearrange(torch.matmul(attn, value), "(b h) n dim -> b n (h dim)", h=self.head)
        y = y.permute(0, 2, 1).contiguous().reshape(B, self.inter_channels, *query.shape[-2:])
        return query + self.conv_out(y)
class ConvModule(nn.Module):
    """Bias-free 1x1 convolution followed by BatchNorm and an in-place ReLU.

    Args:
        c1: Input channels.
        c2: Output channels.
    """

    def __init__(self, c1, c2):
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm2d(c2)        # use SyncBN in original
        self.activate = nn.ReLU(inplace=True)

    def forward(self, x: Tensor) -> Tensor:
        """Apply conv, batch norm and ReLU in that order."""
        out = self.conv(x)
        out = self.bn(out)
        return self.activate(out)
class LawinHead(nn.Module):
    """Segmentation head combining multi-level fusion, windowed attention and a low-level skip.

    The head expects four feature levels: ``linear_fuse`` takes exactly
    three ``embed_dim``-wide maps (levels 1-3) and level 0 feeds the
    48-channel low-level branch.

    Args:
        in_channels: Channel count of each input feature level, in order.
        embed_dim: Working width of the head.
        num_classes: Number of output channels of the final classifier.
    """

    def __init__(self, in_channels: list, embed_dim=512, num_classes=19) -> None:
        super().__init__()
        # linear_c1 .. linear_cN: one per-position projection per level.
        # Level 0 gets a narrow 48-channel embedding; the rest use embed_dim.
        for i, dim in enumerate(in_channels):
            self.add_module(f"linear_c{i+1}", MLP(dim, 48 if i == 0 else embed_dim))

        # One attention module and one context downsampler per context ratio r.
        # They are looked up by name (lawin_{r}, ds_{r}) in get_lawin_att_feats.
        self.lawin_8 = LawinAttn(embed_dim, 64)
        self.lawin_4 = LawinAttn(embed_dim, 16)
        self.lawin_2 = LawinAttn(embed_dim, 4)

        self.ds_8 = PatchEmbed(8, embed_dim, embed_dim)
        self.ds_4 = PatchEmbed(4, embed_dim, embed_dim)
        self.ds_2 = PatchEmbed(2, embed_dim, embed_dim)

        # Global-context branch: pool to 1x1, then project.
        self.image_pool = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            ConvModule(embed_dim, embed_dim)
        )
        self.linear_fuse = ConvModule(embed_dim*3, embed_dim)
        self.short_path = ConvModule(embed_dim, embed_dim)
        # Fuses short path + image pool + the three attention branches.
        self.cat = ConvModule(embed_dim*5, embed_dim)

        self.low_level_fuse = ConvModule(embed_dim+48, embed_dim)
        self.linear_pred = nn.Conv2d(embed_dim, num_classes, 1)
        self.dropout = nn.Dropout2d(0.1)

    def get_lawin_att_feats(self, x: Tensor, patch_size: int):
        """Run windowed attention at context ratios 8, 4 and 2.

        Args:
            x: Feature map of shape (B, C, H, W). The rearrange patterns use
                ``H // patch_size`` and ``W // patch_size`` patches, so H and
                W are presumably expected to be multiples of ``patch_size``.
            patch_size: Side length of each non-overlapping query patch.

        Returns:
            A list of three (B, C, H, W) maps, one per ratio in [8, 4, 2].
        """
        _, _, H, W = x.shape
        nh, nw = H // patch_size, W // patch_size

        # Non-overlapping query patches, folded into the batch dimension.
        query = F.unfold(x, patch_size, stride=patch_size)
        query = rearrange(query, 'b (c ph pw) (nh nw) -> (b nh nw) c ph pw', ph=patch_size, pw=patch_size, nh=nh, nw=nw)
        outs = []

        for r in [8, 4, 2]:
            # Context windows are r times larger than the query patch, use the
            # same stride, and are padded so there is one window per patch.
            context = F.unfold(x, patch_size*r, stride=patch_size, padding=int((r-1)/2*patch_size))
            context = rearrange(context, "b (c ph pw) (nh nw) -> (b nh nw) c ph pw", ph=patch_size*r, pw=patch_size*r, nh=nh, nw=nw)
            # Downsample the window by r before attending.
            context = getattr(self, f"ds_{r}")(context)
            output = getattr(self, f"lawin_{r}")(query, context)
            # Stitch the attended patches back into a full feature map.
            output = rearrange(output, "(b nh nw) c ph pw -> b c (nh ph) (nw pw)", ph=patch_size, pw=patch_size, nh=nh, nw=nw)
            outs.append(output)
        return outs

    def forward(self, features):
        """Compute class logits from a sequence of feature maps.

        Args:
            features: Indexable sequence of (B, C_i, H_i, W_i) maps ordered
                like ``in_channels``.

        Returns:
            Logits with ``num_classes`` channels at the spatial size of
            ``features[0]``.
        """
        B, _, H, W = features[1].shape
        outs = [self.linear_c2(features[1]).permute(0, 2, 1).reshape(B, -1, *features[1].shape[-2:])]

        for i, feature in enumerate(features[2:]):
            # Resolve linear_c3, linear_c4, ... by name. getattr performs the
            # same attribute lookup as the former eval() call without
            # evaluating a code string.
            proj = getattr(self, f"linear_c{i+3}")
            cf = proj(feature).permute(0, 2, 1).reshape(B, -1, *feature.shape[-2:])
            # Bring every deeper level to the resolution of features[1].
            outs.append(F.interpolate(cf, size=(H, W), mode='bilinear', align_corners=False))

        feat = self.linear_fuse(torch.cat(outs[::-1], dim=1))
        B, _, H, W = feat.shape

        ## Lawin attention spatial pyramid pooling
        feat_short = self.short_path(feat)
        feat_pool = F.interpolate(self.image_pool(feat), size=(H, W), mode='bilinear', align_corners=False)
        feat_lawin = self.get_lawin_att_feats(feat, 8)
        output = self.cat(torch.cat([feat_short, feat_pool, *feat_lawin], dim=1))

        ## Low-level feature enhancement
        c1 = self.linear_c1(features[0]).permute(0, 2, 1).reshape(B, -1, *features[0].shape[-2:])
        output = F.interpolate(output, size=features[0].shape[-2:], mode='bilinear', align_corners=False)
        fused = self.low_level_fuse(torch.cat([output, c1], dim=1))
        seg = self.linear_pred(self.dropout(fused))
        return seg
================================================
FILE: semseg/models/heads/segformer.py
================================================
import torch
from torch import nn, Tensor
from typing import Tuple
from torch.nn import functional as F
class MLP(nn.Module):
    """Per-position linear projection from ``dim`` to ``embed_dim`` channels."""

    def __init__(self, dim, embed_dim):
        super().__init__()
        self.proj = nn.Linear(dim, embed_dim)

    def forward(self, x: Tensor) -> Tensor:
        """Turn a (B, C, H, W) map into (B, H * W, embed_dim) tokens."""
        # Collapse everything after the channel dim, then put channels last
        # so the linear layer acts on them.
        tokens = torch.flatten(x, start_dim=2).transpose(1, 2)
        return self.proj(tokens)
class ConvModule(nn.Module):
    """1x1 convolution without bias, then BatchNorm, then in-place ReLU.

    Args:
        c1: Input channels.
        c2: Output channels.
    """

    def __init__(self, c1, c2):
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm2d(c2)        # use SyncBN in original
        self.activate = nn.ReLU(inplace=True)

    def forward(self, x: Tensor) -> Tensor:
        """Return ``relu(bn(conv(x)))``."""
        normed = self.bn(self.conv(x))
        return self.activate(normed)
class SegFormerHead(nn.Module):
def __init__(self, dims: list, embed_dim: int = 256, num_classes: int = 19):
super().__init__()
for i, dim in enumerate(dims):
self.add_module(f"linear_c{i+1}", MLP(dim, embed_dim))
self.linear_fuse = ConvModule(embed_dim*4, embed_dim)
self.linear_pred = nn.Conv2d(embed_dim, num_classes, 1)
self.dropout = nn.Dropout2d(0.1)
def forward(self, features: Tuple[Tensor, Tensor, Tensor, Tensor]) -> T
gitextract_aaso4r_f/
├── .gitignore
├── LICENSE
├── README.md
├── configs/
│ ├── deliver_rgbdel.yaml
│ ├── kitti360_rgbdel.yaml
│ ├── mcubes_rgbadn.yaml
│ ├── mfnet_rgbt.yaml
│ ├── nyu_rgbd.yaml
│ └── urbanlf.yaml
├── environment.yaml
├── requirements.txt
├── semseg/
│ ├── __init__.py
│ ├── augmentations.py
│ ├── augmentations_mm.py
│ ├── datasets/
│ │ ├── __init__.py
│ │ ├── deliver.py
│ │ ├── kitti360.py
│ │ ├── mcubes.py
│ │ ├── mfnet.py
│ │ ├── nyu.py
│ │ ├── unzip.py
│ │ └── urbanlf.py
│ ├── losses.py
│ ├── metrics.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── backbones/
│ │ │ ├── __init__.py
│ │ │ ├── cmnext.py
│ │ │ └── cmx.py
│ │ ├── base.py
│ │ ├── cmnext.py
│ │ ├── cmx.py
│ │ ├── heads/
│ │ │ ├── __init__.py
│ │ │ ├── condnet.py
│ │ │ ├── fapn.py
│ │ │ ├── fcn.py
│ │ │ ├── fpn.py
│ │ │ ├── hem.py
│ │ │ ├── lawin.py
│ │ │ ├── segformer.py
│ │ │ ├── sfnet.py
│ │ │ └── upernet.py
│ │ ├── layers/
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ └── initialize.py
│ │ └── modules/
│ │ ├── __init__.py
│ │ ├── crossatt.py
│ │ ├── ffm.py
│ │ ├── mspa.py
│ │ ├── ppm.py
│ │ └── psa.py
│ ├── optimizers.py
│ ├── schedulers.py
│ └── utils/
│ ├── __init__.py
│ ├── utils.py
│ └── visualize.py
└── tools/
├── infer_mm.py
├── train_mm.py
└── val_mm.py
SYMBOL INDEX (463 symbols across 39 files)
FILE: semseg/__init__.py
function show_models (line 7) | def show_models():
function show_backbones (line 13) | def show_backbones():
function show_heads (line 24) | def show_heads():
function show_datasets (line 30) | def show_datasets():
FILE: semseg/augmentations.py
class Compose (line 9) | class Compose:
method __init__ (line 10) | def __init__(self, transforms: list) -> None:
method __call__ (line 13) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class Normalize (line 25) | class Normalize:
method __init__ (line 26) | def __init__(self, mean: list = (0.485, 0.456, 0.406), std: list = (0....
method __call__ (line 30) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class ColorJitter (line 37) | class ColorJitter:
method __init__ (line 38) | def __init__(self, brightness=0, contrast=0, saturation=0, hue=0) -> N...
method __call__ (line 44) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class AdjustGamma (line 56) | class AdjustGamma:
method __init__ (line 57) | def __init__(self, gamma: float, gain: float = 1) -> None:
method __call__ (line 66) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class RandomAdjustSharpness (line 70) | class RandomAdjustSharpness:
method __init__ (line 71) | def __init__(self, sharpness_factor: float, p: float = 0.5) -> None:
method __call__ (line 75) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class RandomAutoContrast (line 81) | class RandomAutoContrast:
method __init__ (line 82) | def __init__(self, p: float = 0.5) -> None:
method __call__ (line 85) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class RandomGaussianBlur (line 91) | class RandomGaussianBlur:
method __init__ (line 92) | def __init__(self, kernel_size: int = 3, p: float = 0.5) -> None:
method __call__ (line 96) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class RandomHorizontalFlip (line 102) | class RandomHorizontalFlip:
method __init__ (line 103) | def __init__(self, p: float = 0.5) -> None:
method __call__ (line 106) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class RandomVerticalFlip (line 112) | class RandomVerticalFlip:
method __init__ (line 113) | def __init__(self, p: float = 0.5) -> None:
method __call__ (line 116) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class RandomGrayscale (line 122) | class RandomGrayscale:
method __init__ (line 123) | def __init__(self, p: float = 0.5) -> None:
method __call__ (line 126) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class Equalize (line 132) | class Equalize:
method __call__ (line 133) | def __call__(self, image, label):
class Posterize (line 137) | class Posterize:
method __init__ (line 138) | def __init__(self, bits=2):
method __call__ (line 141) | def __call__(self, image, label):
class Affine (line 145) | class Affine:
method __init__ (line 146) | def __init__(self, angle=0, translate=[0, 0], scale=1.0, shear=[0, 0],...
method __call__ (line 153) | def __call__(self, img, label):
class RandomRotation (line 157) | class RandomRotation:
method __init__ (line 158) | def __init__(self, degrees: float = 10.0, p: float = 0.2, seg_fill: in...
method __call__ (line 174) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class CenterCrop (line 182) | class CenterCrop:
method __init__ (line 183) | def __init__(self, size: Union[int, List[int], Tuple[int]]) -> None:
method __call__ (line 191) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class RandomCrop (line 195) | class RandomCrop:
method __init__ (line 196) | def __init__(self, size: Union[int, List[int], Tuple[int]], p: float =...
method __call__ (line 205) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class Pad (line 221) | class Pad:
method __init__ (line 222) | def __init__(self, size: Union[List[int], Tuple[int], int], seg_fill: ...
method __call__ (line 231) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class ResizePad (line 236) | class ResizePad:
method __init__ (line 237) | def __init__(self, size: Union[int, Tuple[int], List[int]], seg_fill: ...
method __call__ (line 247) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class Resize (line 265) | class Resize:
method __init__ (line 266) | def __init__(self, size: Union[int, Tuple[int], List[int]]) -> None:
method __call__ (line 275) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class RandomResizedCrop (line 291) | class RandomResizedCrop:
method __init__ (line 292) | def __init__(self, size: Union[int, Tuple[int], List[int]], scale: Tup...
method __call__ (line 299) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
function get_train_augmentation (line 334) | def get_train_augmentation(size: Union[int, Tuple[int], List[int]], seg_...
function get_val_augmentation (line 348) | def get_val_augmentation(size: Union[int, Tuple[int], List[int]]):
FILE: semseg/augmentations_mm.py
class Compose (line 9) | class Compose:
method __init__ (line 10) | def __init__(self, transforms: list) -> None:
method __call__ (line 13) | def __call__(self, sample: list) -> list:
class Normalize (line 26) | class Normalize:
method __init__ (line 27) | def __init__(self, mean: list = (0.485, 0.456, 0.406), std: list = (0....
method __call__ (line 31) | def __call__(self, sample: list) -> list:
class RandomColorJitter (line 46) | class RandomColorJitter:
method __init__ (line 47) | def __init__(self, p=0.5) -> None:
method __call__ (line 50) | def __call__(self, sample: list) -> list:
class AdjustGamma (line 61) | class AdjustGamma:
method __init__ (line 62) | def __init__(self, gamma: float, gain: float = 1) -> None:
method __call__ (line 71) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class RandomAdjustSharpness (line 75) | class RandomAdjustSharpness:
method __init__ (line 76) | def __init__(self, sharpness_factor: float, p: float = 0.5) -> None:
method __call__ (line 80) | def __call__(self, sample: list) -> list:
class RandomAutoContrast (line 86) | class RandomAutoContrast:
method __init__ (line 87) | def __init__(self, p: float = 0.5) -> None:
method __call__ (line 90) | def __call__(self, sample: list) -> list:
class RandomGaussianBlur (line 96) | class RandomGaussianBlur:
method __init__ (line 97) | def __init__(self, kernel_size: int = 3, p: float = 0.5) -> None:
method __call__ (line 101) | def __call__(self, sample: list) -> list:
class RandomHorizontalFlip (line 108) | class RandomHorizontalFlip:
method __init__ (line 109) | def __init__(self, p: float = 0.5) -> None:
method __call__ (line 112) | def __call__(self, sample: list) -> list:
class RandomVerticalFlip (line 120) | class RandomVerticalFlip:
method __init__ (line 121) | def __init__(self, p: float = 0.5) -> None:
method __call__ (line 124) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class RandomGrayscale (line 130) | class RandomGrayscale:
method __init__ (line 131) | def __init__(self, p: float = 0.5) -> None:
method __call__ (line 134) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class Equalize (line 140) | class Equalize:
method __call__ (line 141) | def __call__(self, image, label):
class Posterize (line 145) | class Posterize:
method __init__ (line 146) | def __init__(self, bits=2):
method __call__ (line 149) | def __call__(self, image, label):
class Affine (line 153) | class Affine:
method __init__ (line 154) | def __init__(self, angle=0, translate=[0, 0], scale=1.0, shear=[0, 0],...
method __call__ (line 161) | def __call__(self, img, label):
class RandomRotation (line 165) | class RandomRotation:
method __init__ (line 166) | def __init__(self, degrees: float = 10.0, p: float = 0.2, seg_fill: in...
method __call__ (line 182) | def __call__(self, sample: list) -> list:
class CenterCrop (line 195) | class CenterCrop:
method __init__ (line 196) | def __init__(self, size: Union[int, List[int], Tuple[int]]) -> None:
method __call__ (line 204) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class RandomCrop (line 208) | class RandomCrop:
method __init__ (line 209) | def __init__(self, size: Union[int, List[int], Tuple[int]], p: float =...
method __call__ (line 218) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class Pad (line 234) | class Pad:
method __init__ (line 235) | def __init__(self, size: Union[List[int], Tuple[int], int], seg_fill: ...
method __call__ (line 244) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class ResizePad (line 249) | class ResizePad:
method __init__ (line 250) | def __init__(self, size: Union[int, Tuple[int], List[int]], seg_fill: ...
method __call__ (line 260) | def __call__(self, img: Tensor, mask: Tensor) -> Tuple[Tensor, Tensor]:
class Resize (line 278) | class Resize:
method __init__ (line 279) | def __init__(self, size: Union[int, Tuple[int], List[int]]) -> None:
method __call__ (line 288) | def __call__(self, sample:list) -> list:
class RandomResizedCrop (line 315) | class RandomResizedCrop:
method __init__ (line 316) | def __init__(self, size: Union[int, Tuple[int], List[int]], scale: Tup...
method __call__ (line 323) | def __call__(self, sample: list) -> list:
function get_train_augmentation (line 365) | def get_train_augmentation(size: Union[int, Tuple[int], List[int]], seg_...
function get_val_augmentation (line 374) | def get_val_augmentation(size: Union[int, Tuple[int], List[int]]):
FILE: semseg/datasets/deliver.py
class DELIVER (line 16) | class DELIVER(Dataset):
method __init__ (line 51) | def __init__(self, root: str = 'data/DELIVER', split: str = 'train', t...
method __len__ (line 70) | def __len__(self) -> int:
method __getitem__ (line 73) | def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
method _open_img (line 103) | def _open_img(self, file):
method encode (line 112) | def encode(self, label: Tensor) -> Tensor:
FILE: semseg/datasets/kitti360.py
class KITTI360 (line 15) | class KITTI360(Dataset):
method __init__ (line 28) | def __init__(self, root: str = 'data/KITTI360', split: str = 'train', ...
method __len__ (line 49) | def __len__(self) -> int:
method __getitem__ (line 52) | def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
method _open_img (line 81) | def _open_img(self, file):
method encode (line 90) | def encode(self, label: Tensor) -> Tensor:
method _get_file_names (line 94) | def _get_file_names(self, split_name):
FILE: semseg/datasets/mcubes.py
class MCubeS (line 19) | class MCubeS(Dataset):
method __init__ (line 47) | def __init__(self, root: str = 'data/MCubeS/multimodal_dataset', split...
method __len__ (line 74) | def __len__(self) -> int:
method __getitem__ (line 77) | def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
method transform_tr (line 120) | def transform_tr(self, sample):
method transform_val (line 130) | def transform_val(self, sample):
method _get_file_names (line 138) | def _get_file_names(self, split_name):
class Normalize (line 152) | class Normalize(object):
method __init__ (line 158) | def __init__(self, mean=(0., 0., 0.), std=(1., 1., 1.)):
method __call__ (line 162) | def __call__(self, sample):
class ToTensor (line 185) | class ToTensor(object):
method __call__ (line 188) | def __call__(self, sample):
class RandomHorizontalFlip (line 232) | class RandomHorizontalFlip(object):
method __call__ (line 233) | def __call__(self, sample):
class RandomGaussianBlur (line 267) | class RandomGaussianBlur(object):
method __call__ (line 268) | def __call__(self, sample):
class RandomScaleCrop (line 289) | class RandomScaleCrop(object):
method __init__ (line 290) | def __init__(self, base_size, crop_size, fill=255):
method __call__ (line 295) | def __call__(self, sample):
class FixScaleCrop (line 388) | class FixScaleCrop(object):
method __init__ (line 389) | def __init__(self, crop_size):
method __call__ (line 392) | def __call__(self, sample):
FILE: semseg/datasets/mfnet.py
class MFNet (line 15) | class MFNet(Dataset):
method __init__ (line 22) | def __init__(self, root: str = 'data/MFNet', split: str = 'train', tra...
method __len__ (line 36) | def __len__(self) -> int:
method __getitem__ (line 39) | def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
method _open_img (line 59) | def _open_img(self, file):
method encode (line 68) | def encode(self, label: Tensor) -> Tensor:
method _get_file_names (line 71) | def _get_file_names(self, split_name):
FILE: semseg/datasets/nyu.py
class NYU (line 16) | class NYU(Dataset):
method __init__ (line 27) | def __init__(self, root: str = 'data/NYUDepthv2', split: str = 'train'...
method __len__ (line 40) | def __len__(self) -> int:
method __getitem__ (line 43) | def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
method _open_img (line 70) | def _open_img(self, file):
method encode (line 79) | def encode(self, label: Tensor) -> Tensor:
method _get_file_names (line 82) | def _get_file_names(self, split_name):
FILE: semseg/datasets/urbanlf.py
class UrbanLF (line 16) | class UrbanLF(Dataset):
method __init__ (line 24) | def __init__(self, root: str = 'data/UrBanLF/Syn', split: str = 'train...
method __len__ (line 38) | def __len__(self) -> int:
method __getitem__ (line 41) | def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
method _open_img (line 88) | def _open_img(self, file):
method encode (line 97) | def encode(self, label: Tensor) -> Tensor:
FILE: semseg/losses.py
class CrossEntropy (line 6) | class CrossEntropy(nn.Module):
method __init__ (line 7) | def __init__(self, ignore_label: int = 255, weight: Tensor = None, aux...
method _forward (line 12) | def _forward(self, preds: Tensor, labels: Tensor) -> Tensor:
method forward (line 16) | def forward(self, preds, labels: Tensor) -> Tensor:
class OhemCrossEntropy (line 22) | class OhemCrossEntropy(nn.Module):
method __init__ (line 23) | def __init__(self, ignore_label: int = 255, weight: Tensor = None, thr...
method _forward (line 30) | def _forward(self, preds: Tensor, labels: Tensor) -> Tensor:
method forward (line 41) | def forward(self, preds, labels: Tensor) -> Tensor:
class Dice (line 47) | class Dice(nn.Module):
method __init__ (line 48) | def __init__(self, delta: float = 0.5, aux_weights: list = [1, 0.4, 0....
method _forward (line 56) | def _forward(self, preds: Tensor, labels: Tensor) -> Tensor:
method forward (line 70) | def forward(self, preds, targets: Tensor) -> Tensor:
function get_loss (line 79) | def get_loss(loss_fn_name: str = 'CrossEntropy', ignore_label: int = 255...
FILE: semseg/metrics.py
class Metrics (line 6) | class Metrics:
method __init__ (line 7) | def __init__(self, num_classes: int, ignore_label: int, device) -> None:
method update (line 12) | def update(self, pred: Tensor, target: Tensor) -> None:
method compute_iou (line 17) | def compute_iou(self) -> Tuple[Tensor, Tensor]:
method compute_f1 (line 26) | def compute_f1(self) -> Tuple[Tensor, Tensor]:
method compute_pixel_acc (line 35) | def compute_pixel_acc(self) -> Tuple[Tensor, Tensor]:
FILE: semseg/models/backbones/cmnext.py
class Attention (line 15) | class Attention(nn.Module):
method __init__ (line 16) | def __init__(self, dim, head, sr_ratio):
method forward (line 29) | def forward(self, x: Tensor, H, W) -> Tensor:
class DWConv (line 48) | class DWConv(nn.Module):
method __init__ (line 49) | def __init__(self, dim):
method forward (line 53) | def forward(self, x: Tensor, H, W) -> Tensor:
class MLP (line 60) | class MLP(nn.Module):
method __init__ (line 61) | def __init__(self, c1, c2):
method forward (line 67) | def forward(self, x: Tensor, H, W) -> Tensor:
class PatchEmbed (line 71) | class PatchEmbed(nn.Module):
method __init__ (line 72) | def __init__(self, c1=3, c2=32, patch_size=7, stride=4, padding=0):
method forward (line 77) | def forward(self, x: Tensor) -> Tensor:
class PatchEmbedParallel (line 84) | class PatchEmbedParallel(nn.Module):
method __init__ (line 85) | def __init__(self, c1=3, c2=32, patch_size=7, stride=4, padding=0, num...
method forward (line 90) | def forward(self, x: list) -> list:
class Block (line 96) | class Block(nn.Module):
method __init__ (line 97) | def __init__(self, dim, head, sr_ratio=1, dpr=0., is_fan=False):
method forward (line 105) | def forward(self, x: Tensor, H, W) -> Tensor:
class ChannelProcessing (line 111) | class ChannelProcessing(nn.Module):
method __init__ (line 112) | def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., dro...
method forward (line 126) | def forward(self, x, H, W, atten=None):
class PredictorConv (line 146) | class PredictorConv(nn.Module):
method __init__ (line 147) | def __init__(self, embed_dim=384, num_modals=4):
method forward (line 156) | def forward(self, x):
class ModuleParallel (line 163) | class ModuleParallel(nn.Module):
method __init__ (line 164) | def __init__(self, module):
method forward (line 168) | def forward(self, x_parallel):
class ConvLayerNorm (line 171) | class ConvLayerNorm(nn.Module):
method __init__ (line 174) | def __init__(self, normalized_shape, eps=1e-6) -> None:
method forward (line 180) | def forward(self, x: Tensor) -> Tensor:
class LayerNormParallel (line 187) | class LayerNormParallel(nn.Module):
method __init__ (line 188) | def __init__(self, num_features, num_modals=4):
method forward (line 194) | def forward(self, x_parallel):
class CMNeXt (line 208) | class CMNeXt(nn.Module):
method __init__ (line 209) | def __init__(self, model_name: str = 'B0', modals: list = ['rgb', 'dep...
method tokenselect (line 277) | def tokenselect(self, x_ext, module):
method forward (line 284) | def forward(self, x: list) -> list:
FILE: semseg/models/backbones/cmx.py
class Attention (line 9) | class Attention(nn.Module):
method __init__ (line 10) | def __init__(self, dim, head, sr_ratio):
method forward (line 23) | def forward(self, x: Tensor, H, W) -> Tensor:
class DWConv (line 42) | class DWConv(nn.Module):
method __init__ (line 43) | def __init__(self, dim):
method forward (line 47) | def forward(self, x: Tensor, H, W) -> Tensor:
class MLP (line 54) | class MLP(nn.Module):
method __init__ (line 55) | def __init__(self, c1, c2):
method forward (line 61) | def forward(self, x: Tensor, H, W) -> Tensor:
class PatchEmbed (line 65) | class PatchEmbed(nn.Module):
method __init__ (line 66) | def __init__(self, c1=3, c2=32, patch_size=7, stride=4):
method forward (line 71) | def forward(self, x: Tensor) -> Tensor:
class Block (line 79) | class Block(nn.Module):
method __init__ (line 80) | def __init__(self, dim, head, sr_ratio=1, dpr=0.):
method forward (line 88) | def forward(self, x: Tensor, H, W) -> Tensor:
class PredictorLG (line 128) | class PredictorLG(nn.Module):
method __init__ (line 131) | def __init__(self, embed_dim=384, num_modals=4):
method forward (line 142) | def forward(self, x):
class CMX (line 156) | class CMX(nn.Module):
method __init__ (line 157) | def __init__(self, model_name: str = 'B0', modals: list = ['rgb', 'dep...
method tokenselect (line 230) | def tokenselect(self, x_ext, module):
method forward (line 242) | def forward(self, x: list) -> list:
FILE: semseg/models/base.py
function load_dualpath_model (line 8) | def load_dualpath_model(model, model_file):
class BaseModel (line 37) | class BaseModel(nn.Module):
method __init__ (line 38) | def __init__(self, backbone: str = 'MiT-B0', num_classes: int = 19, mo...
method _init_weights (line 45) | def _init_weights(self, m: nn.Module) -> None:
method init_pretrained (line 60) | def init_pretrained(self, pretrained: str = None) -> None:
FILE: semseg/models/cmnext.py
class CMNeXt (line 11) | class CMNeXt(BaseModel):
method __init__ (line 12) | def __init__(self, backbone: str = 'CMNeXt-B0', num_classes: int = 25,...
method forward (line 17) | def forward(self, x: list) -> list:
method init_pretrained (line 23) | def init_pretrained(self, pretrained: str = None) -> None:
function load_dualpath_model (line 36) | def load_dualpath_model(model, model_file):
FILE: semseg/models/cmx.py
class CMX (line 8) | class CMX(BaseModel):
method __init__ (line 9) | def __init__(self, backbone: str = 'CMX-B0', num_classes: int = 25, mo...
method forward (line 14) | def forward(self, x: list) -> list:
method init_pretrained (line 21) | def init_pretrained(self, pretrained: str = None) -> None:
function load_dualpath_model (line 34) | def load_dualpath_model(model, model_file):
FILE: semseg/models/heads/condnet.py
class CondHead (line 7) | class CondHead(nn.Module):
method __init__ (line 8) | def __init__(self, in_channel: int = 2048, channel: int = 512, num_cla...
method forward (line 20) | def forward(self, features) -> Tensor:
FILE: semseg/models/heads/fapn.py
class DCNv2 (line 8) | class DCNv2(nn.Module):
method __init__ (line 9) | def __init__(self, c1, c2, k, s, p, g=1):
method _init_offset (line 15) | def _init_offset(self):
method forward (line 19) | def forward(self, x, offset):
class FSM (line 27) | class FSM(nn.Module):
method __init__ (line 28) | def __init__(self, c1, c2):
method forward (line 33) | def forward(self, x: Tensor) -> Tensor:
class FAM (line 40) | class FAM(nn.Module):
method __init__ (line 41) | def __init__(self, c1, c2):
method forward (line 47) | def forward(self, feat_l, feat_s):
class FaPNHead (line 59) | class FaPNHead(nn.Module):
method __init__ (line 60) | def __init__(self, in_channels, channel=128, num_classes=19):
method forward (line 73) | def forward(self, features) -> Tensor:
FILE: semseg/models/heads/fcn.py
class FCNHead (line 7) | class FCNHead(nn.Module):
method __init__ (line 8) | def __init__(self, c1, c2, num_classes: int = 19):
method forward (line 13) | def forward(self, features) -> Tensor:
FILE: semseg/models/heads/fpn.py
class FPNHead (line 7) | class FPNHead(nn.Module):
method __init__ (line 11) | def __init__(self, in_channels, channel=128, num_classes=19):
method forward (line 23) | def forward(self, features) -> Tensor:
FILE: semseg/models/heads/hem.py
class _MatrixDecomposition2DBase (line 7) | class _MatrixDecomposition2DBase(nn.Module):
method __init__ (line 8) | def __init__(self, args=dict()):
method _build_bases (line 35) | def _build_bases(self, B, S, D, R, cuda=False):
method local_step (line 38) | def local_step(self, x, bases, coef):
method local_inference (line 42) | def local_inference(self, x, bases):
method compute_coef (line 53) | def compute_coef(self, x, bases, coef):
method forward (line 56) | def forward(self, x, return_bases=False):
class NMF2D (line 99) | class NMF2D(_MatrixDecomposition2DBase):
method __init__ (line 100) | def __init__(self, args=dict()):
method _build_bases (line 105) | def _build_bases(self, B, S, D, R, cuda=False):
method local_step (line 116) | def local_step(self, x, bases, coef):
method compute_coef (line 133) | def compute_coef(self, x, bases, coef):
class Hamburger (line 144) | class Hamburger(nn.Module):
method __init__ (line 145) | def __init__(self, ham_channels=512, ham_kwargs=dict(), norm_cfg=None):
method forward (line 151) | def forward(self, x):
class LightHamHead (line 159) | class LightHamHead(nn.Module):
method __init__ (line 160) | def __init__(self, in_channels=[64, 128, 320, 512], ham_channels=512, ...
method forward (line 177) | def forward(self, inputs):
FILE: semseg/models/heads/lawin.py
class MLP (line 7) | class MLP(nn.Module):
method __init__ (line 8) | def __init__(self, dim=2048, embed_dim=768):
method forward (line 12) | def forward(self, x: Tensor) -> Tensor:
class PatchEmbed (line 18) | class PatchEmbed(nn.Module):
method __init__ (line 19) | def __init__(self, patch_size=4, in_ch=3, dim=96, type='pool') -> None:
method forward (line 35) | def forward(self, x: Tensor) -> Tensor:
class LawinAttn (line 53) | class LawinAttn(nn.Module):
method __init__ (line 54) | def __init__(self, in_ch=512, head=4, patch_size=8, reduction=2) -> None:
method forward (line 72) | def forward(self, query: Tensor, context: Tensor) -> Tensor:
class ConvModule (line 108) | class ConvModule(nn.Module):
method __init__ (line 109) | def __init__(self, c1, c2):
method forward (line 115) | def forward(self, x: Tensor) -> Tensor:
class LawinHead (line 119) | class LawinHead(nn.Module):
method __init__ (line 120) | def __init__(self, in_channels: list, embed_dim=512, num_classes=19) -...
method get_lawin_att_feats (line 144) | def get_lawin_att_feats(self, x: Tensor, patch_size: int):
method forward (line 160) | def forward(self, features):
FILE: semseg/models/heads/segformer.py
class MLP (line 7) | class MLP(nn.Module):
method __init__ (line 8) | def __init__(self, dim, embed_dim):
method forward (line 12) | def forward(self, x: Tensor) -> Tensor:
class ConvModule (line 18) | class ConvModule(nn.Module):
method __init__ (line 19) | def __init__(self, c1, c2):
method forward (line 25) | def forward(self, x: Tensor) -> Tensor:
class SegFormerHead (line 29) | class SegFormerHead(nn.Module):
method __init__ (line 30) | def __init__(self, dims: list, embed_dim: int = 256, num_classes: int ...
method forward (line 39) | def forward(self, features: Tuple[Tensor, Tensor, Tensor, Tensor]) -> ...
FILE: semseg/models/heads/sfnet.py
class AlignedModule (line 8) | class AlignedModule(nn.Module):
method __init__ (line 9) | def __init__(self, c1, c2, k=3):
method forward (line 15) | def forward(self, low_feature: Tensor, high_feature: Tensor) -> Tensor:
method flow_warp (line 25) | def flow_warp(self, x: Tensor, flow: Tensor, size: tuple) -> Tensor:
class SFHead (line 36) | class SFHead(nn.Module):
method __init__ (line 37) | def __init__(self, in_channels, channel=256, num_classes=19, scales=(1...
method forward (line 54) | def forward(self, features: list) -> Tensor:
FILE: semseg/models/heads/upernet.py
class UPerHead (line 9) | class UPerHead(nn.Module):
method __init__ (line 14) | def __init__(self, in_channels, channel=128, num_classes: int = 19, sc...
method forward (line 32) | def forward(self, features: Tuple[Tensor, Tensor, Tensor, Tensor]) -> ...
FILE: semseg/models/layers/common.py
class ConvModule (line 5) | class ConvModule(nn.Sequential):
method __init__ (line 6) | def __init__(self, c1, c2, k, s=1, p=0, d=1, g=1):
class DropPath (line 14) | class DropPath(nn.Module):
method __init__ (line 23) | def __init__(self, p: float = None):
method forward (line 27) | def forward(self, x: Tensor) -> Tensor:
FILE: semseg/models/layers/initialize.py
function _no_grad_trunc_normal_ (line 7) | def _no_grad_trunc_normal_(tensor, mean, std, a, b):
function trunc_normal_ (line 43) | def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
FILE: semseg/models/modules/crossatt.py
function exists (line 6) | def exists(val):
function default (line 9) | def default(val, d):
function stable_softmax (line 12) | def stable_softmax(t, dim = -1):
class BidirectionalCrossAttention (line 18) | class BidirectionalCrossAttention(nn.Module):
method __init__ (line 19) | def __init__(self, dim, heads=8, dim_head=64, context_dim=None, dropou...
method forward (line 45) | def forward(self, x, context, return_attn=False, rel_pos_bias=None):
FILE: semseg/models/modules/ffm.py
class ChannelWeights (line 9) | class ChannelWeights(nn.Module):
method __init__ (line 10) | def __init__(self, dim, reduction=1):
method forward (line 21) | def forward(self, x1, x2):
class SpatialWeights (line 32) | class SpatialWeights(nn.Module):
method __init__ (line 33) | def __init__(self, dim, reduction=1):
method forward (line 42) | def forward(self, x1, x2):
class FeatureRectifyModule (line 49) | class FeatureRectifyModule(nn.Module):
method __init__ (line 50) | def __init__(self, dim, reduction=1, lambda_c=.5, lambda_s=.5):
method _init_weights (line 57) | def _init_weights(self, m):
method forward (line 72) | def forward(self, x1, x2):
class CrossAttention (line 81) | class CrossAttention(nn.Module):
method __init__ (line 82) | def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None):
method forward (line 93) | def forward(self, x1, x2):
class CrossPath (line 111) | class CrossPath(nn.Module):
method __init__ (line 112) | def __init__(self, dim, reduction=1, num_heads=None, norm_layer=nn.Lay...
method forward (line 124) | def forward(self, x1, x2):
class ChannelEmbed (line 136) | class ChannelEmbed(nn.Module):
method __init__ (line 137) | def __init__(self, in_channels, out_channels, reduction=1, norm_layer=...
method forward (line 150) | def forward(self, x, H, W):
class FeatureFusionModule (line 159) | class FeatureFusionModule(nn.Module):
method __init__ (line 160) | def __init__(self, dim, reduction=1, num_heads=None, norm_layer=nn.Bat...
method _init_weights (line 166) | def _init_weights(self, m):
method forward (line 181) | def forward(self, x1, x2):
FILE: semseg/models/modules/mspa.py
class DWConv (line 9) | class DWConv(nn.Module):
method __init__ (line 10) | def __init__(self, dim=768):
method forward (line 14) | def forward(self, x):
class Mlp (line 18) | class Mlp(nn.Module):
method __init__ (line 19) | def __init__(self, in_features, hidden_features=None, out_features=Non...
method forward (line 29) | def forward(self, x):
class MSPoolAttention (line 40) | class MSPoolAttention(nn.Module):
method __init__ (line 41) | def __init__(self, dim):
method forward (line 51) | def forward(self, x):
class MSPABlock (line 60) | class MSPABlock(nn.Module):
method __init__ (line 61) | def __init__(self, dim, mlp_ratio=4., drop=0., drop_path=0., act_layer...
method forward (line 82) | def forward(self, x):
FILE: semseg/models/modules/ppm.py
class PPM (line 7) | class PPM(nn.Module):
method __init__ (line 10) | def __init__(self, c1, c2=128, scales=(1, 2, 3, 6)):
method forward (line 22) | def forward(self, x: Tensor) -> Tensor:
FILE: semseg/models/modules/psa.py
class PSAP (line 6) | class PSAP(nn.Module):
method __init__ (line 7) | def __init__(self, c1, c2):
method spatial_pool (line 18) | def spatial_pool(self, x: Tensor) -> Tensor:
method channel_pool (line 31) | def channel_pool(self, x: Tensor) -> Tensor:
method forward (line 42) | def forward(self, x: Tensor) -> Tensor:
class PSAS (line 47) | class PSAS(nn.Module):
method __init__ (line 48) | def __init__(self, c1, c2):
method spatial_pool (line 64) | def spatial_pool(self, x: Tensor) -> Tensor:
method channel_pool (line 77) | def channel_pool(self, x: Tensor) -> Tensor:
method forward (line 88) | def forward(self, x: Tensor) -> Tensor:
FILE: semseg/optimizers.py
function get_optimizer (line 5) | def get_optimizer(model: nn.Module, optimizer: str, lr: float, weight_de...
FILE: semseg/schedulers.py
class PolyLR (line 6) | class PolyLR(_LRScheduler):
method __init__ (line 7) | def __init__(self, optimizer, max_iter, decay_iter=1, power=0.9, last_...
method get_lr (line 13) | def get_lr(self):
class WarmupLR (line 21) | class WarmupLR(_LRScheduler):
method __init__ (line 22) | def __init__(self, optimizer, warmup_iter=500, warmup_ratio=5e-4, warm...
method get_lr (line 28) | def get_lr(self):
method get_lr_ratio (line 32) | def get_lr_ratio(self):
method get_main_ratio (line 35) | def get_main_ratio(self):
method get_warmup_ratio (line 38) | def get_warmup_ratio(self):
class WarmupPolyLR (line 45) | class WarmupPolyLR(WarmupLR):
method __init__ (line 46) | def __init__(self, optimizer, power, max_iter, warmup_iter=500, warmup...
method get_main_ratio (line 51) | def get_main_ratio(self):
class WarmupExpLR (line 59) | class WarmupExpLR(WarmupLR):
method __init__ (line 60) | def __init__(self, optimizer, gamma, interval=1, warmup_iter=500, warm...
method get_main_ratio (line 65) | def get_main_ratio(self):
class WarmupCosineLR (line 70) | class WarmupCosineLR(WarmupLR):
method __init__ (line 71) | def __init__(self, optimizer, max_iter, eta_ratio=0, warmup_iter=500, ...
method get_main_ratio (line 76) | def get_main_ratio(self):
function get_scheduler (line 87) | def get_scheduler(scheduler_name: str, optimizer, max_iter: int, power: ...
FILE: semseg/utils/utils.py
function fix_seeds (line 20) | def fix_seeds(seed: int = 3407) -> None:
function setup_cudnn (line 26) | def setup_cudnn() -> None:
function time_sync (line 31) | def time_sync() -> float:
function get_model_size (line 36) | def get_model_size(model: Union[nn.Module, torch.jit.ScriptModule]):
function test_model_latency (line 47) | def test_model_latency(model: nn.Module, inputs: torch.Tensor, use_cuda:...
function count_parameters (line 52) | def count_parameters(model: nn.Module) -> float:
function setup_ddp (line 55) | def setup_ddp():
function cleanup_ddp (line 78) | def cleanup_ddp():
function reduce_tensor (line 82) | def reduce_tensor(tensor: Tensor) -> Tensor:
function throughput (line 89) | def throughput(dataloader, model: nn.Module, times: int = 30):
function show_models (line 103) | def show_models():
function timer (line 110) | def timer(func):
function get_logger (line 125) | def get_logger(log_file=None):
function cal_flops (line 147) | def cal_flops(model, modals, logger):
function print_iou (line 163) | def print_iou(epoch, iou, miou, acc, macc, class_names):
function nchw_to_nlc (line 178) | def nchw_to_nlc(x):
function nlc_to_nchw (line 190) | def nlc_to_nchw(x, hw_shape):
function nlc2nchw2nlc (line 206) | def nlc2nchw2nlc(module, x, hw_shape, contiguous=False, **kwargs):
FILE: semseg/utils/visualize.py
function visualize_dataset_sample (line 13) | def visualize_dataset_sample(dataset, root, split='val', batch_size=4):
function generate_palette (line 67) | def generate_palette(num_classes, background: bool = False):
function draw_text (line 77) | def draw_text(image: torch.Tensor, seg_map: torch.Tensor, labels: list, ...
FILE: tools/infer_mm.py
class SemSeg (line 20) | class SemSeg:
method __init__ (line 21) | def __init__(self, cfg) -> None:
method postprocess (line 50) | def postprocess(self, orig_img: Tensor, seg_map: Tensor, overlay: bool...
method model_forward (line 63) | def model_forward(self, img: Tensor) -> Tensor:
method _open_img (line 66) | def _open_img(self, file):
method predict (line 75) | def predict(self, img_fname: str, overlay: bool) -> Tensor:
FILE: tools/train_mm.py
function main (line 26) | def main(cfg, gpu, save_dir):
FILE: tools/val_mm.py
function pad_image (line 24) | def pad_image(img, target_size):
function sliding_predict (line 31) | def sliding_predict(model, image, num_classes, flip=True):
function evaluate (line 65) | def evaluate(model, dataloader, device):
function evaluate_msf (line 88) | def evaluate_msf(model, dataloader, device, scales, flip):
function main (line 123) | def main(cfg):
Condensed preview — 58 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (280K chars).
[
{
"path": ".gitignore",
"chars": 4072,
"preview": "# Repo-specific GitIgnore ----------------------------------------------------------------------------------------------"
},
{
"path": "LICENSE",
"chars": 11347,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "README.md",
"chars": 17633,
"preview": "<div align=\"center\"> \n\n## Delivering Arbitrary-Modal Semantic Segmentation (CVPR 2023)\n\n</div>\n\n<p align=\"center\">\n<a hr"
},
{
"path": "configs/deliver_rgbdel.yaml",
"chars": 3490,
"preview": "DEVICE : cuda # device used for training and evaluation (cpu, cuda, cuda0, cuda1, ...)\nSAVE_DIR "
},
{
"path": "configs/kitti360_rgbdel.yaml",
"chars": 3518,
"preview": "DEVICE : cuda # device used for training and evaluation (cpu, cuda, cuda0, cuda1, ...)\nSAVE_DIR "
},
{
"path": "configs/mcubes_rgbadn.yaml",
"chars": 2787,
"preview": "DEVICE : cuda # device used for training and evaluation (cpu, cuda, cuda0, cuda1, ...)\nSAVE_DIR "
},
{
"path": "configs/mfnet_rgbt.yaml",
"chars": 2462,
"preview": "DEVICE : cuda # device used for training and evaluation (cpu, cuda, cuda0, cuda1, ...)\nSAVE_DIR "
},
{
"path": "configs/nyu_rgbd.yaml",
"chars": 2444,
"preview": "DEVICE : cuda # device used for training and evaluation (cpu, cuda, cuda0, cuda1, ...)\nSAVE_DIR "
},
{
"path": "configs/urbanlf.yaml",
"chars": 3953,
"preview": "DEVICE : cuda # device used for training and evaluation (cpu, cuda, cuda0, cuda1, ...)\nSAVE_DIR "
},
{
"path": "environment.yaml",
"chars": 4880,
"preview": "name: cmnext\nchannels:\n - pytorch\n - conda-forge\n - defaults\ndependencies:\n - _libgcc_mutex=0.1=main\n - _openmp_mut"
},
{
"path": "requirements.txt",
"chars": 2052,
"preview": "absl-py==1.2.0\naddict==2.4.0\nargon2-cffi==21.3.0\nargon2-cffi-bindings==21.2.0\nasttokens==2.0.5\nattrs==21.4.0\nbackcall==0"
},
{
"path": "semseg/__init__.py",
"chars": 1032,
"preview": "from tabulate import tabulate\nfrom semseg import models\nfrom semseg import datasets\nfrom semseg.models import backbones,"
},
{
"path": "semseg/augmentations.py",
"chars": 13156,
"preview": "import torchvision.transforms.functional as TF \nimport random\nimport math\nimport torch\nfrom torch import Tensor\nfrom typ"
},
{
"path": "semseg/augmentations_mm.py",
"chars": 14530,
"preview": "import torchvision.transforms.functional as TF \nimport random\nimport math\nimport torch\nfrom torch import Tensor\nfrom typ"
},
{
"path": "semseg/datasets/__init__.py",
"chars": 259,
"preview": "from .deliver import DELIVER\nfrom .kitti360 import KITTI360\nfrom .nyu import NYU\nfrom .mfnet import MFNet\nfrom .urbanlf "
},
{
"path": "semseg/datasets/deliver.py",
"chars": 4723,
"preview": "import os\nimport torch \nimport numpy as np\nfrom torch import Tensor\nfrom torch.utils.data import Dataset\nimport torchvis"
},
{
"path": "semseg/datasets/kitti360.py",
"chars": 4781,
"preview": "import os\nimport torch \nimport numpy as np\nfrom torch import Tensor\nfrom torch.utils.data import Dataset\nfrom torchvisio"
},
{
"path": "semseg/datasets/mcubes.py",
"chars": 17473,
"preview": "import os\nimport torch \nimport numpy as np\nfrom torch import Tensor\nfrom torch.utils.data import Dataset\nfrom torchvisio"
},
{
"path": "semseg/datasets/mfnet.py",
"chars": 3296,
"preview": "import os\nimport torch \nimport numpy as np\nfrom torch import Tensor\nfrom torch.utils.data import Dataset\nfrom torchvisio"
},
{
"path": "semseg/datasets/nyu.py",
"chars": 3762,
"preview": "import os\nimport torch \nimport numpy as np\nfrom torch import Tensor\nfrom torch.utils.data import Dataset\nimport torchvis"
},
{
"path": "semseg/datasets/unzip.py",
"chars": 268,
"preview": "import zipfile\n\nwith zipfile.ZipFile(\"data/MCubeS/multimodal_dataset.zip\", \"r\") as zip_ref:\n for name in zip_ref.name"
},
{
"path": "semseg/datasets/urbanlf.py",
"chars": 4286,
"preview": "import os\nimport torch \nimport numpy as np\nfrom torch import Tensor\nfrom torch.utils.data import Dataset\nimport torchvis"
},
{
"path": "semseg/losses.py",
"chars": 3717,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\n\n\nclass CrossEntropy(nn.Module):\n def "
},
{
"path": "semseg/metrics.py",
"chars": 1622,
"preview": "import torch\nfrom torch import Tensor\nfrom typing import Tuple\n\n\nclass Metrics:\n def __init__(self, num_classes: int,"
},
{
"path": "semseg/models/__init__.py",
"chars": 87,
"preview": "from .cmx import CMX\nfrom .cmnext import CMNeXt\n\n__all__ = [\n 'CMX',\n 'CMNeXt',\n]"
},
{
"path": "semseg/models/backbones/__init__.py",
"chars": 88,
"preview": "from .cmx import CMX\nfrom .cmnext import CMNeXt\n\n__all__ = [\n 'CMX', \n 'CMNeXt',\n]"
},
{
"path": "semseg/models/backbones/cmnext.py",
"chars": 15423,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\nfrom semseg.models.layers import DropPath"
},
{
"path": "semseg/models/backbones/cmx.py",
"chars": 14316,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\nimport einops\nfrom semseg.models.layers i"
},
{
"path": "semseg/models/base.py",
"chars": 3202,
"preview": "import torch\nimport math\nfrom torch import nn\nfrom semseg.models.backbones import *\nfrom semseg.models.layers import tru"
},
{
"path": "semseg/models/cmnext.py",
"chars": 4597,
"preview": "import torch\nfrom torch import Tensor\nfrom torch.nn import functional as F\nfrom semseg.models.base import BaseModel\nfrom"
},
{
"path": "semseg/models/cmx.py",
"chars": 2380,
"preview": "import torch\nfrom torch import Tensor\nfrom torch.nn import functional as F\nfrom semseg.models.base import BaseModel\nfrom"
},
{
"path": "semseg/models/heads/__init__.py",
"chars": 299,
"preview": "from .upernet import UPerHead\nfrom .segformer import SegFormerHead\nfrom .sfnet import SFHead\nfrom .fpn import FPNHead\nfr"
},
{
"path": "semseg/models/heads/condnet.py",
"chars": 2116,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\nfrom semseg.models.layers import ConvModu"
},
{
"path": "semseg/models/heads/fapn.py",
"chars": 3145,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\nfrom torchvision.ops import DeformConv2d\n"
},
{
"path": "semseg/models/heads/fcn.py",
"chars": 778,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\nfrom semseg.models.layers import ConvModu"
},
{
"path": "semseg/models/heads/fpn.py",
"chars": 1472,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\nfrom semseg.models.layers import ConvModu"
},
{
"path": "semseg/models/heads/hem.py",
"chars": 6956,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom mmcv.cnn import ConvModule\nfrom fvcore.nn import"
},
{
"path": "semseg/models/heads/lawin.py",
"chars": 7302,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\nfrom einops import rearrange\n\n\nclass MLP("
},
{
"path": "semseg/models/heads/segformer.py",
"chars": 1739,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom typing import Tuple\nfrom torch.nn import functional as F\n\n\nclass MLP(nn.M"
},
{
"path": "semseg/models/heads/sfnet.py",
"chars": 2967,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\nfrom semseg.models.layers import ConvModu"
},
{
"path": "semseg/models/heads/upernet.py",
"chars": 2091,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\nfrom typing import Tuple\nfrom semseg.mode"
},
{
"path": "semseg/models/layers/__init__.py",
"chars": 47,
"preview": "from .common import *\nfrom .initialize import *"
},
{
"path": "semseg/models/layers/common.py",
"chars": 1332,
"preview": "import torch\nfrom torch import nn, Tensor\n\n\nclass ConvModule(nn.Sequential):\n def __init__(self, c1, c2, k, s=1, p=0,"
},
{
"path": "semseg/models/layers/initialize.py",
"chars": 2388,
"preview": "import torch\nimport math\nimport warnings\nfrom torch import nn, Tensor\n\n\ndef _no_grad_trunc_normal_(tensor, mean, std, a,"
},
{
"path": "semseg/models/modules/__init__.py",
"chars": 83,
"preview": "from .ppm import PPM\nfrom .psa import PSAP, PSAS\n\n__all__ = ['PPM', 'PSAP', 'PSAS']"
},
{
"path": "semseg/models/modules/crossatt.py",
"chars": 3878,
"preview": "import torch\nfrom torch import nn\nfrom einops import rearrange\nfrom torch import einsum\n\ndef exists(val):\n return val"
},
{
"path": "semseg/models/modules/ffm.py",
"chars": 8019,
"preview": "import torch\nimport torch.nn as nn\n\nfrom timm.models.layers import trunc_normal_\nimport math\n\n\n# Feature Rectify Module\n"
},
{
"path": "semseg/models/modules/mspa.py",
"chars": 3986,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.autograd\nfrom timm.models.layers import "
},
{
"path": "semseg/models/modules/ppm.py",
"chars": 1043,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\nfrom semseg.models.layers import ConvModu"
},
{
"path": "semseg/models/modules/psa.py",
"chars": 6861,
"preview": "import torch\nfrom torch import nn, Tensor\nfrom torch.nn import functional as F\n\n\nclass PSAP(nn.Module):\n def __init__"
},
{
"path": "semseg/optimizers.py",
"chars": 674,
"preview": "from torch import nn\nfrom torch.optim import AdamW, SGD\n\n\ndef get_optimizer(model: nn.Module, optimizer: str, lr: float,"
},
{
"path": "semseg/schedulers.py",
"chars": 4272,
"preview": "import torch\nimport math\nfrom torch.optim.lr_scheduler import _LRScheduler\n\n\nclass PolyLR(_LRScheduler):\n def __init_"
},
{
"path": "semseg/utils/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "semseg/utils/utils.py",
"chars": 8498,
"preview": "import torch\nimport numpy as np\nimport random\nimport time\nimport os\nimport sys\nimport functools\nfrom pathlib import Path"
},
{
"path": "semseg/utils/visualize.py",
"chars": 4694,
"preview": "import torch\nimport random\nimport numpy as np\nimport torch\nimport matplotlib.pyplot as plt\nfrom torch.utils.data import "
},
{
"path": "tools/infer_mm.py",
"chars": 6300,
"preview": "import torch\nimport argparse\nimport yaml\nimport math\nfrom torch import Tensor\nfrom torch.nn import functional as F\nfrom "
},
{
"path": "tools/train_mm.py",
"chars": 8840,
"preview": "import os\nimport torch \nimport argparse\nimport yaml\nimport time\nimport multiprocessing as mp\nfrom tabulate import tabula"
},
{
"path": "tools/val_mm.py",
"chars": 7499,
"preview": "import torch\nimport argparse\nimport yaml\nimport math\nimport os\nimport time\nfrom pathlib import Path\nfrom tqdm import tqd"
}
]
About this extraction
This page contains the full source code of the jamycheung/DELIVER GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 58 files (262.6 KB), approximately 79.7k tokens, and a symbol index with 463 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.