Repository: z1069614715/objectdetection_script
Branch: master
Commit: 02ba8c6fb2ad
Files: 351
Total size: 2.2 MB
Directory structure:
gitextract_1c2iago4/
├── .gitignore
├── Ultralytics-YOLO-project.md
├── bilibili-guide.md
├── cv-attention/
│ ├── A2Attention.py
│ ├── BAM.py
│ ├── Biformer.py
│ ├── CAA.py
│ ├── CBAM.py
│ ├── CPCA.py
│ ├── CloAttention.py
│ ├── CoTAttention.py
│ ├── CoordAttention.py
│ ├── DAttention.py
│ ├── ECA.py
│ ├── ELA.py
│ ├── EMA.py
│ ├── EffectiveSE.py
│ ├── GAM.py
│ ├── GC.py
│ ├── GE.py
│ ├── LSKA.py
│ ├── LSKBlock.py
│ ├── MHSA.py
│ ├── MLCA.py
│ ├── MobileViTAttention.py
│ ├── ParNetAttention.py
│ ├── PolarizedSelfAttention.py
│ ├── S2Attention.py
│ ├── SE.py
│ ├── SGE.py
│ ├── SK.py
│ ├── SequentialSelfAttention.py
│ ├── ShuffleAttention.py
│ ├── SimAM.py
│ ├── TripletAttention.py
│ └── readme.md
├── cvpr2025-deim-project.md
├── damo-yolo/
│ ├── Annotations/
│ │ └── ReadMe.md
│ ├── JPEGImages/
│ │ └── ReadMe.md
│ ├── readme.md
│ └── voc2coco.py
├── data-offline-aug/
│ ├── object_detection_data_aug.py
│ ├── readme.md
│ └── segment_data_aug.py
├── mmdet-course/
│ ├── config/
│ │ ├── atss_r50_fpn_dyhead_1x_visdrone.py
│ │ ├── cascade-rcnn_r50_fpn_1x_visdrone.py
│ │ ├── ddq-detr-4scale_r50_8xb2-12e_visdrone.py
│ │ ├── dino-4scale_r50_8xb2-12e_visdrone.py
│ │ ├── faster-rcnn_r50_fpn_ciou_1x_visdrone.py
│ │ ├── gfl_r50_fpn_1x_visdrone.py
│ │ ├── retinanet_r50_fpn_1x_visdrone.py
│ │ ├── rtmdet_tiny_8xb32-300e_visdrone.py
│ │ ├── tood_r50_fpn_1x_visdrone.py
│ │ └── yolox_tiny_8xb8-300e_visdrone.py
│ ├── mmdet2yolo.py
│ ├── readme.md
│ └── yolo2coco.py
├── module-info/
│ ├── CVPR2023-SMPConv.md
│ ├── CVPR2024-DCMPNet.md
│ ├── CVPR2024-FADC.md
│ ├── CVPR2024-PKINet.md
│ ├── CVPR2024-ParameterNet.md
│ ├── CVPR2024-RMT.md
│ ├── CVPR2024-RepVIT.md
│ ├── CVPR2024-Rewrite the Stars.md
│ ├── CVPR2024-SFSConv.md
│ ├── CVPR2024-TransNext.md
│ ├── CVPR2024-UniRepLKNet.md
│ ├── CVPR2025-BHViT.md
│ ├── CVPR2025-DarkIR.md
│ ├── CVPR2025-EVSSM.md
│ ├── CVPR2025-EfficientViM.md
│ ├── CVPR2025-FDConv.md
│ ├── CVPR2025-GroupMamba.md
│ ├── CVPR2025-LSNet.md
│ ├── CVPR2025-MambaIRV2.md
│ ├── CVPR2025-MambaOut.md
│ ├── CVPR2025-MambaVision.md
│ ├── CVPR2025-MobileMamba.md
│ ├── CVPR2025-Mona.md
│ ├── CVPR2025-OverLoCK.md
│ ├── CVPR2025-SCSegamba.md
│ ├── CVPR2025-Transformers without Normalization.md
│ ├── CVPR2025-vHeat.md
│ ├── ICLR2025-Pola.md
│ ├── ICLR2025-ToST.md
│ └── TPAMI2025-HyperYOLO.md
├── mutilmodel-project.md
├── objectdetection-tricks/
│ ├── readme.md
│ ├── tricks_1.py
│ ├── tricks_10.py
│ ├── tricks_11.py
│ ├── tricks_12.py
│ ├── tricks_13.py
│ ├── tricks_14.py
│ ├── tricks_15.py
│ ├── tricks_16.py
│ ├── tricks_2.py
│ ├── tricks_3.py
│ ├── tricks_4.py
│ ├── tricks_5.py
│ ├── tricks_6.py
│ ├── tricks_7.py
│ ├── tricks_8.py
│ └── tricks_9.py
├── readme.md
├── visdrone2019-benchmark/
│ └── readme.md
├── yolo/
│ ├── data.yaml
│ ├── dataset/
│ │ ├── VOCdevkit/
│ │ │ ├── Annotations/
│ │ │ │ └── ReadMe.md
│ │ │ ├── JPEGImages/
│ │ │ │ └── ReadMe.md
│ │ │ └── txt/
│ │ │ └── ReadMe.md
│ │ ├── split_data.py
│ │ └── xml2txt.py
│ └── readme.md
├── yolo-gradcam/
│ ├── README.md
│ ├── yolov11_heatmap.py
│ ├── yolov5_heatmap.py
│ ├── yolov7_heatmap.py
│ ├── yolov8_heatmap.py
│ └── yolov9_heatmap.py
└── yolo-improve/
├── CAM.py
├── iou.py
├── paper.md
├── readme.md
├── rtdetr-compress.md
├── rtdetr-distill.md
├── rtdetr-project.md
├── ultralytics-yolo/
│ ├── get_COCO_metrice.py
│ ├── heatmap.py
│ ├── requirements.txt
│ ├── train.py
│ ├── val.py
│ └── yolo2coco.py
├── yolov11-project.md
├── yolov5-AIFI.py
├── yolov5-AUX/
│ ├── benchmarks.py
│ ├── data/
│ │ ├── Argoverse.yaml
│ │ ├── GlobalWheat2020.yaml
│ │ ├── ImageNet.yaml
│ │ ├── Objects365.yaml
│ │ ├── SKU-110K.yaml
│ │ ├── VOC.yaml
│ │ ├── VisDrone.yaml
│ │ ├── coco.yaml
│ │ ├── coco128-seg.yaml
│ │ ├── coco128.yaml
│ │ ├── hyps/
│ │ │ ├── hyp.Objects365.yaml
│ │ │ ├── hyp.VOC.yaml
│ │ │ ├── hyp.no-augmentation.yaml
│ │ │ ├── hyp.scratch-high.yaml
│ │ │ ├── hyp.scratch-low.yaml
│ │ │ └── hyp.scratch-med.yaml
│ │ ├── scripts/
│ │ │ ├── download_weights.sh
│ │ │ ├── get_coco.sh
│ │ │ ├── get_coco128.sh
│ │ │ └── get_imagenet.sh
│ │ └── xView.yaml
│ ├── detect.py
│ ├── export.py
│ ├── hubconf.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── common.py
│ │ ├── experimental.py
│ │ ├── hub/
│ │ │ ├── anchors.yaml
│ │ │ ├── yolov3-spp.yaml
│ │ │ ├── yolov3-tiny.yaml
│ │ │ ├── yolov3.yaml
│ │ │ ├── yolov5-bifpn.yaml
│ │ │ ├── yolov5-fpn.yaml
│ │ │ ├── yolov5-p2.yaml
│ │ │ ├── yolov5-p34.yaml
│ │ │ ├── yolov5-p6.yaml
│ │ │ ├── yolov5-p7.yaml
│ │ │ ├── yolov5-panet.yaml
│ │ │ ├── yolov5l6.yaml
│ │ │ ├── yolov5m6.yaml
│ │ │ ├── yolov5n6.yaml
│ │ │ ├── yolov5s-LeakyReLU.yaml
│ │ │ ├── yolov5s-ghost.yaml
│ │ │ ├── yolov5s-transformer.yaml
│ │ │ ├── yolov5s6.yaml
│ │ │ └── yolov5x6.yaml
│ │ ├── segment/
│ │ │ ├── yolov5l-seg.yaml
│ │ │ ├── yolov5m-seg.yaml
│ │ │ ├── yolov5n-seg.yaml
│ │ │ ├── yolov5s-seg.yaml
│ │ │ └── yolov5x-seg.yaml
│ │ ├── tf.py
│ │ ├── yolo.py
│ │ ├── yolov5_aux.yaml
│ │ ├── yolov5l.yaml
│ │ ├── yolov5m.yaml
│ │ ├── yolov5n.yaml
│ │ ├── yolov5s.yaml
│ │ └── yolov5x.yaml
│ ├── train.py
│ ├── utils/
│ │ ├── __init__.py
│ │ ├── activations.py
│ │ ├── augmentations.py
│ │ ├── autoanchor.py
│ │ ├── autobatch.py
│ │ ├── aws/
│ │ │ ├── __init__.py
│ │ │ ├── mime.sh
│ │ │ ├── resume.py
│ │ │ └── userdata.sh
│ │ ├── callbacks.py
│ │ ├── dataloaders.py
│ │ ├── docker/
│ │ │ ├── Dockerfile
│ │ │ ├── Dockerfile-arm64
│ │ │ └── Dockerfile-cpu
│ │ ├── downloads.py
│ │ ├── flask_rest_api/
│ │ │ ├── README.md
│ │ │ ├── example_request.py
│ │ │ └── restapi.py
│ │ ├── general.py
│ │ ├── google_app_engine/
│ │ │ ├── Dockerfile
│ │ │ ├── additional_requirements.txt
│ │ │ └── app.yaml
│ │ ├── loggers/
│ │ │ ├── __init__.py
│ │ │ ├── clearml/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── clearml_utils.py
│ │ │ │ └── hpo.py
│ │ │ └── comet/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── comet_utils.py
│ │ │ ├── hpo.py
│ │ │ └── optimizer_config.json
│ │ ├── loss.py
│ │ ├── metrics.py
│ │ ├── plots.py
│ │ ├── segment/
│ │ │ ├── __init__.py
│ │ │ ├── augmentations.py
│ │ │ ├── dataloaders.py
│ │ │ ├── general.py
│ │ │ ├── loss.py
│ │ │ ├── metrics.py
│ │ │ └── plots.py
│ │ ├── torch_utils.py
│ │ └── triton.py
│ └── val.py
├── yolov5-C3RFEM.py
├── yolov5-CARAFE.py
├── yolov5-CCFM.py
├── yolov5-ContextAggregation.py
├── yolov5-CoordConv.py
├── yolov5-DBB.py
├── yolov5-DCN.py
├── yolov5-DCNV3/
│ ├── commod.py
│ └── ops_dcnv3/
│ ├── functions/
│ │ ├── __init__.py
│ │ └── dcnv3_func.py
│ ├── make.sh
│ ├── modules/
│ │ ├── __init__.py
│ │ └── dcnv3.py
│ ├── setup.py
│ ├── src/
│ │ ├── cpu/
│ │ │ ├── dcnv3_cpu.cpp
│ │ │ └── dcnv3_cpu.h
│ │ ├── cuda/
│ │ │ ├── dcnv3_cuda.cu
│ │ │ ├── dcnv3_cuda.h
│ │ │ └── dcnv3_im2col_cuda.cuh
│ │ ├── dcnv3.h
│ │ └── vision.cpp
│ └── test.py
├── yolov5-DSConv.py
├── yolov5-DecoupledHead.py
├── yolov5-DySnakeConv.py
├── yolov5-EVC.py
├── yolov5-FasterBlock.py
├── yolov5-GFPN/
│ ├── extra_modules.py
│ └── yolov5_GFPN.yaml
├── yolov5-GOLDYOLO/
│ ├── common.py
│ ├── yolo.py
│ ├── yolov5n-goldyolo.yaml
│ ├── yolov7-goldyolo.yaml
│ └── yolov7-tiny-goldyolo.yaml
├── yolov5-NWD.py
├── yolov5-OTA/
│ └── loss.py
├── yolov5-RepNCSPELAN.py
├── yolov5-SAConv.py
├── yolov5-TSCODE.py
├── yolov5-aLRPLoss.py
├── yolov5-asf.py
├── yolov5-backbone/
│ ├── CVPR2023-EfficientViT/
│ │ └── EfficientViT.py
│ ├── CVPR2024-StarNet/
│ │ └── starnet.py
│ ├── ConvNextV2/
│ │ └── convnextv2.py
│ ├── EMO/
│ │ └── emo.py
│ ├── EfficientFormerV2/
│ │ └── EfficientFormerV2.py
│ ├── EfficientViT/
│ │ └── efficientViT.py
│ ├── FocalNet/
│ │ └── FocalNet.py
│ ├── LSKNet/
│ │ └── lsknet.py
│ ├── MobileNetV4/
│ │ └── mobilenetv4.py
│ ├── NextViT/
│ │ └── NextViT.py
│ ├── ODConv/
│ │ ├── od_mobilenetv2.py
│ │ ├── od_resnet.py
│ │ └── odconv.py
│ ├── ODConvFuse/
│ │ ├── od_mobilenetv2.py
│ │ ├── od_resnet.py
│ │ └── odconv.py
│ ├── PoolFormer/
│ │ └── poolformer.py
│ ├── RIFormer/
│ │ └── RIFormer.py
│ ├── RepViT/
│ │ └── repvit.py
│ ├── SwinTransformer/
│ │ └── SwinTransformer.py
│ ├── UniRepLKNet/
│ │ └── unireplknet.py
│ ├── VanillaNet/
│ │ └── VanillaNet.py
│ ├── fasternet/
│ │ ├── faster_cfg/
│ │ │ ├── fasternet_l.yaml
│ │ │ ├── fasternet_m.yaml
│ │ │ ├── fasternet_s.yaml
│ │ │ ├── fasternet_t0.yaml
│ │ │ ├── fasternet_t1.yaml
│ │ │ └── fasternet_t2.yaml
│ │ └── fasternet.py
│ ├── inceptionnext/
│ │ └── inceptionnext.py
│ ├── main.py
│ ├── yolo.py
│ └── yolov5-custom.yaml
├── yolov5-dyhead.py
├── yolov5-res2block.py
├── yolov5-softnms.py
├── yolov5v7-light.md
├── yolov7-CoordConv.py
├── yolov7-DBB.py
├── yolov7-DCN.py
├── yolov7-DCNV3.py
├── yolov7-DSConv.py
├── yolov7-DecoupledHead.py
├── yolov7-DySnakeConv.py
├── yolov7-EVC.py
├── yolov7-MPDiou.py
├── yolov7-NWD.py
├── yolov7-PConv.py
├── yolov7-RFEM.py
├── yolov7-RepNCSPELAN.py
├── yolov7-SAConv.py
├── yolov7-asf.py
├── yolov7-head/
│ ├── yolov7-tiny-5-heads.yaml
│ ├── yolov7-tiny-P2.yaml
│ └── yolov7-tiny-P6.yaml
├── yolov7-iou.py
├── yolov7-odconv.py
├── yolov7-slimneck.py
├── yolov7-softnms.py
├── yolov8-DCN.py
├── yolov8-compress.md
├── yolov8-distill.md
├── yolov8-erf.py
├── yolov8-objectcount.py
├── yolov8-track.py
├── yolov8.py
├── yolov8v10-project.md
└── yolov9-backbone/
├── yolo.py
└── yolov9-c-custom.yaml
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# Profiling
*.pclprof
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
.idea
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# VSCode project settings
.vscode/
# Rope project settings
.ropeproject
# mkdocs documentation
/site
mkdocs_github_authors.yaml
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# datasets and projects
datasets/
runs/
wandb/
tests/
logs/
.DS_Store
# Neural Network weights -----------------------------------------------------------------------------------------------
weights/
*.weights
*.pt
*.pb
*.onnx
*.engine
*.mlmodel
*.mlpackage
*.torchscript
*.tflite
*.h5
*_saved_model/
*_web_model/
*_openvino_model/
*_paddle_model/
pnnx*
# Autogenerated files for tests
/ultralytics/assets/
# dataset cache
*.cache
================================================
FILE: Ultralytics-YOLO-project.md
================================================
# Ultralytics-YOLO项目详细说明
1. 本项目集成了YOLOv8、v10、v11、v12乃至前沿的YOLO26等全系列基础模型。 无论是做横向对比实验,还是纵向的版本改进,无需到处找资源,一个项目就能满足你所有的实验需求!
2. 核心代码已实现高度模块化与解耦,专为新手优化。 你完全不需要死磕底层复杂代码,只需像搭积木一样简单修改YAML配置文件,就能轻松实现各种改进模块的自由组合。
3. 面对日益内卷的YOLO赛道,简单的“缝合”已难满足毕业要求。 本项目不仅提供现成的创新方案,更配套独家“二次创新”课程,授人以渔。我们将手把手教你掌握模块设计的底层逻辑,助你从“模仿者”进阶为“创造者”,设计出独属于你的创新模块。
4. 针对有代码基础但受困于Ultralytics复杂架构的同学, 本项目引入了来自DFine、DEIM项目中成熟的“万物皆可融”架构思想。你无需纠结模块注册等信息,只需遵循我所提供的标准接口规范,即可将自定义魔改模块无缝融入YAML配置,与各类CSP变种灵活结合。
5. 实验跑通了,却不知道如何写创新点? 本项目将定期拆解高分论文,传授写作心法,教你如何将实验成果转化为逻辑严密、亮点突出的高质量学术论文,解决写作难题!
6. 毕业设计缺少高大上的展示界面? 别担心,项目会内置基于PyQt或HTML的通用可视化界面,开箱即用,完美补齐毕业论文的最后一块拼图,助你从容应对答辩!
7. 购买即享专属技术交流群, 这里有业内公认的高效答疑服务,以及志同道合的伙伴互助交流。拒绝闭门造车,让我们带你避开深坑,高效通关!
## 对于已经入手了yolov8/yolo11项目的同学,如果你有以下几点需求,可以考虑追加入手!
1. 想用最新的YOLO26做实验!而且本项目支持v8、v10、11、12、26全系列版本!
2. 想深入学习改进创新的同学,本项目会附带二次创新的通用教程,手把手教你设计出属于自己的创新模块!
3. 做完实验不知道怎么写论文?本项目会定期拆解高分论文案例,教你如何把实验结果写成逻辑清晰、亮点突出的高质量学术论文
4. 想自己魔改模块的同学!本项目提供超级简单的模块注册方式,只需按照教程操作,就能轻松注册自己的模块,还能和各种CSP变种随意组合!
## 模块列表(这些模块均已在代码中注册好,只需要修改yaml可以直接实验)
- ultralytics/nn/extra_modules/attention
1. ultralytics/nn/extra_modules/attention/SEAM.py
2. CVPR2021|ultralytics/nn/extra_modules/attention/ca.py
3. ICASSP2023|ultralytics/nn/extra_modules/attention/ema.py
4. ICML2021|ultralytics/nn/extra_modules/attention/simam.py
5. ICCV2023|ultralytics/nn/extra_modules/attention/lsk.py
6. WACV2024|ultralytics/nn/extra_modules/attention/DeformableLKA.py
7. ultralytics/nn/extra_modules/attention/mlca.py
8. BIBM2024|ultralytics/nn/extra_modules/attention/FSA.py
9. AAAI2025|ultralytics/nn/extra_modules/attention/CDFA.py
10. TGRS2025|ultralytics/nn/extra_modules/attention/MCA.py
11. CVPR2025|ultralytics/nn/extra_modules/attention/CASAB.py
12. NN2025|ultralytics/nn/extra_modules/attention/KSFA.py
13. TPAMI2025|ultralytics/nn/extra_modules/attention/GQL.py
14. TGRS2025|ultralytics/nn/extra_modules/attention/ACA.py
15. TGRS2025|ultralytics/nn/extra_modules/attention/DHPF.py
16. TGRS2025|ultralytics/nn/extra_modules/attention/ACAB.py
- ultralytics/nn/extra_modules/conv_module(此部分内容教程可以看GuideVideo-MG.md中的改进模块-使用教程的第五节)
1. CVPR2021|ultralytics/nn/extra_modules/conv_module/dbb.py
2. TIP2024|ultralytics/nn/extra_modules/conv_module/deconv.py
3. ICCV2023|ultralytics/nn/extra_modules/conv_module/dynamic_snake_conv.py
4. CVPR2023|ultralytics/nn/extra_modules/conv_module/pconv.py
5. AAAI2025|ultralytics/nn/extra_modules/conv_module/psconv.py
6. CVPR2025|ultralytics/nn/extra_modules/conv_module/ShiftwiseConv.py
7. ultralytics/nn/extra_modules/conv_module/wdbb.py
8. ultralytics/nn/extra_modules/conv_module/deepdbb.py
9. ECCV2024|ultralytics/nn/extra_modules/conv_module/wtconv2d.py
10. CVPR2023|ultralytics/nn/extra_modules/conv_module/ScConv.py
11. ultralytics/nn/extra_modules/conv_module/dcnv2.py
12. CVPR2024|ultralytics/nn/extra_modules/conv_module/DilatedReparamConv.py
13. ultralytics/nn/extra_modules/conv_module/gConv.py
14. CVPR2024|ultralytics/nn/extra_modules/conv_module/IDWC.py
15. ultralytics/nn/extra_modules/conv_module/DSA.py
16. CVPR2025|ultralytics/nn/extra_modules/conv_module/FDConv.py
17. CVPR2023|ultralytics/nn/extra_modules/conv_module/dcnv3.py
18. CVPR2024|ultralytics/nn/extra_modules/conv_module/dcnv4.py
19. CVPR2024|ultralytics/nn/extra_modules/conv_module/DynamicConv.py
20. CVPR2024|ultralytics/nn/extra_modules/conv_module/FADC.py
21. CVPR2023|ultralytics/nn/extra_modules/conv_module/SMPConv.py
22. MIA2025|ultralytics/nn/extra_modules/conv_module/FourierConv.py
23. CVPR2024|ultralytics/nn/extra_modules/conv_module/SFSConv.py
24. ICCV2025|ultralytics/nn/extra_modules/conv_module/MBRConv.py
25. ICCV2025|ultralytics/nn/extra_modules/conv_module/ConvAttn.py
26. ICCV2025|ultralytics/nn/extra_modules/conv_module/Converse2D.py
27. CVPR2025|ultralytics/nn/extra_modules/conv_module/gcconv.py
28. ACCV2024|ultralytics/nn/extra_modules/conv_module/RMBC.py
29. CVPR2026|ultralytics/nn/extra_modules/conv_module/DEGConv.py
- ultralytics/nn/extra_modules/stem
1. ultralytics/nn/extra_modules/stem/SRFD.py
2. ultralytics/nn/extra_modules/stem/LoG.py
3. ICCV2023|ultralytics/nn/extra_modules/stem/RepStem.py
- ultralytics/nn/extra_modules/upsample
1. CVPR2024|ultralytics/nn/extra_modules/upsample/eucb.py
2. CVPR2024|ultralytics/nn/extra_modules/upsample/eucb_sc.py
3. ultralytics/nn/extra_modules/upsample/WaveletUnPool.py
4. ICCV2019|ultralytics/nn/extra_modules/upsample/CARAFE.py
5. ICCV2023|ultralytics/nn/extra_modules/upsample/DySample.py
6. ICCV2025|ultralytics/nn/extra_modules/upsample/Converse2D_Up.py
7. CVPR2025|ultralytics/nn/extra_modules/upsample/DSUB.py
- ultralytics/nn/extra_modules/downsample
1. TIP2020|ultralytics/nn/extra_modules/downsample/gcnet.py
2. 自研模块|ultralytics/nn/extra_modules/downsample/lawds.py
3. ultralytics/nn/extra_modules/downsample/WaveletPool.py
4. ultralytics/nn/extra_modules/downsample/ADown.py
5. ultralytics/nn/extra_modules/downsample/YOLOV7Down.py
6. ultralytics/nn/extra_modules/downsample/SPDConv.py
7. ultralytics/nn/extra_modules/downsample/HWD.py
8. ultralytics/nn/extra_modules/downsample/DRFD.py
9. TGRS2025|ultralytics/nn/extra_modules/conv_module/FSConv.py
- ultralytics/nn/extra_modules/module
1. AAAI2025|ultralytics/nn/extra_modules/module/APBottleneck.py
2. CVPR2025|ultralytics/nn/extra_modules/module/efficientVIM.py
3. CVPR2023|ultralytics/nn/extra_modules/module/fasterblock.py
4. CVPR2024|ultralytics/nn/extra_modules/module/starblock.py
5. ultralytics/nn/extra_modules/module/DWR.py
6. CVPR2024|ultralytics/nn/extra_modules/module/UniRepLKBlock.py
7. CVPR2025|ultralytics/nn/extra_modules/module/mambaout.py
8. AAAI2024|ultralytics/nn/extra_modules/module/DynamicFilter.py
9. ultralytics/nn/extra_modules/module/StripBlock.py
10. TGRS2024|ultralytics/nn/extra_modules/module/elgca.py
11. CVPR2024|ultralytics/nn/extra_modules/module/LEGM.py
12. ICCV2023|ultralytics/nn/extra_modules/module/iRMB.py
13. TPAMI2025|ultralytics/nn/extra_modules/module/MSBlock.py
14. ICLR2024|ultralytics/nn/extra_modules/module/FATBlock.py
15. CVPR2024|ultralytics/nn/extra_modules/module/MSCB.py
16. ultralytics/nn/extra_modules/module/LEGBlock.py
17. ultralytics/nn/extra_modules/module/GLSA.py
18. CVPR2025|ultralytics/nn/extra_modules/module/RCB.py
19. ECCV2024|ultralytics/nn/extra_modules/module/JDPM.py
20. CVPR2025|ultralytics/nn/extra_modules/module/vHeat.py
21. CVPR2025|ultralytics/nn/extra_modules/module/EBlock.py
22. CVPR2025|ultralytics/nn/extra_modules/module/DBlock.py
23. ECCV2024|ultralytics/nn/extra_modules/module/FMB.py
24. CVPR2024|ultralytics/nn/extra_modules/module/IDWB.py
25. ECCV2022|ultralytics/nn/extra_modules/module/LFE.py
26. AAAI2025|ultralytics/nn/extra_modules/module/FCM.py
27. CVPR2024|ultralytics/nn/extra_modules/module/RepViTBlock.py
28. CVPR2024|ultralytics/nn/extra_modules/module/PKIModule.py
29. CVPR2024|ultralytics/nn/extra_modules/module/camixer.py
30. ICCV2025|ultralytics/nn/extra_modules/module/ESC.py
31. CVPR2025|ultralytics/nn/extra_modules/module/nnWNet.py
32. TGRS2025|ultralytics/nn/extra_modules/module/ARF.py
33. AAAI2024|ultralytics/nn/extra_modules/module/CFBlock.py
34. IJCV2024|ultralytics/nn/extra_modules/module/FMA.py
35. ultralytics/nn/extra_modules/module/LWGA.py
36. TGRS2025|ultralytics/nn/extra_modules/module/CSSC.py
37. TGRS2025|ultralytics/nn/extra_modules/module/CNCM.py
38. ICCV2025|ultralytics/nn/extra_modules/module/HFRB.py
39. ICIP2025|ultralytics/nn/extra_modules/module/EVA.py
40. CVPR2025|ultralytics/nn/extra_modules/module/IEL.py
41. MICCAI2023|ultralytics/nn/extra_modules/module/MFEBlock.py
42. AAAI2026|ultralytics/nn/extra_modules/module/PartialNetBlock.py
43. TGRS2025|ultralytics/nn/extra_modules/module/DRG.py
44. ultralytics/nn/extra_modules/module/Wave2D.py
45. TGRS2025|ultralytics/nn/extra_modules/module/GLGM.py
46. TGRS2025|ultralytics/nn/extra_modules/module/MAC.py
47. AAAI2026|ultralytics/nn/extra_modules/module/SPJFB.py
- ultralytics/nn/extra_modules/block
1. ultralytics/nn/extra_modules/block/CSPBlock.py
2. TPAMI2025|ultralytics/nn/extra_modules/block/MANet.py
3. TPAMI2024|ultralytics/nn/extra_modules/block/MetaFormer.py
- ultralytics/nn/extra_modules/transformer
1. ICLR2025|ultralytics/nn/extra_modules/transformer/PolaLinearAttention.py
2. CVPR2023|ultralytics/nn/extra_modules/transformer/biformer.py
3. CVPR2023|ultralytics/nn/extra_modules/transformer/CascadedGroupAttention.py
4. CVPR2022|ultralytics/nn/extra_modules/transformer/DAttention.py
5. ICLR2022|ultralytics/nn/extra_modules/transformer/DPBAttention.py
6. CVPR2024|ultralytics/nn/extra_modules/transformer/AdaptiveSparseSA.py
7. ultralytics/nn/extra_modules/transformer/GSA.py
8. ultralytics/nn/extra_modules/transformer/RSA.py
9. ECCV2024|ultralytics/nn/extra_modules/transformer/FSSA.py
10. AAAI2025|ultralytics/nn/extra_modules/transformer/DilatedGCSA.py
11. AAAI2025|ultralytics/nn/extra_modules/transformer/DilatedMWSA.py
12. CVPR2024|ultralytics/nn/extra_modules/transformer/SHSA.py
13. IJCAI2024|ultralytics/nn/extra_modules/transformer/CTA.py
14. IJCAI2024|ultralytics/nn/extra_modules/transformer/SFA.py
15. ultralytics/nn/extra_modules/transformer/MSLA.py
16. ACMMM2025|ultralytics/nn/extra_modules/transformer/CPIA_SA.py
17. NN2025|ultralytics/nn/extra_modules/transformer/TokenSelectAttention.py
18. CVPR2025|ultralytics/nn/extra_modules/transformer/TAB.py
19. TPAMI2025|ultralytics/nn/extra_modules/transformer/LRSA.py
20. ICCV2025|ultralytics/nn/extra_modules/transformer/MALA.py
21. ICML2023|ultralytics/nn/extra_modules/transformer/MUA.py
22. ACMMM2025|ultralytics/nn/extra_modules/transformer/EGSA.py
23. ACMMM2025|ultralytics/nn/extra_modules/transformer/SWSA.py
24. AAAI2026|ultralytics/nn/extra_modules/transformer/DHOGSA.py
25. NeurIPS2025|ultralytics/nn/extra_modules/transformer/CBSA.py
26. TGRS2025|ultralytics/nn/extra_modules/transformer/DPWA.py
27. TIP2025|ultralytics/nn/extra_modules/transformer/DWM_MSA.py
28. CVPR2026|ultralytics/nn/extra_modules/transformer/BinaryAttention.py
29. CVPR2025|ultralytics/nn/extra_modules/transformer/wca.py
- ultralytics/nn/extra_modules/mamba
1. AAAI2025|ultralytics/nn/extra_modules/mamba/SS2D.py
2. CVPR2025|ultralytics/nn/extra_modules/mamba/ASSM.py
3. CVPR2025|ultralytics/nn/extra_modules/mamba/SAVSS.py
4. CVPR2025|ultralytics/nn/extra_modules/mamba/MobileMamba/mobilemamba.py
5. CVPR2025|ultralytics/nn/extra_modules/mamba/MaIR.py
6. TGRS2025|ultralytics/nn/extra_modules/mamba/GLVSS.py
7. ICCV2025|ultralytics/nn/extra_modules/mamba/VSSD.py
8. ICCV2025|ultralytics/nn/extra_modules/mamba/TinyViM.py
9. INFFUS2025|ultralytics/nn/extra_modules/mamba/CSI.py
10. TIP2025|ultralytics/nn/extra_modules/mamba/SFMB.py
11. TGRS2025|ultralytics/nn/extra_modules/mamba/GLSS.py
12. TGRS2025|ultralytics/nn/extra_modules/mamba/GLSS2D.py
13. CVPR2026|ultralytics/nn/extra_modules/mamba/TransMixer.py
- ultralytics/nn/extra_modules/mlp
1. CVPR2024|ultralytics/nn/extra_modules/mlp/ConvolutionalGLU.py
2. IJCAI2024|ultralytics/nn/extra_modules/mlp/DFFN.py
3. ICLR2024|ultralytics/nn/extra_modules/mlp/FMFFN.py
4. CVPR2024|ultralytics/nn/extra_modules/mlp/FRFN.py
5. ECCV2024|ultralytics/nn/extra_modules/mlp/EFFN.py
6. WACV2025|ultralytics/nn/extra_modules/mlp/SEFN.py
7. ICLR2025|ultralytics/nn/extra_modules/mlp/KAN.py
8. CVPR2025|ultralytics/nn/extra_modules/mlp/EDFFN.py
9. IJCV2024|ultralytics/nn/extra_modules/mlp/DML.py
10. AAAI2026|ultralytics/nn/extra_modules/mlp/DIFF.py
- ultralytics/nn/extra_modules/neck
1. ultralytics/nn/extra_modules/neck/ASF.py
2. ultralytics/nn/extra_modules/neck/BiFPN.py
3. AAAI2022|ultralytics/nn/extra_modules/neck/CTrans.py
4. ultralytics/nn/extra_modules/neck/EfficientRepBiPAN.py
5. ultralytics/nn/extra_modules/neck/GFPN.py
6. ultralytics/nn/extra_modules/neck/HSFPN.py
7. AAAI2025|ultralytics/nn/extra_modules/neck/HS_FPN.py
8. TPAMI2025|ultralytics/nn/extra_modules/neck/HyperComputeModule.py
9. ultralytics/nn/extra_modules/neck/SlimNeck.py
10. ultralytics/nn/extra_modules/neck/GoldYOLO.py
11. ultralytics/nn/extra_modules/neck/EMBSFPN.py
- ultralytics/nn/extra_modules/featurefusion
1. 自研模块|ultralytics/nn/extra_modules/featurefusion/cgfm.py
2. BMVC2024|ultralytics/nn/extra_modules/featurefusion/msga.py
3. CVPR2024|ultralytics/nn/extra_modules/featurefusion/mfm.py
4. TIP2023|ultralytics/nn/extra_modules/featurefusion/CSFCN.py
5. BIBM2024|ultralytics/nn/extra_modules/featurefusion/mpca.py
6. ACMMM2024|ultralytics/nn/extra_modules/featurefusion/wfu.py
7. CVPR2025|ultralytics/nn/extra_modules/featurefusion/GDSAFusion.py
8. ultralytics/nn/extra_modules/featurefusion/PST.py
9. TGRS2025|ultralytics/nn/extra_modules/featurefusion/MSAM.py
10. INFFUS2025|ultralytics/nn/extra_modules/featurefusion/DPCF.py
11. CVPR2025|ultralytics/nn/extra_modules/featurefusion/LCA.py
12. TGRS2025|ultralytics/nn/extra_modules/featurefusion/HFFE.py
13. TGRS2025|ultralytics/nn/extra_modules/featurefusion/MFPM.py
14. TGRS2025|ultralytics/nn/extra_modules/featurefusion/ERM.py
15. TIP2025|ultralytics/nn/extra_modules/featurefusion/CAFM.py
16. TIP2024|ultralytics/nn/extra_modules/featurefusion/CGAFusion.py
17. IF2023|ultralytics/nn/extra_modules/featurefusion/PSFM.py
18. IF2023|ultralytics/nn/extra_modules/featurefusion/SDFM.py
19. 自研模块|ultralytics/nn/extra_modules/featurefusion/DAF.py
20. 自研模块|ultralytics/nn/extra_modules/featurefusion/CIDAF.py
21. 自研模块|ultralytics/nn/extra_modules/featurefusion/WDAF.py
- ultralytics/nn/extra_modules/norm
1. ICML2024|engine/extre_module/custom_nn/transformer/repbn.py
2. CVPR2025|engine/extre_module/custom_nn/transformer/dyt.py
3. engine/extre_module/custom_nn/norm/derf.py
- ultralytics/nn/extra_modules/featurepreprocess
1. TGRS2025|ultralytics/nn/extra_modules/featurepreprocess/FAENet.py
- ultralytics/nn/extra_modules/head(ultralytics/cfg/models/improve/head)
1. ultralytics/nn/extra_modules/head/LSPCD.py
## Loss 列表
#### 默认配置(兼容)
- cls_loss=bce
- iou_loss=ciou
- iou_aux=none
- cls_loss(分类损失)
1. bce
2. slide
3. ema_slide
4. focal
5. varifocal
6. qualityfocal
- iou_loss(IoU主损失)
1. 基础形式:
iou、giou、diou、ciou、eiou、siou、shapeiou、piou、piou2
2. Inner形式:
inner_(例如:inner_diou、inner_ciou、inner_siou)
3. Focaler形式:
focaler_(例如:focaler_diou、focaler_ciou、focaler_siou)
4. MPDIoU家族:
mpdiou、inner_mpdiou、focaler_mpdiou
5. WiseIoU家族:
wiseiou(等价wiseiou_wiou)
wiseiou_
wiseiou_inner_
wiseiou_focaler_
6. wise 可选值:
iou、wiou、giou、diou、ciou、eiou、siou、shapeiou、piou、piou2、mpdiou
- iou_aux(IoU辅助损失)
1. none
2. gcd
3. nwd
## 更新公告
- 20260217
1. 初版项目发布.
2. 新增使用教程、模块改进使用教程视频.
- 20260228
1. 新增常见的cls和iou的损失,并直接支持在train.py里面指定,并且在训练的时候会打印目前的loss.
2. 对模型改进的yaml扩展到yolov8、yolov10、yolo11、yolo12.
3. 新增在训练过程中mAP75输出.
4. 优化detect.py中的特征图保存机制,使其可以单独保存每一个通道的特征图和总通道求和的特征图.
5. 新增毕业必备-基于web的可视化界面,支持选择模型、检测图片、检测视频,显示目标数量等等功能
6. 新增web界面的教程视频.
7. 新增注册module的教程视频.
- 20260308
1. 在val.py脚本中增加auto_coco_eval指标,支持一步到位计算COCO指标,不需要再人为转换标签和对齐标签!
2. 新增AAAI2026-SPJFB模块.
3. 新增TGRS2025-GLSS2D模块.
4. 新增TIP2025-CAFM模块.
5. 新增TIP2025-DWM_MSA模块.
6. 新增DynamicERF模块.
7. 新增CSP、MetaFormer、Module在yaml中的使用教程-20260307补充版的视频.
8. 修复用户反馈的bug.
- 20260315
1. 新增CVPR2026-DEGConv模块。
2. 新增CVPR2026-BinaryAttention模块。
3. 新增CVPR2026-TransMixer模块。
4. 新增CVPR2025-wca模块。
5. 新增自研模块-DAF模块。
6. 新增自研模块-CIDAF模块。
7. 新增自研模块-WDAF模块。
8. 新增Neck部分内容(ASF、BIFPN、CTrans、ERepBIFPN、GFPN、HSFPN、HS-FPN、超图FPN、SlimNeck、GoldYOLO、EMBSFPN)。
9. 补全attention部分的配置文件。
10. 新增conv、attention的内容如何与CSP模块随意组合的使用教程。
11. 修复用户反馈的bug。
================================================
FILE: bilibili-guide.md
================================================
# 魔鬼面具-哔哩哔哩视频指南
### 必看干货系列(建议搞深度学习的小伙伴都看看,特别是图像相关)
1. [深度学习常见实验问题与实验技巧(适用于所有模型,小白初学者必看!)](https://www.bilibili.com/video/BV17j41147j8/)
2. [还在迷茫深度学习中的改进实验应该从哪里开始改起的同学,一定要进来看看了!用自身经验给你推荐实验顺序!](https://www.bilibili.com/video/BV1Nu4y1G7B9/)
3. [探究深度学习中预训练权重对改进和精度的影响!](https://www.bilibili.com/video/BV1FH4y1o7GL/)
4. [什么?你说你不会画模型结构图?行吧,那你进来看看吧,手把手教你画YAML结构图!](https://www.bilibili.com/video/BV1X94y1K76Z/)
5. [探究深度学习中训练中的可重现性](https://www.bilibili.com/video/BV1Nu4y1s7sc/)
6. [什么?你说你更换主干后看不懂配置文件也不懂画结构图?那你快点进来看看了!](https://www.bilibili.com/video/BV1WA4m1V7nQ/)
7. [从三个角度分析,什么条件才算是一个合格的改进专栏!](https://www.bilibili.com/video/BV1E6421g7eb/)
8. [都2024了,你写论文不会还只用p,r,map这些指标分析目标检测模型吧?](https://www.bilibili.com/video/BV1wF4m177JQ/)
9. [从简到难手把手教你画Pytorch模块内的结构图!](https://www.bilibili.com/video/BV1dC411p7H7/)
10. [深度学习论文实验中的其中一大注意点-预训练权重究竟加还是不加?](https://www.bilibili.com/video/BV1Q1421Q7Zw/)
11. [深度学习改进实验必看!基于YOLOV8的WIDER-FACE改进(轻量化+提点)实验思路讲解](https://www.bilibili.com/video/BV1QJ4m1H7DJ/)
12. [YOLOV8-硬塞注意力机制?这样做没创新!想知道注意力怎么用才有创新那赶快来看看!](https://www.bilibili.com/video/BV1bm421K7tf/)
13. [YOLOV8改进-还硬塞注意力机制?这期用注意力机制手把手给大家自研一个ContextGuideFPN!创新真的不难,需要找对方法!](https://www.bilibili.com/video/BV1Vx4y1n7hZ/)
14. [长达46分钟的肺腑之言!给以后想从事图像算法工程师、小白入门深度学习路线的总结!](https://www.bilibili.com/video/BV16y411h7T9/)
15. [提升多少才能发paper?轻量化需要看什么指标?需要轻量化到什么程度才能发paper?这期给大家一一解答!](https://www.bilibili.com/video/BV1QZ421M7gu/)
16. [深度学习实验部分常见疑问解答!(小白刚入门必看!少走弯路!少自我内耗!)](https://www.bilibili.com/video/BV1Bz421B7pC/)
```
1. 如何衡量自己所做的工作量够不够?
2. 为什么别人的论文说这个模块对xxx有作用,但是我自己用的时候还掉点了?
3. 提升是和什么模型相比呢?比如和yolov8这种基础模型比,还是和别人提出的目前最好的模型比?
4. 对比不同的模型的时候,输入尺寸,学习率,学习次数这些是否需要一致?
```
17. [深度学习实验部分常见疑问解答二!(小白刚入门必看!少走弯路!少自我内耗!)](https://www.bilibili.com/video/BV1ZM4m1m785/)
```
1. 为什么我用yolov8自带的coco8、coco128训练出来的效果很差?
2. 我的数据集很大,机器跑得慢,我是否可以用数据集的百分之10的数据去测试这个改进点是否有效?有效再跑整个数据集?
```
18. [深度学习实验部分常见疑问解答三!(怎么判断模型是否收敛?模型过拟合怎么办?)](https://www.bilibili.com/video/BV11S421d76P/)
19. [YOLO系列模型训练结果详细解答!(训练过程的一些疑问,该放哪个文件运行出来的结果、参数量计算量在哪里看..等等问题)](https://www.bilibili.com/video/BV11b421J7Vx/)
20. [细谈目标检测中的小目标检测头和大目标检测头,并教懂你怎么加微小目标、极大目标检测头!](https://www.bilibili.com/video/BV1jkDWYFEwx/)
21. [深度学习炼丹必备必看必须知道的小技巧!](https://www.bilibili.com/video/BV1q3SZYsExc/)
22. [深度学习实验准备-数据集怎么选?有哪些需要注意的点?](https://www.bilibili.com/video/BV11zySYvEhs/)
23. [深度学习论文实验中新手非常容易陷入的一个误区:抱着解决xxx问题的心态去做实验](https://www.bilibili.com/video/BV1kkkvYJEHG/)
24. [小目标检测必看系列 | 除了AP-Small指标,可还有AP-VeryTiny、AP-Tiny的指标喔~手把手带你加!](https://www.bilibili.com/video/BV1CYcUeBEzY/)
25. [YOLO中的实例分割原来是这样巧妙地实现的!你在做YOLO-Seg但是又不知道的话,那你要进来看看咯~](https://www.bilibili.com/video/BV1SkP1e1EHC/)
26. [长达30分钟的吐血讲解!为什么别人的纯YOLO小目标检测能上AAAI2025,你的连个最差的都费劲!看看差距在哪里,怎么改善!](https://www.bilibili.com/video/BV14DJazTEtV)
27. [深度学习论文中的基础实验、改进实验、 消融实验、对比实验、泛化实验|这些究竟是什么?](https://www.bilibili.com/video/BV1NYKUz2E6b/)
28. [深度学习论文中的推理结果图、热力图、特征图究竟应该怎么放?需要注意什么?有什么作用?](https://www.bilibili.com/video/BV1s5gQzcEPh/)
29. [YOLO|RTDETR|我会跑Ultralytics了!但是输出的这些都怎么看呀?论文中的结果写什么呀?需要注意什么呀?](https://www.bilibili.com/video/BV1VfbVzHEGM/)
### 服务器租用系列
1. [|DAModel|竟然有一个"不需要装环境就能跑YOLO代码"的服务器平台?让我们一起来看看!](https://www.bilibili.com/video/BV1mg2SYGEGF)
2. [|DAModel|给大家准备好COCO、VOC、VisDrone、CrowdHuman、BDD100K数据集啦~YOLO格式和data.yaml都已配置好~](https://www.bilibili.com/video/BV1UV5qzuEGf)
3. [智算云扉服务器平台|0.99每小时的3090?RTX4090-48GB的显卡?已经配置好的YOLO|RTDETR环境?充值还有额外算力点?标题有限制优势说不完。](https://www.bilibili.com/video/BV11DXTYiENS)
### 必看论文分享系列
1. [有营养的必看论文分享系列一-RTMDet<考虑到精度、速度、部署的2D目标检测网络>](https://www.bilibili.com/video/BV1ab421J77G/)
2. [有营养的必看论文分享系列二-MobileNets<轻量化的开山之作>](https://www.bilibili.com/video/BV1hM4m117JW/)
3. [计算机视觉|YOLO|DETR|2025创新必看的论文之一|MetaFormer(TPAMI2024),选对Baseline是成功的第一步](https://www.bilibili.com/video/BV1W5ATetEg6/)
### 高区论文带读系列
1. [高区论文带读系列一-40分钟长视频带你分析一篇SCI1区的文章,SCI1区也不是触不可及!](https://www.bilibili.com/video/BV1JESuYxEjn/)
2. [高区论文带读系列二-学会捕捉数据集场景下的要害问题是写好文章的第一步!](https://www.bilibili.com/video/BV1XNqjYNEyg/)
### YOLO系列配置文件系列
1. [不会把多个改进整合到一个yaml配置文件里面?那来看看这个吧!从简到难手把手带你整合三个yaml](https://www.bilibili.com/video/BV15H4y1Y7a2/)
2. [细谈目标检测中的小目标检测头和大目标检测头,并教懂你怎么加微小目标、极大目标检测头!](https://www.bilibili.com/video/BV1jkDWYFEwx/)
3. [不会看YOLO的模型yaml配置文件?那你还怎么整合多个配置文件!](https://www.bilibili.com/video/BV1oiBRYnEEw/)
4. [不会把多个创新点整合到一个yaml配置文件里面?那来看看这个吧!手把手带你整合创新点!](https://www.bilibili.com/video/BV1DUBRYGE3b/)
### YOLOV5,V7-PYQT5项目讲解
1. [哔哩哔哩合集地址](https://space.bilibili.com/286900343/channel/collectiondetail?sid=917275)
2. [项目github地址](https://github.com/z1069614715/yolov7-pyqt)
### YOLOV5、V7、V8、V9、V10、V11、V12 热力图源码
1. [哔哩哔哩合集地址](https://space.bilibili.com/286900343/channel/collectiondetail?sid=1080305)
2. [项目github地址](https://github.com/z1069614715/objectdetection_script/blob/master/yolo-gradcam)
### YOLO系列模型使用教程系列
1. [YOLOV7保姆级教程](https://www.bilibili.com/video/BV1gD4y1s7zw/?spm_id_from=333.999.0.0)
2. [YOLOV5-Seg实例分割教程](https://www.bilibili.com/video/BV1nV4y1P7HQ/?spm_id_from=333.999.0.0)
3. [YOLOV5-快速上手教程](https://www.bilibili.com/video/BV1tM411a7it/?spm_id_from=333.999.0.0)
4. [YOLOV8-OBB详细教学视频(包含如何把DOTA数据集分割成小图进行训练)](https://www.bilibili.com/video/BV1xK4y117fg/)
5. [EfficientTeacher半监督-详细教学和调参注意事项](https://www.bilibili.com/video/BV1494y1v7hF/)
6. [YOLOV9保姆级别教程来啦~包含环境配置、数据集转换、训练、测试、推理环节~一看就懂!](https://www.bilibili.com/video/BV1d1421z7XW/)
7. [保姆级别YOLOV11-环境配置、 数据集介绍、训练、验证、推理 详细教学视频,看了它,跑YOLOV11 没问题~](https://www.bilibili.com/video/BV1VA11YBELB/)
### YOLOV8V11源码常见疑问解答小课堂
1. [关于配置文件中Optimizer参数为auto的时候,究竟Optimizer会怎么选用呢?](https://www.bilibili.com/video/BV1K34y1w7cZ/)
2. [best.pt究竟是根据什么指标来保存的?](https://www.bilibili.com/video/BV1jN411M7MA/)
3. [数据增强在yolov8中的应用](https://www.bilibili.com/video/BV1aQ4y1g7ah/)
4. [如何添加FPS计算代码和FPS的相关的一些疑问](https://www.bilibili.com/video/BV1Sw411g7DD/)
5. [预测框粗细颜色修改与精度小数位修改](https://www.bilibili.com/video/BV12K421a7rH/)
6. [导出改进/剪枝的onnx模型和讲解onnx-opset和onnxsim的作用](https://www.bilibili.com/video/BV1CK421e7Y3/)
7. [YOLOV8模型详细讲解(包含该如何改进YOLOV8)(刚入门小白,需要改进YOLOV8的同学必看!)](https://www.bilibili.com/video/BV1Ms421u7VH/)
8. [学习率变化问题](https://www.bilibili.com/video/BV1frnferEL1/)
### 目标检测干货系列
1. [深入了解目标检测中的检测头](https://www.bilibili.com/video/BV1AQ4y1j7Cr/)
2. [目标检测中的标签分配策略做了什么?分配过程中的正负样本又是什么?](https://www.bilibili.com/video/BV1Ek4aeUE2J/)
### 环境配置系列教程
1. [保姆式AUTODL-YOLO环境教程(上):从0教你如何配置VSCODE、安装新环境和CUDA和CUDNN、跑通YOLOV8、编译DCNV3](https://www.bilibili.com/video/BV1tT4y1b75q/)
2. [保姆式AUTODL-YOLO环境教程(下):从0教你如何配置VSCODE、安装新环境和CUDA和CUDNN、跑通YOLOV8、编译DCNV3](https://www.bilibili.com/video/BV1nV411Q7mA/)
### 目标检测Tricks
1. [可视化并统计目标检测中的TP,FP,FN](https://www.bilibili.com/video/BV1yM4y1d7Gp/)
2. [深度学习小实验-卷积家族(fps,flops,param)对比实验](https://www.bilibili.com/video/BV1UL411R7Qr/)
3. [yolov5中的FeatureMap可视化(热力图格式)](https://www.bilibili.com/video/BV1LV4y1R7w6/)
4. [用于yolov5和v7中的yolo格式转换coco格式的脚本.](https://www.bilibili.com/video/BV14T411s7Ts/)
5. [Segment Anything演示代码](https://www.bilibili.com/video/BV1hv4y1H7eg/)
6. [固定随机种子在同一个主机上尽可能地复现结果](https://www.bilibili.com/video/BV1bh4y1n7Yc/)
7. [计算yolov5推理时间和FPS的脚本](https://www.bilibili.com/video/BV1Uu4y1C714/)
8. [计算yolov7推理时间和FPS的脚本](https://www.bilibili.com/video/BV17p4y177Pe/)
9. [深度学习小实验-YOLO-Block家族(fps,flops,param)对比实验.](https://www.bilibili.com/video/BV17H4y1V7s9/)
10. [输出YOLOV8、RTDETR各个层的计算量和参数量.](https://www.bilibili.com/video/BV1tb421b7aB/)
11. [YOLOV8-不会把PR曲线的数据保存并绘制到一张图?不用怕,手把手教程来啦~](https://www.bilibili.com/video/BV1uC41177oE/)
12. [yolov5、v7、v8、v9、v10曲线对比图、推理时间vs精度对比图绘制手把手教程!](https://www.bilibili.com/video/BV1yf421X7t5/)
13. [YOLOV8-输出每一层的图特征图尺寸和通道数.](https://www.bilibili.com/video/BV1Mz421B7xz/)
14. [YOLOV8V10V11V12更详细的输出精度结果](https://www.bilibili.com/video/BV1dBQDY6Ec5/)
15. [关于数据集的可视化脚本](https://www.bilibili.com/video/BV1k2TizGEnH/)
### MMDet系列教程
1. [一库打尽目标检测对比实验!mmdetection环境、训练、测试手把手教程!](https://www.bilibili.com/video/BV1xA4m1c7H8/)
2. [一库打尽目标检测对比实验!mmdetection参数量、计算量、FPS、绘制logs手把手教程](https://www.bilibili.com/video/BV17C41137dW/)
3. [一库打尽目标检测对比实验!mmdetection指标转换YOLO指标!](https://www.bilibili.com/video/BV1AWtCesEc6/)
### 离线数据增强教程
1. [目标检测数据集离线数据增强教程,包含对目标框、多种变换、天气变化等等增强!](https://www.bilibili.com/video/BV1bT421k7iq/)
2. [语义分割数据集离线数据增强教程,包含对mask、多种变换、天气变化等等增强!](https://www.bilibili.com/video/BV1xi421a7Gb/)
3. [CVPR2025-SaMam|手把手带你用以Mamba为核心的任意风格迁移网络去做数据集扩充!(一个小创新点有了!)](https://www.bilibili.com/video/BV1gWE4z4Eqq/)
### YOLO系列(YOLOV5,YOLOV7,YOLOV8)模型改进大合集
#### YOLOV5(主干系列修改V7同样也适用)
1. [添加EIOU,SIOU,ALPHA-IOU, FocalEIOU到yolov5的box_iou中](https://www.bilibili.com/video/BV1KM411b7Sz/)
2. [Wise-IoU](https://www.bilibili.com/video/BV1tG4y1N7Gk/)
3. [使用DAMO-YOLO中的GFPN替换YOLOV5中的Head](https://www.bilibili.com/video/BV1iR4y1a7bx/)
4. [使用DAMO-YOLO中的GFPN替换YOLOV5中的Head](https://www.bilibili.com/video/BV1iR4y1a7bx/)
5. [使用yolov8中的C2F模块替换yolov5中的C3模块.](https://www.bilibili.com/video/BV1rx4y1g7xt/)
6. [添加Optimal Transport Assignment到yolov5的Loss中](https://www.bilibili.com/video/BV1xD4y1J76n/)
7. [添加Deformable convolution V2到yolov5中](https://www.bilibili.com/video/BV1rT411Q76q/)
8. [添加辅助训练分支到yolov5中](https://www.bilibili.com/video/BV1Fo4y1v7bi/)
9. [添加context augmentation module到yolov5中](https://www.bilibili.com/video/BV17b411d7ef/)
10. [添加SAC到yolov5中](https://www.bilibili.com/video/BV1xD4y1u7NU/)
11. [添加CoordConv到yolov5中](https://www.bilibili.com/video/BV1ng4y1E7rS/)
12. [添加soft-nms(IoU,GIoU,DIoU,CIoU,EIoU,SIoU)到yolov5中](https://www.bilibili.com/video/BV1cM41147Ry/)
13. [添加DSConv到yolov5中](https://www.bilibili.com/video/BV1iT411a7Mi/)
14. [添加DCNV3到yolov5中.](https://www.bilibili.com/video/BV1LY411z7iE/)
15. [添加Normalized Gaussian Wasserstein Distance到yolov5中.](https://www.bilibili.com/video/BV1zY4y197UP/)
16. [添加Efficient-DecoupledHead到yolov5中](https://www.bilibili.com/video/BV1mk4y1h7us/)
17. [添加FasterNet中的Faster-Block到yolov5中](https://www.bilibili.com/video/BV1Bs4y1H7Ph/)
18. [添加Timm支持的主干到yolov5中.](https://www.bilibili.com/video/BV1Mx4y1A7jy/)
19. [添加Task-Specific Context Decoupling到yolov5中](https://www.bilibili.com/video/BV1mk4y1h7us/)
20. [添加FasterNet主干到yolov5中](https://www.bilibili.com/video/BV1ra4y1K77u/)
21. [添加Omni-Dimensional Dynamic Convolution主干(od_mobilenetv2,od_resnet)到yolov5中](https://www.bilibili.com/video/BV1Jk4y1v7EW/)
22. [融合Omni-Dimensional Dynamic Convolution主干(od_mobilenetv2,od_resnet)中的Conv和BN](https://www.bilibili.com/video/BV1Rs4y1N7fp/)
23. [添加轻量级上采样算子CARAFE到yolov5中](https://www.bilibili.com/video/BV1kj411c72a/)
24. [添加CFPNet中的EVC-Block到yolov5中](https://www.bilibili.com/video/BV1Pg4y1u7cM/)
25. [添加基于注意力机制的目标检测头(DYHEAD)到yolov5中](https://www.bilibili.com/video/BV1qs4y117Mx/)
26. [添加(2023年New)InceptionNeXt主干到yolov5中](https://www.bilibili.com/video/BV12v4y1H7E1/)
27. [添加aLRPLoss到yolov5中](https://www.bilibili.com/video/BV1YV4y1Z7rV/)
28. [结合Res2Net提出具有多尺度提取能力的C3模块](https://www.bilibili.com/video/BV13X4y167VB/)
29. [添加(2022年)FocalNet(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1ch411L7Dk/)
30. [添加(2023年)EMO(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1Dh4y1J7SV/)
31. [添加(2022年)EfficientFormerV2(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1da4y1g7KT/)
32. [添加(2022年CVPR)PoolFormer(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1eh411c7bz/)
33. [添加(2023年)EfficientViT(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1xk4y1L7Gu/)
34. [添加ContextAggregation到yolov5中](https://www.bilibili.com/video/BV1Yk4y1s7Kx/)
35. [添加(2023年)VanillaNet主干到yolov5中](https://www.bilibili.com/video/BV1os4y1v7Du/)
36. [添加(2022年)NextViT主干到yolov5中](https://www.bilibili.com/video/BV1im4y1i7Ht/)
37. [添加(2023年)RIFormer主干到yolov5中](https://www.bilibili.com/video/BV1bW4y1X7Lo/)
38. [Scale-Aware RFE与C3结合而成的C3RFEM添加到yolov5中](https://www.bilibili.com/video/BV1Gj411D7Pf/)
39. [把重参数结构DiverseBranchBlock与C3融合成C3-DBB添加到yolov5中](https://www.bilibili.com/video/BV1sM4y177Cn/)
40. [添加(2023CVPR)EfficientViT(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1xk4y1L7Gu/)
41. [添加(2023旋转目标检测SOTA)LSKNet主干到yolov5中](https://www.bilibili.com/video/BV1xk4y1L7Gu/)
42. [添加(2023最新IoU度量算法)MPDiou到yolov5中.](https://www.bilibili.com/video/BV19P41147gJ/)
43. [添加Yolo-Face-V2中SlideLoss的到yolov5中](https://www.bilibili.com/video/BV1W14y1i79U/)
44. [添加RepViT(transformer)主干到yolov5中](https://www.bilibili.com/video/BV1PH4y1S7mf/)
45. [利用华为2023最新GOLD-YOLO中的Gatherand-Distribute进行改进YOLOV5中的特征融合模块](https://www.bilibili.com/video/BV1PH4y1S7mf/)
46. [利用动态蛇形卷积改进YOLOV5](https://www.bilibili.com/video/BV1Qu411K7Hw/)
47. [利用带有位置信息编码的AIFI自注意力机制改进YOLOV5](https://www.bilibili.com/video/BV1nu4y1h7eS/)
48. [添加UniRepLKNet主干到yolov5中](https://www.bilibili.com/video/BV1PH4y1S7mf/)
49. [添加Attentional Scale Sequence Fusion到yolov5中](https://www.bilibili.com/video/BV1PH4y1S7mf/)
50. [添加cross-scale feature-fusion到yolov5中](https://www.bilibili.com/video/BV1Tb4y1P7yd/)
51. [添加对小目标有效的BiFormer注意力机制到yolov5中](https://www.bilibili.com/video/BV15g4y1g7bM/)
52. [引入最新SOTA(YOLOV9)中的RepNCSPELAN模块](https://www.bilibili.com/video/BV17y421z73k/)
#### YOLOV7
1. [添加EIOU,SIOU,ALPHA-IOU, FocalEIOU到yolov5的box_iou中](https://www.bilibili.com/video/BV1zx4y177EF/)
2. [Wise-IoU](https://www.bilibili.com/video/BV1yv4y147kf/)
3. [添加Deformable convolution V2到yolov7中](https://www.bilibili.com/video/BV17R4y1q7vr/)
4. [添加SAC到yolov7中](https://www.bilibili.com/video/BV1xD4y1u7NU/)
5. [添加CoordConv到yolov7中](https://www.bilibili.com/video/BV1K54y1g7ye/)
6. [添加soft-nms(IoU,GIoU,DIoU,CIoU,EIoU,SIoU)到yolov7中](https://www.bilibili.com/video/BV1ZY41167iC/)
7. [添加DSConv到yolov7中](https://www.bilibili.com/video/BV1724y1b7PD/)
8. [添加DCNV3到yolov7中.](https://www.bilibili.com/video/BV1mk4y1h7us/)
9. [添加Normalized Gaussian Wasserstein Distance到yolov7中](https://www.bilibili.com/video/BV1kM411H7g1/)
10. [添加具有隐式知识学习的Efficient-DecoupledHead到yolov7中](https://www.bilibili.com/video/BV1tg4y1x7ha/)
11. [添加FasterNet中的PConv到yolov7中](https://www.bilibili.com/video/BV1Z84y137oi/)
12. [添加轻量级上采样算子CARAFE到yolov7中.](https://www.bilibili.com/video/BV1yc411p7wL/)
13. [添加基于注意力机制的目标检测头(DYHEAD)到yolov7中](https://www.bilibili.com/video/BV1Ph4y1s7i9/)
14. [添加Omni-Dimensional Dynamic Convolution到yolov7中](https://www.bilibili.com/video/BV1vh411j71Z/)
15. [添加CFPNet中的EVC-Block到yolov7中](https://www.bilibili.com/video/BV12u4y1f7np/)
16. [P2,P6检测层在YOLOV7中的添加](https://www.bilibili.com/video/BV1LX4y1a72m/)
17. [使用VOVGSCSP轻量化yolov7的Neck](https://www.bilibili.com/video/BV14m4y147PC/)
18. [添加SwinTransformer-Tiny主干到yolov5中](https://www.bilibili.com/video/BV1WX4y1a7ea/)
19. [Scale-Aware RFE添加到yolov7中](https://www.bilibili.com/video/BV1hW4y1D7gQ/)
20. [把重参数结构DiverseBranchBlock添加到yolov7中](https://www.bilibili.com/video/BV14u411b7kL/)
21. [添加(2023最新IoU度量算法)MPDiou到yolov7中](https://www.bilibili.com/video/BV1Qh4y1r7D3/)
22. [利用华为2023最新GOLD-YOLO中的Gatherand-Distribute进行改进YOLOV7中的特征融合模块.](https://www.bilibili.com/video/BV14V411c7H1/)
23. [利用动态蛇形卷积改进YOLOV7](https://www.bilibili.com/video/BV1Wj411x7fq/)
24. [利用带有位置信息编码的AIFI自注意力机制改进YOLOV7](https://www.bilibili.com/video/BV1rj411a7s4/)
25. [添加Attentional Scale Sequence Fusion到yolov7中](https://www.bilibili.com/video/BV1PH4y1S7mf/)
26. [引入最新SOTA(YOLOV9)中的RepNCSPELAN模块](https://www.bilibili.com/video/BV1UA4m137hz/)
#### YOLOV8
1. [添加EIOU,SIOU,ALPHA-IOU, FocalEIOU到yolov5,yolov8的box_iou中](https://www.bilibili.com/video/BV1PY4y1o7Hm/)
2. [Wise-IoU](https://www.bilibili.com/video/BV1De4y1N7Mb/)
3. [添加Deformable convolution V2到yolov8中](https://www.bilibili.com/video/BV1Fo4y1i7Mm/)
4. [最新~YOLOV8手把手教学配置文件添加注意力机制!一看就会!](https://www.bilibili.com/video/BV1RH4y1D7CY/)
5. [YOLOV8改进-手把手带你学会注意力机制进阶用法](https://www.bilibili.com/video/BV1ZQ4y1J7oC/)
6. [YOLOV8可视化-可视化并统计每张图的True Positive、False Positive、False Negative](https://www.bilibili.com/video/BV1RA4m1L79K/)
7. [YOLOV8-基于VisDrone的TaskAlignedAssigner任务对齐分配策略的调参实验](https://www.bilibili.com/video/BV1XJ4m1x7eJ/)
8. [YOLOV8-不会把多个改进整合到一个yaml配置文件里面?那来看看这个吧!从简到难手把手带你整合三个yaml](https://www.bilibili.com/video/BV15H4y1Y7a2/)
9. [YOLOV8下游任务系列-一步一步DEBUG保姆式带你完成目标计数](https://www.bilibili.com/video/BV17H4y1J7DD/)
10. [YOLOV8改进-带你分析V8的检测头并重设计10种结构轻量化检测头](https://www.bilibili.com/video/BV1cu411K7FE/)
11. [从CVPR2022-RepLKNet分析有效感受野,并提供YOLOV8可视化感受野的脚本和讲解~](https://www.bilibili.com/video/BV1Gx4y1v7ZZ/)
12. [YOLOV8-不会把PR曲线的数据保存并绘制到一张图?不用怕,手把手教程来啦~](https://www.bilibili.com/video/BV1uC41177oE/)
13. [YOLOV8应用NMS-Free效果怎么样?在Visdrone2019数据集上进行实验,效果不错!后处理时间为0.0ms!](https://www.bilibili.com/video/BV1bt421N7ob/)
14. [YOLOV8-NMSFree|更多公开数据集测试!VisDrone、VOC、PCB](https://www.bilibili.com/video/BV1nZ421x7jr/)
15. [YOLOV8模型详细讲解(包含该如何改进YOLOV8)(刚入门小白,需要改进YOLOV8的同学必看!)](https://www.bilibili.com/video/BV1Ms421u7VH/)
#### YOLOV9
1. [YOLOV9-VisDrone实验对比结果来啦!YOLOV9-C模型VisDrone测试集精度为39.7!有兴趣进来看看具体啦!](https://www.bilibili.com/video/BV1Yy42187A3/)
2. [从源码分析YOLOV9比YOLOV7多了什么内容!](https://www.bilibili.com/video/BV1v1421f7rN/)
3. [YOLOV9n VS YOLOV8n,在VisDrone数据集上精度有2.4个点的提升!](https://www.bilibili.com/video/BV16m411f78L/)
4. [YOLOV9改进-更换轻量化王者MobilenetV4-Backbone](https://www.bilibili.com/video/BV1Ax4y1B7Ln/)
5. [YOLOV9改进-CVPR2024-StarNet、DRepCSPELAN](https://www.bilibili.com/video/BV1BU411o7rz/)
6. [YOLOV9改进-CVPR2023-FasterNet以及其FasterBlock、PConv的改进](https://www.bilibili.com/video/BV18y411a74y/)
7. [YOLOV9改进-DySnakeConv动态蛇形卷积、针对长条形不规则物体!](https://www.bilibili.com/video/BV1gi421S77X/)
#### YOLOV11
1. [Ultralytics8.3.0沉浸式讲解-YOLOV11针对代码的详细剖析](https://www.bilibili.com/video/BV19XxxeXEma/)
2. [保姆级别YOLOV11-环境配置、 数据集介绍、训练、验证、推理 详细教学视频,看了它,跑YOLOV11 没问题~](https://www.bilibili.com/video/BV1VA11YBELB/)
3. [YOLOV11改进详细分析(改进前必看),每个部分(Backbone、Neck、Head....)有哪些地方可以改进?改进的时候要避免小白三件套!](https://www.bilibili.com/video/BV1GKCdYbEuz/)
#### YOLOV13
1. [哎哟你干嘛!YOLO又又又又出新版本了,YOLOV13来了!我们来看看YOLOV13改进了什么,对正在做YOLO改进的同学有什么影响?](https://www.bilibili.com/video/BV1jqKbzGEua/)
#### D-Fine-ICLR2025
1. [暴打CVPR2024-RTDETR的D-Fine究竟性能如何?我们一起来训练看看~](https://www.bilibili.com/video/BV1aE6aYHEer/)
#### DEIM-CVPR2025
1. [CVPR2025-DEIM|新一代目标检测SOTA|2025发高区论文必备的baseline|训练、测试、10几集的基础改进课程、画图教程系列](https://space.bilibili.com/286900343/lists/4909499)
================================================
FILE: cv-attention/A2Attention.py
================================================
import numpy as np
import torch
from torch import nn
from torch.nn import init
from torch.nn import functional as F
class DoubleAttention(nn.Module):
    """Double-attention block: gather global descriptors, then redistribute them.

    Three 1x1 convolutions produce a feature map ``A`` (``c_m`` channels) and
    two attention tensors ``B`` and ``V`` (``c_n`` channels each). ``B`` is
    normalised over spatial positions and used to pool ``A`` into ``c_n``
    global descriptors; ``V`` is normalised over the ``c_n`` descriptors at
    every position and used to distribute them back onto the grid.

    Args:
        in_channels: number of channels of the input feature map.
        c_m: channels of the gathered feature map ``A``.
        c_n: number of global descriptors / attention channels.
        reconstruct: if True, a final 1x1 convolution maps ``c_m`` back to
            ``in_channels`` so the output has the same shape as the input.
    """

    def __init__(self, in_channels, c_m=128, c_n=128, reconstruct=True):
        super().__init__()
        self.in_channels = in_channels
        self.reconstruct = reconstruct
        self.c_m = c_m
        self.c_n = c_n
        self.convA = nn.Conv2d(in_channels, c_m, 1)
        self.convB = nn.Conv2d(in_channels, c_n, 1)
        self.convV = nn.Conv2d(in_channels, c_n, 1)
        if self.reconstruct:
            self.conv_reconstruct = nn.Conv2d(c_m, in_channels, kernel_size=1)
        self.init_weights()

    def init_weights(self):
        """Re-initialise conv / batch-norm / linear sub-modules in place."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, x):
        """Apply double attention.

        Args:
            x: tensor of shape (b, in_channels, h, w).

        Returns:
            Tensor of shape (b, in_channels, h, w) when ``reconstruct`` is
            True, otherwise (b, c_m, h, w).
        """
        b, c, h, w = x.shape
        assert c == self.in_channels, \
            f'expected {self.in_channels} input channels, got {c}'
        A = self.convA(x)  # b, c_m, h, w
        B = self.convB(x)  # b, c_n, h, w
        V = self.convV(x)  # b, c_n, h, w
        tmpA = A.view(b, self.c_m, -1)  # b, c_m, h*w
        # The softmax axis must be explicit: for a 3-D input the implicit
        # default is dim=0, which would normalise across the batch and make
        # every sample's output depend on the other samples in the batch.
        # Attention maps: each of the c_n maps is a distribution over the
        # h*w spatial positions.
        attention_maps = F.softmax(B.view(b, self.c_n, -1), dim=-1)
        # Attention vectors: at each position, a distribution over the c_n
        # global descriptors.
        attention_vectors = F.softmax(V.view(b, self.c_n, -1), dim=1)
        # step 1: feature gating
        global_descriptors = torch.bmm(tmpA, attention_maps.permute(0, 2, 1))  # b, c_m, c_n
        # step 2: feature distribution
        tmpZ = global_descriptors.matmul(attention_vectors)  # b, c_m, h*w
        tmpZ = tmpZ.view(b, self.c_m, h, w)  # b, c_m, h, w
        if self.reconstruct:
            tmpZ = self.conv_reconstruct(tmpZ)
        return tmpZ
if __name__ == '__main__':
    # Smoke test: push a random batch through the block and show the output shape.
    dummy = torch.randn(50, 512, 7, 7)
    block = DoubleAttention(512)
    print(block(dummy).shape)
================================================
FILE: cv-attention/BAM.py
================================================
import numpy as np
import torch
from torch import nn
from torch.nn import init
def autopad(k, p=None, d=1):  # kernel, padding, dilation
    """Return the padding that keeps the output the same size as the input.

    ``k`` is an int or a sequence of ints (kernel size per axis), ``d`` the
    dilation. An explicitly supplied ``p`` is returned unchanged; otherwise
    the padding is half of the (dilated) kernel extent.
    """
    scalar_kernel = isinstance(k, int)
    if d > 1:
        # effective kernel extent once dilation is taken into account
        if scalar_kernel:
            k = d * (k - 1) + 1
        else:
            k = [d * (side - 1) + 1 for side in k]
    if p is not None:
        return p
    if scalar_kernel:
        return k // 2
    return [side // 2 for side in k]
class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, x):
        leading = x.size(0)
        return x.view(leading, -1)
class ChannelAttention(nn.Module):
    """Channel branch: global average pooling followed by a small MLP.

    The MLP goes ``channel -> channel // reduction`` (repeated ``num_layers``
    times, each Linear followed by BatchNorm1d and ReLU) and ends with a
    Linear back to ``channel``. The per-channel result is broadcast to the
    shape of the input.
    """

    def __init__(self, channel, reduction=16, num_layers=3):
        super().__init__()
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        squeezed = channel // reduction
        widths = [channel] + [squeezed] * num_layers + [channel]

        mlp = nn.Sequential()
        mlp.add_module('flatten', Flatten())
        # hidden layers: consecutive width pairs, excluding the final projection
        for idx, (n_in, n_out) in enumerate(zip(widths[:-2], widths[1:-1])):
            mlp.add_module(f'fc{idx}', nn.Linear(n_in, n_out))
            mlp.add_module(f'bn{idx}', nn.BatchNorm1d(n_out))
            mlp.add_module(f'relu{idx}', nn.ReLU())
        mlp.add_module('last_fc', nn.Linear(widths[-2], widths[-1]))
        self.ca = mlp

    def forward(self, x):
        pooled = self.avgpool(x)
        gate = self.ca(pooled)
        # add two trailing singleton axes, then broadcast over the spatial grid
        return gate[:, :, None, None].expand_as(x)
class SpatialAttention(nn.Module):
    """Spatial branch: 1x1 reduce, dilated 3x3 convs, 1x1 projection to one map.

    The channel count is first reduced to ``channel // reduction``; then
    ``num_layers`` 3x3 convolutions with dilation ``dia_val`` are applied
    (each followed by BatchNorm2d and ReLU); a final 1x1 convolution yields a
    single-channel map that is broadcast to the shape of the input.
    """

    def __init__(self, channel, reduction=16, num_layers=3, dia_val=2):
        super().__init__()
        mid = channel // reduction

        branch = nn.Sequential()
        branch.add_module('conv_reduce1',
                          nn.Conv2d(in_channels=channel, out_channels=mid, kernel_size=1))
        branch.add_module('bn_reduce1', nn.BatchNorm2d(mid))
        branch.add_module('relu_reduce1', nn.ReLU())
        for idx in range(num_layers):
            dilated = nn.Conv2d(in_channels=mid, out_channels=mid, kernel_size=3,
                                padding=autopad(3, None, dia_val), dilation=dia_val)
            branch.add_module(f'conv_{idx}', dilated)
            branch.add_module(f'bn_{idx}', nn.BatchNorm2d(mid))
            branch.add_module(f'relu_{idx}', nn.ReLU())
        branch.add_module('last_conv', nn.Conv2d(mid, 1, kernel_size=1))
        self.sa = branch

    def forward(self, x):
        single_map = self.sa(x)
        return single_map.expand_as(x)
class BAMBlock(nn.Module):
    """Combine a channel branch and a spatial branch into one attention gate.

    The two branch outputs are summed, squashed with a sigmoid and applied to
    the input as ``(1 + weight) * x``.

    NOTE: ``init_weights`` is provided but is not invoked by ``__init__``;
    call it explicitly if that initialisation is wanted.
    """

    def __init__(self, channel=512, reduction=16, dia_val=2):
        super().__init__()
        self.ca = ChannelAttention(channel=channel, reduction=reduction)
        self.sa = SpatialAttention(channel=channel, reduction=reduction, dia_val=dia_val)
        self.sigmoid = nn.Sigmoid()

    def init_weights(self):
        """Re-initialise conv / 2-D batch-norm / linear sub-modules in place."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                init.kaiming_normal_(module.weight, mode='fan_out')
            elif isinstance(module, nn.BatchNorm2d):
                init.constant_(module.weight, 1)
                init.constant_(module.bias, 0)
                continue
            elif isinstance(module, nn.Linear):
                init.normal_(module.weight, std=0.001)
            else:
                continue
            # conv and linear layers share the same optional-bias handling
            if module.bias is not None:
                init.constant_(module.bias, 0)

    def forward(self, x):
        # unpacking four sizes also rejects inputs that are not 4-D
        _, _, _, _ = x.size()
        spatial = self.sa(x)
        channelwise = self.ca(x)
        gate = self.sigmoid(spatial + channelwise)
        return (1 + gate) * x
if __name__ == '__main__':
    # Smoke test: run a random batch through the block and show the output shape.
    dummy = torch.randn(50, 512, 7, 7)
    block = BAMBlock(channel=512, reduction=16, dia_val=2)
    print(block(dummy).shape)
================================================
FILE: cv-attention/Biformer.py
================================================
"""
Core of BiFormer, Bi-Level Routing Attention.
To be refactored.
author: ZHU Lei
github: https://github.com/rayleizhu
email: ray.leizhu@outlook.com
This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
from typing import Tuple, Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch import Tensor, LongTensor
class TopkRouting(nn.Module):
    """Top-k routing between region descriptors, with scaling.

    For every query region the ``topk`` key regions with the largest scaled
    dot product are selected, and a softmax over those ``topk`` logits gives
    the routing weights.

    Args:
        qk_dim: int, feature dimension of query and key.
        topk: int, number of key regions kept per query region.
        qk_scale: float or None, multiplier applied to the query before the
            dot product; when falsy, ``qk_dim ** -0.5`` is used.
        param_routing: bool, whether to pass query and key through a shared
            learnable ``nn.Linear`` before scoring (identity otherwise).
        diff_routing: bool, whether routing is differentiable; when False,
            query and key are detached before scoring.
    """

    def __init__(self, qk_dim, topk=4, qk_scale=None, param_routing=False, diff_routing=False):
        super().__init__()
        self.topk = topk
        self.qk_dim = qk_dim
        self.scale = qk_scale or qk_dim ** -0.5
        self.diff_routing = diff_routing
        # TODO: norm layer before/after linear?
        self.emb = nn.Linear(qk_dim, qk_dim) if param_routing else nn.Identity()
        # routing activation
        self.routing_act = nn.Softmax(dim=-1)

    def forward(self, query: Tensor, key: Tensor) -> Tuple[Tensor, Tensor]:
        """Score all region pairs and keep the best ``topk`` keys per query.

        Args:
            query, key: (n, p^2, c) tensors.

        Returns:
            r_weight: (n, p^2, topk) softmax-normalised routing weights.
            topk_index: (n, p^2, topk) indices of the selected key regions.
        """
        if not self.diff_routing:
            # cut the graph so no gradient flows through the routing step
            query, key = query.detach(), key.detach()
        query_hat, key_hat = self.emb(query), self.emb(key)  # (n, p^2, c)
        attn_logit = (query_hat * self.scale) @ key_hat.transpose(-2, -1)  # (n, p^2, p^2)
        topk_attn_logit, topk_index = torch.topk(attn_logit, k=self.topk, dim=-1)  # (n, p^2, k), (n, p^2, k)
        r_weight = self.routing_act(topk_attn_logit)  # (n, p^2, k)
        return r_weight, topk_index
class KVGather(nn.Module):
    """Collect, for every query region, the key/value tokens of its routed regions.

    ``mul_weight`` selects what happens to the gathered tokens:
    ``'none'`` returns them as is, ``'soft'`` multiplies them by the routing
    weights, ``'hard'`` is not implemented and raises in ``forward``.
    """

    def __init__(self, mul_weight='none'):
        super().__init__()
        assert mul_weight in ('none', 'soft', 'hard')
        self.mul_weight = mul_weight

    def forward(self, r_idx:Tensor, r_weight:Tensor, kv:Tensor):
        """
        r_idx: (n, p^2, topk) tensor
        r_weight: (n, p^2, topk) tensor
        kv: (n, p^2, w^2, c_kq+c_v)
        Return:
            (n, p^2, topk, w^2, c_kq+c_v) tensor
        """
        n, p2, w2, c_kv = kv.size()
        topk = r_idx.size(-1)

        # Every query region sees the full set of kv regions; expand() adds the
        # query axis without copying memory.
        # FIXME: gather consumes much memory (topk times redundancy), write cuda kernel?
        candidates = kv.view(n, 1, p2, w2, c_kv).expand(-1, p2, -1, -1, -1)  # (n, p^2, p^2, w^2, c_kv)
        picker = r_idx.view(n, p2, topk, 1, 1).expand(-1, -1, -1, w2, c_kv)  # (n, p^2, k, w^2, c_kv)
        gathered = torch.gather(candidates, dim=2, index=picker)

        mode = self.mul_weight
        if mode == 'hard':
            raise NotImplementedError('differentiable hard routing TBA')
        if mode == 'soft':
            return r_weight.view(n, p2, topk, 1, 1) * gathered  # (n, p^2, k, w^2, c_kv)
        # 'none': hand back the gathered tokens untouched
        return gathered
class QKVLinear(nn.Module):
    """One linear layer producing the query and the concatenated key/value.

    The projection has ``qk_dim + qk_dim + dim`` output features; the first
    ``qk_dim`` are returned as the query and the remaining
    ``qk_dim + dim`` as a single key/value tensor.
    """

    def __init__(self, dim, qk_dim, bias=True):
        super().__init__()
        self.dim = dim
        self.qk_dim = qk_dim
        self.qkv = nn.Linear(dim, qk_dim + qk_dim + dim, bias=bias)

    def forward(self, x):
        projected = self.qkv(x)
        section_sizes = [self.qk_dim, self.qk_dim + self.dim]
        query, key_value = torch.split(projected, section_sizes, dim=-1)
        return query, key_value
class BiLevelRoutingAttention(nn.Module):
    """Bi-level routing attention over a grid of windows.

    The feature map is split into ``n_win * n_win`` windows. Window-level
    query/key descriptors (means over each window) are routed with
    ``TopkRouting`` to pick ``topk`` key windows per query window; pixel-level
    attention is then computed between each window's queries and the gathered
    key/value tokens of its routed windows. A depth-wise convolution on the
    values (``lepe``) is added to the attention output.

    Args:
        dim: number of input/output channels (also the value dimension).
        n_win: number of windows in one side (so the actual number of windows is n_win*n_win).
        num_heads: number of attention heads; must divide ``dim`` and ``qk_dim``.
        qk_dim: query/key dimension; defaults to ``dim`` when falsy.
        qk_scale: scale applied to queries; defaults to ``qk_dim ** -0.5`` when falsy.
        kv_per_win: for kv_downsample_mode='ada_xxxpool' only, number of key/values per window.
            Similar to n_win, the actual number is kv_per_win*kv_per_win.
        kv_downsample_ratio: pooling kernel for the 'maxpool' / 'avgpool' modes.
        kv_downsample_kernel: stored only; no implemented mode reads it.
        kv_downsample_mode: one of 'identity', 'ada_avgpool', 'ada_maxpool',
            'maxpool', 'avgpool' ('fracpool' and 'conv' raise NotImplementedError).
        topk: topk for window filtering.
        param_attention: 'qkvo' - linear for q,k,v and o; 'qkv' - no output projection.
        param_routing: extra linear for routing.
        diff_routing: whether to set routing differentiable.
        soft_routing: whether to multiply soft routing weights.
        side_dwconv: kernel size of the depth-wise conv on values; when <= 0 the
            term is replaced by zeros.
        auto_pad: pad H and W up to a multiple of ``n_win`` and crop afterwards.

    Raises:
        ValueError: for an unknown ``param_attention`` or ``kv_downsample_mode``.
        NotImplementedError: for the 'fracpool' and 'conv' downsample modes.
    """

    def __init__(self, dim, n_win=7, num_heads=8, qk_dim=None, qk_scale=None,
                 kv_per_win=4, kv_downsample_ratio=4, kv_downsample_kernel=None, kv_downsample_mode='identity',
                 topk=4, param_attention="qkvo", param_routing=False, diff_routing=False, soft_routing=False, side_dwconv=3,
                 auto_pad=True):
        super().__init__()
        # local attention setting
        self.dim = dim
        self.n_win = n_win  # Wh, Ww
        self.num_heads = num_heads
        self.qk_dim = qk_dim or dim
        assert self.qk_dim % num_heads == 0 and self.dim % num_heads == 0, 'qk_dim and dim must be divisible by num_heads!'
        self.scale = qk_scale or self.qk_dim ** -0.5

        ################side_dwconv (i.e. LCE in ShuntedTransformer)###########
        self.lepe = nn.Conv2d(dim, dim, kernel_size=side_dwconv, stride=1, padding=side_dwconv//2, groups=dim) if side_dwconv > 0 else \
            lambda x: torch.zeros_like(x)

        ################ global routing setting #################
        self.topk = topk
        self.param_routing = param_routing
        self.diff_routing = diff_routing
        self.soft_routing = soft_routing
        # router
        assert not (self.param_routing and not self.diff_routing)  # cannot be with_param=True and diff_routing=False
        self.router = TopkRouting(qk_dim=self.qk_dim,
                                  qk_scale=self.scale,
                                  topk=self.topk,
                                  diff_routing=self.diff_routing,
                                  param_routing=self.param_routing)
        if self.soft_routing:  # soft routing, always differentiable (if no detach)
            mul_weight = 'soft'
        elif self.diff_routing:  # hard differentiable routing
            mul_weight = 'hard'
        else:  # hard non-differentiable routing
            mul_weight = 'none'
        self.kv_gather = KVGather(mul_weight=mul_weight)

        # qkv mapping (shared by both global routing and local attention)
        self.param_attention = param_attention
        if self.param_attention == 'qkvo':
            self.qkv = QKVLinear(self.dim, self.qk_dim)
            self.wo = nn.Linear(dim, dim)
        elif self.param_attention == 'qkv':
            self.qkv = QKVLinear(self.dim, self.qk_dim)
            self.wo = nn.Identity()
        else:
            raise ValueError(f'param_attention mode {self.param_attention} is not surpported!')

        self.kv_downsample_mode = kv_downsample_mode
        self.kv_per_win = kv_per_win
        self.kv_downsample_ratio = kv_downsample_ratio
        # The misspelled attribute is kept so existing code reading it keeps
        # working; the correctly spelled alias holds the same value.
        self.kv_downsample_kenel = kv_downsample_kernel
        self.kv_downsample_kernel = kv_downsample_kernel
        if self.kv_downsample_mode == 'ada_avgpool':
            assert self.kv_per_win is not None
            self.kv_down = nn.AdaptiveAvgPool2d(self.kv_per_win)
        elif self.kv_downsample_mode == 'ada_maxpool':
            assert self.kv_per_win is not None
            self.kv_down = nn.AdaptiveMaxPool2d(self.kv_per_win)
        elif self.kv_downsample_mode == 'maxpool':
            assert self.kv_downsample_ratio is not None
            self.kv_down = nn.MaxPool2d(self.kv_downsample_ratio) if self.kv_downsample_ratio > 1 else nn.Identity()
        elif self.kv_downsample_mode == 'avgpool':
            assert self.kv_downsample_ratio is not None
            self.kv_down = nn.AvgPool2d(self.kv_downsample_ratio) if self.kv_downsample_ratio > 1 else nn.Identity()
        elif self.kv_downsample_mode == 'identity':  # no kv downsampling
            self.kv_down = nn.Identity()
        elif self.kv_downsample_mode == 'fracpool':
            # TODO: fracpool
            # 1. kernel size should be input size dependent
            # 2. there is a random factor, need to avoid independent sampling for k and v
            raise NotImplementedError('fracpool policy is not implemented yet!')
        elif kv_downsample_mode == 'conv':
            # TODO: need to consider the case where k != v so that need two downsample modules
            raise NotImplementedError('conv policy is not implemented yet!')
        else:
            # The attribute name here used to be misspelled, which turned this
            # intended ValueError into an AttributeError.
            raise ValueError(f'kv_down_sample_mode {self.kv_downsample_mode} is not surpported!')

        # softmax for local attention
        self.attn_act = nn.Softmax(dim=-1)

        self.auto_pad = auto_pad

    def forward(self, x, ret_attn_mask=False):
        """Run bi-level routing attention.

        Args:
            x: NCHW tensor (it is rearranged to NHWC internally).
            ret_attn_mask: when True, also return routing and attention tensors.

        Returns:
            NCHW tensor when ``ret_attn_mask`` is False. When it is True, the
            tuple ``(out, r_weight, r_idx, attn_weight)`` is returned and
            ``out`` is left in NHWC layout (no final rearrange is applied on
            that path).
        """
        x = rearrange(x, "n c h w -> n h w c")
        # NOTE: use padding for semantic segmentation
        ###################################################
        if self.auto_pad:
            _, H_in, W_in, _ = x.size()

            pad_l = pad_t = 0
            pad_r = (self.n_win - W_in % self.n_win) % self.n_win
            pad_b = (self.n_win - H_in % self.n_win) % self.n_win
            x = F.pad(x, (0, 0,  # dim=-1
                          pad_l, pad_r,  # dim=-2
                          pad_t, pad_b))  # dim=-3
            _, H, W, _ = x.size()  # padded size
        else:
            _, H, W, _ = x.size()
            assert H % self.n_win == 0 and W % self.n_win == 0
        ###################################################

        # patchify, (n, p^2, w, w, c), keep 2d window as we need 2d pooling to reduce kv size
        x = rearrange(x, "n (j h) (i w) c -> n (j i) h w c", j=self.n_win, i=self.n_win)

        #################qkv projection###################
        # q: (n, p^2, w, w, c_qk)
        # kv: (n, p^2, w, w, c_qk+c_v)
        # NOTE: separate kv if there were memory leak issue caused by gather
        q, kv = self.qkv(x)

        # pixel-wise qkv
        # q_pix: (n, p^2, w^2, c_qk)
        # kv_pix: (n, p^2, h_kv*w_kv, c_qk+c_v)
        q_pix = rearrange(q, 'n p2 h w c -> n p2 (h w) c')
        kv_pix = self.kv_down(rearrange(kv, 'n p2 h w c -> (n p2) c h w'))
        kv_pix = rearrange(kv_pix, '(n j i) c h w -> n (j i) (h w) c', j=self.n_win, i=self.n_win)

        q_win, k_win = q.mean([2, 3]), kv[..., 0:self.qk_dim].mean([2, 3])  # window-wise qk, (n, p^2, c_qk), (n, p^2, c_qk)

        ##################side_dwconv(lepe)##################
        # NOTE: call contiguous to avoid gradient warning when using ddp
        lepe = self.lepe(rearrange(kv[..., self.qk_dim:], 'n (j i) h w c -> n c (j h) (i w)', j=self.n_win, i=self.n_win).contiguous())
        lepe = rearrange(lepe, 'n c (j h) (i w) -> n (j h) (i w) c', j=self.n_win, i=self.n_win)

        ############ gather q dependent k/v #################
        r_weight, r_idx = self.router(q_win, k_win)  # both are (n, p^2, topk) tensors

        kv_pix_sel = self.kv_gather(r_idx=r_idx, r_weight=r_weight, kv=kv_pix)  # (n, p^2, topk, h_kv*w_kv, c_qk+c_v)
        k_pix_sel, v_pix_sel = kv_pix_sel.split([self.qk_dim, self.dim], dim=-1)
        # k_pix_sel: (n, p^2, topk, h_kv*w_kv, c_qk)
        # v_pix_sel: (n, p^2, topk, h_kv*w_kv, c_v)

        ######### do attention as normal ####################
        k_pix_sel = rearrange(k_pix_sel, 'n p2 k w2 (m c) -> (n p2) m c (k w2)', m=self.num_heads)  # flatten to BMLC, (n*p^2, m, topk*h_kv*w_kv, c_kq//m) transpose here?
        v_pix_sel = rearrange(v_pix_sel, 'n p2 k w2 (m c) -> (n p2) m (k w2) c', m=self.num_heads)  # flatten to BMLC, (n*p^2, m, topk*h_kv*w_kv, c_v//m)
        q_pix = rearrange(q_pix, 'n p2 w2 (m c) -> (n p2) m w2 c', m=self.num_heads)  # to BMLC tensor (n*p^2, m, w^2, c_qk//m)

        # param-free multihead attention
        attn_weight = (q_pix * self.scale) @ k_pix_sel  # (n*p^2, m, w^2, c) @ (n*p^2, m, c, topk*h_kv*w_kv) -> (n*p^2, m, w^2, topk*h_kv*w_kv)
        attn_weight = self.attn_act(attn_weight)
        out = attn_weight @ v_pix_sel  # (n*p^2, m, w^2, topk*h_kv*w_kv) @ (n*p^2, m, topk*h_kv*w_kv, c) -> (n*p^2, m, w^2, c)
        out = rearrange(out, '(n j i) m (h w) c -> n (j h) (i w) (m c)', j=self.n_win, i=self.n_win,
                        h=H//self.n_win, w=W//self.n_win)

        out = out + lepe
        # output linear
        out = self.wo(out)

        # NOTE: use padding for semantic segmentation
        # crop padded region (pad_r / pad_b only exist when auto_pad is set,
        # which the short-circuiting `and` guarantees)
        if self.auto_pad and (pad_r > 0 or pad_b > 0):
            out = out[:, :H_in, :W_in, :].contiguous()

        if ret_attn_mask:
            return out, r_weight, r_idx, attn_weight
        else:
            return rearrange(out, "n h w c -> n c h w")
class Attention(nn.Module):
    """Plain multi-head self-attention over all spatial positions of an NCHW map."""

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        per_head = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or per_head ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """
        args:
            x: NCHW tensor
        return:
            NCHW tensor
        """
        _, _, height, width = x.size()
        tokens = rearrange(x, 'n c h w -> n (h w) c')

        batch, length, channels = tokens.shape
        heads = self.num_heads
        packed = self.qkv(tokens).reshape(batch, length, 3, heads, channels // heads)
        # -> (3, batch, heads, length, head_dim), then peel off q / k / v
        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)

        scores = (q @ k.transpose(-2, -1)) * self.scale
        scores = self.attn_drop(scores.softmax(dim=-1))

        mixed = (scores @ v).transpose(1, 2).reshape(batch, length, channels)
        mixed = self.proj_drop(self.proj(mixed))

        return rearrange(mixed, 'n (h w) c -> n c h w', h=height, w=width)
class AttentionLePE(nn.Module):
    """Multi-head self-attention plus a depth-wise convolution term (``lepe``).

    The depth-wise convolution is applied to the input feature map and its
    result is added to the attention output before the output projection.
    When ``side_dwconv`` is not positive the term is replaced by zeros.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., side_dwconv=5):
        super().__init__()
        self.num_heads = num_heads
        per_head = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or per_head ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        if side_dwconv > 0:
            self.lepe = nn.Conv2d(dim, dim, kernel_size=side_dwconv, stride=1,
                                  padding=side_dwconv//2, groups=dim)
        else:
            self.lepe = lambda x: torch.zeros_like(x)

    def forward(self, x):
        """
        args:
            x: NCHW tensor
        return:
            NCHW tensor
        """
        _, _, height, width = x.size()
        tokens = rearrange(x, 'n c h w -> n (h w) c')

        batch, length, channels = tokens.shape
        heads = self.num_heads
        packed = self.qkv(tokens).reshape(batch, length, 3, heads, channels // heads)
        # -> (3, batch, heads, length, head_dim), then peel off q / k / v
        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)

        # depth-wise conv branch, computed on the map and brought back to token layout
        side = self.lepe(rearrange(tokens, 'n (h w) c -> n c h w', h=height, w=width))
        side = rearrange(side, 'n c h w -> n (h w) c')

        scores = (q @ k.transpose(-2, -1)) * self.scale
        scores = self.attn_drop(scores.softmax(dim=-1))

        mixed = (scores @ v).transpose(1, 2).reshape(batch, length, channels)
        mixed = mixed + side

        mixed = self.proj_drop(self.proj(mixed))

        return rearrange(mixed, 'n (h w) c -> n c h w', h=height, w=width)
def _grid2seq(x:Tensor, region_size:Tuple[int], num_heads:int):
    """Cut a BCHW map into regions and lay each region out as a token sequence.

    Args:
        x: BCHW tensor
        region_size: (rh, rw) height and width of one region
        num_heads: number of attention heads
    Return:
        out: rearranged x, has a shape of (bs, nhead, nregion, reg_size, head_dim)
        region_h, region_w: number of regions per col/row
    """
    batch, channels, height, width = x.size()
    reg_h, reg_w = region_size[0], region_size[1]
    region_h = height // reg_h
    region_w = width // reg_w
    # (b, m, d, h, p, w, q): split channels into heads and each axis into (region, offset)
    split = x.view(batch, num_heads, channels // num_heads, region_h, reg_h, region_w, reg_w)
    # reorder to (b, m, h, w, p, q, d)
    reordered = split.permute(0, 1, 3, 5, 4, 6, 2)
    # merge (h, w) into the region axis and (p, q) into the in-region token axis
    seq = reordered.flatten(2, 3).flatten(-3, -2)  # (bs, nhead, nregion, reg_size, head_dim)
    return seq, region_h, region_w
def _seq2grid(x:Tensor, region_h:int, region_w:int, region_size:Tuple[int]):
    """
    Reassemble per-region, per-head sequences into a feature map.

    Args:
        x: (bs, nhead, nregion, reg_size^2, head_dim)
        region_h, region_w: number of regions per col/row
        region_size: (region height, region width) in pixels
    Return:
        x: (bs, C, H, W)
    """
    batch, n_heads, _n_regions, _reg_tokens, head_dim = x.size()
    reg_h_px, reg_w_px = region_size[0], region_size[1]

    # un-merge the region axis and the in-region axis
    blocks = x.view(batch, n_heads, region_h, region_w, reg_h_px, reg_w_px, head_dim)
    # (b, head, head_dim, region row, row-in-region, region col, col-in-region)
    blocks = torch.einsum('bmhwpqd->bmdhpwq', blocks)
    grid = blocks.reshape(batch, n_heads * head_dim,
                          region_h * reg_h_px, region_w * reg_w_px)
    return grid
def regional_routing_attention_torch(
    query:Tensor, key:Tensor, value:Tensor, scale:float,
    region_graph:LongTensor, region_size:Tuple[int],
    kv_region_size:Optional[Tuple[int]]=None,
    auto_pad=True)->Tensor:
    """
    Token-to-token attention restricted to the routed key/value regions.

    Args:
        query, key, value: (B, C, H, W) tensor
        scale: the scale/temperature for dot product attention
        region_graph: (B, nhead, h_q*w_q, topk) tensor, topk <= h_k*w_k
        region_size: region/window size for queries, (rh, rw)
        kv_region_size: optional, if None, kv_region_size=region_size
        auto_pad: required to be true if the input sizes are not divisible by the region_size
    Return:
        output: (B, C, H, W) tensor
        attn: (bs, nhead, q_nregion, reg_size, topk*kv_region_size) attention matrix
    """
    def _pad_amount(length, block):
        # smallest non-negative amount that makes `length` a multiple of `block`
        return (block - length % block) % block

    kv_region_size = kv_region_size or region_size
    bs, nhead, q_nregion, topk = region_graph.size()

    # Auto pad (zeros, bottom/right) to deal with any input size
    q_pad_b = q_pad_r = 0
    if auto_pad:
        _, _, Hq, Wq = query.size()
        q_pad_b = _pad_amount(Hq, region_size[0])
        q_pad_r = _pad_amount(Wq, region_size[1])
        if q_pad_b > 0 or q_pad_r > 0:
            query = F.pad(query, (0, q_pad_r, 0, q_pad_b))

        # key and value share one padding, derived from the key size
        _, _, Hk, Wk = key.size()
        kv_pad_b = _pad_amount(Hk, kv_region_size[0])
        kv_pad_r = _pad_amount(Wk, kv_region_size[1])
        if kv_pad_r > 0 or kv_pad_b > 0:
            key = F.pad(key, (0, kv_pad_r, 0, kv_pad_b))
            value = F.pad(value, (0, kv_pad_r, 0, kv_pad_b))

    # to sequence format, i.e. (bs, nhead, nregion, reg_size, head_dim)
    query, q_region_h, q_region_w = _grid2seq(query, region_size=region_size, num_heads=nhead)
    key, _, _ = _grid2seq(key, region_size=kv_region_size, num_heads=nhead)
    value, _, _ = _grid2seq(value, region_size=kv_region_size, num_heads=nhead)

    # gather key and values.
    # TODO: is separate gathering slower than fused one (our old version) ?
    # torch.gather does not support broadcasting, hence we do it manually
    bs, nhead, kv_nregion, kv_tokens, head_dim = key.size()
    gather_index = region_graph.view(bs, nhead, q_nregion, topk, 1, 1)
    gather_index = gather_index.expand(-1, -1, -1, -1, kv_tokens, head_dim)

    def _gather_routed(seq):
        # replicate the kv regions for every query region, then pick the routed ones
        per_query = seq.view(bs, nhead, 1, kv_nregion, kv_tokens, head_dim)
        per_query = per_query.expand(-1, -1, query.size(2), -1, -1, -1)
        # -> (bs, nhead, q_nregion, topk, kv_region_size, head_dim)
        return torch.gather(per_query, dim=3, index=gather_index)

    key_g = _gather_routed(key)
    value_g = _gather_routed(value)

    # token-to-token attention
    # (bs, nhead, q_nregion, reg_size, head_dim) @ (bs, nhead, q_nregion, head_dim, topk*kv_region_size)
    # -> (bs, nhead, q_nregion, reg_size, topk*kv_region_size)
    # TODO: mask padding region
    routed_keys = key_g.flatten(-3, -2).transpose(-1, -2)
    attn = (query * scale) @ routed_keys
    attn = torch.softmax(attn, dim=-1)

    # (bs, nhead, q_nregion, reg_size, topk*kv_region_size) @ (bs, nhead, q_nregion, topk*kv_region_size, head_dim)
    # -> (bs, nhead, q_nregion, reg_size, head_dim)
    output = attn @ value_g.flatten(-3, -2)

    # to BCHW format
    output = _seq2grid(output, region_h=q_region_h, region_w=q_region_w, region_size=region_size)

    # remove paddings if needed
    if auto_pad and (q_pad_b > 0 or q_pad_r > 0):
        output = output[:, :, :Hq, :Wq]

    return output, attn
class BiLevelRoutingAttention_nchw(nn.Module):
    """Bi-Level Routing Attention that takes nchw input

    Compared to legacy version, this implementation:
    * removes unused args and components
    * uses nchw input format to avoid frequent permutation

    When the size of inputs is not divisible by the region size, there is also a numerical difference
    than legacy implementation, due to:
    * different way to pad the input feature map (padding after linear projection)
    * different pooling behavior (count_include_pad=False)

    Current implementation is more reasonable, hence we do not keep backward numerical compatibility

    Args:
        dim: number of input/output channels, must be divisible by num_heads
        num_heads: number of attention heads
        n_win: number of windows (regions) per row/col
        qk_scale: attention scale override; when falsy, dim ** -0.5 is used
        topk: number of key/value regions each query region attends to
        side_dwconv: kernel size of the depth-wise conv applied to v;
            a value <= 0 replaces that term with zeros
        auto_pad: accepted for interface compatibility; it is neither stored
            nor forwarded to the attention function by this class
        attn_backend: only 'torch' is supported
    """
    def __init__(self, dim, num_heads=8, n_win=7, qk_scale=None, topk=4, side_dwconv=3, auto_pad=False, attn_backend='torch'):
        super().__init__()
        # local attention setting
        self.dim = dim
        self.num_heads = num_heads
        assert self.dim % num_heads == 0, 'dim must be divisible by num_heads!'
        self.head_dim = self.dim // self.num_heads
        self.scale = qk_scale or self.dim ** -0.5 # NOTE: to be consistent with old models.

        ################side_dwconv (i.e. LCE in Shunted Transformer)###########
        if side_dwconv > 0:
            self.lepe = nn.Conv2d(dim, dim, kernel_size=side_dwconv, stride=1,
                                  padding=side_dwconv//2, groups=dim)
        else:
            self.lepe = lambda x: torch.zeros_like(x)

        ################ regional routing setting #################
        self.topk = topk
        self.n_win = n_win  # number of windows per row/col

        ##########################################
        self.qkv_linear = nn.Conv2d(self.dim, 3*self.dim, kernel_size=1)
        self.output_linear = nn.Conv2d(self.dim, self.dim, kernel_size=1)

        if attn_backend == 'torch':
            self.attn_fn = regional_routing_attention_torch
        else:
            raise ValueError('CUDA implementation is not available yet. Please stay tuned.')

    def forward(self, x:Tensor, ret_attn_mask=False):
        """
        Args:
            x: NCHW tensor, better to be channel_last (https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html)
            ret_attn_mask: when True, also return the attention matrix
        Return:
            NCHW tensor, or (NCHW tensor, attention matrix) if ret_attn_mask
        """
        _, _, H, W = x.size()
        region_size = (H//self.n_win, W//self.n_win)

        # STEP 1: linear projection
        # Call the module (not .forward) so registered hooks are executed.
        qkv = self.qkv_linear(x) # ncHW
        q, k, v = qkv.chunk(3, dim=1) # ncHW

        # STEP 2: region-to-region routing
        # NOTE: ceil_mode=True, count_include_pad=False = auto padding
        # NOTE: gradients backward through token-to-token attention. See Appendix A for the intuition.
        q_r = F.avg_pool2d(q.detach(), kernel_size=region_size, ceil_mode=True, count_include_pad=False)
        k_r = F.avg_pool2d(k.detach(), kernel_size=region_size, ceil_mode=True, count_include_pad=False) # nchw
        q_r = q_r.permute(0, 2, 3, 1).flatten(1, 2) # n(hw)c
        k_r = k_r.flatten(2, 3) # nc(hw)
        a_r = q_r @ k_r # n(hw)(hw), adj matrix of regional graph
        # torch.topk raises when k exceeds the number of regions; route to
        # every region in that case instead of failing.
        n_routed = min(self.topk, a_r.size(-1))
        _, idx_r = torch.topk(a_r, k=n_routed, dim=-1) # n(hw)k long tensor
        idx_r = idx_r.unsqueeze(1).expand(-1, self.num_heads, -1, -1)

        # STEP 3: token to token attention (non-parametric function)
        output, attn_mat = self.attn_fn(query=q, key=k, value=v, scale=self.scale,
                                        region_graph=idx_r, region_size=region_size
                                       )

        output = output + self.lepe(v) # ncHW
        output = self.output_linear(output) # ncHW

        if ret_attn_mask:
            return output, attn_mat

        return output
================================================
FILE: cv-attention/CAA.py
================================================
import torch.nn as nn
def autopad(k, p=None, d=1):  # kernel, padding, dilation
    """Return the padding that yields 'same'-shaped outputs.

    An explicit ``p`` is returned unchanged. Otherwise the padding is half of
    the (dilation-adjusted) kernel size, per dimension when ``k`` is a sequence.
    """
    if d > 1:
        # effective kernel size once dilation is taken into account
        if isinstance(k, int):
            k = d * (k - 1) + 1
        else:
            k = [d * (size - 1) + 1 for size in k]
    if p is None:
        if isinstance(k, int):
            p = k // 2
        else:
            p = [size // 2 for size in k]
    return p
class Conv(nn.Module):
    """Conv2d + BatchNorm2d + activation, args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""
    default_act = nn.SiLU()  # default activation

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        """Build the bias-free convolution, its batch norm and the activation.

        ``act=True`` selects ``default_act``; an ``nn.Module`` is used as given;
        anything else disables the activation (identity).
        """
        super().__init__()
        padding = autopad(k, p, d)
        self.conv = nn.Conv2d(c1, c2, k, s, padding, groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        if act is True:
            self.act = self.default_act
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        """Apply convolution, batch normalization and activation to the input."""
        y = self.conv(x)
        y = self.bn(y)
        return self.act(y)

    def forward_fuse(self, x):
        """Apply convolution and activation only, skipping batch normalization."""
        y = self.conv(x)
        return self.act(y)
class CAA(nn.Module):
    """Attention gate built from average pooling and horizontal/vertical strip convolutions."""

    def __init__(self, ch, h_kernel_size = 11, v_kernel_size = 11) -> None:
        """
        Args:
            ch: number of input/output channels
            h_kernel_size: width of the 1 x k depth-wise convolution
            v_kernel_size: height of the k x 1 depth-wise convolution
        """
        super().__init__()
        self.avg_pool = nn.AvgPool2d(kernel_size=7, stride=1, padding=3)
        self.conv1 = Conv(ch, ch)
        # depth-wise strip convolutions (groups == channels), "same" padded
        self.h_conv = nn.Conv2d(ch, ch, kernel_size=(1, h_kernel_size), stride=1,
                                padding=(0, h_kernel_size // 2), dilation=1, groups=ch)
        self.v_conv = nn.Conv2d(ch, ch, kernel_size=(v_kernel_size, 1), stride=1,
                                padding=(v_kernel_size // 2, 0), dilation=1, groups=ch)
        self.conv2 = Conv(ch, ch)
        self.act = nn.Sigmoid()

    def forward(self, x):
        """Scale ``x`` element-wise by the sigmoid attention factor."""
        y = self.avg_pool(x)
        y = self.conv1(y)
        y = self.h_conv(y)
        y = self.v_conv(y)
        y = self.conv2(y)
        attn_factor = self.act(y)
        return attn_factor * x
================================================
FILE: cv-attention/CBAM.py
================================================
import numpy as np
import torch
from torch import nn
from torch.nn import init
class ChannelAttention(nn.Module):
    """Channel gate: a shared bottleneck MLP over max- and avg-pooled descriptors."""

    def __init__(self, channel, reduction=16):
        """
        Args:
            channel: number of input channels
            reduction: bottleneck ratio of the shared MLP
        """
        super().__init__()
        squeezed = channel // reduction
        self.maxpool = nn.AdaptiveMaxPool2d(1)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        # 1x1 convolutions act as the MLP on the pooled (b, c, 1, 1) tensors
        layers = [
            nn.Conv2d(channel, squeezed, 1, bias=False),
            nn.ReLU(),
            nn.Conv2d(squeezed, channel, 1, bias=False),
        ]
        self.se = nn.Sequential(*layers)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the per-channel gate of shape (b, c, 1, 1)."""
        from_max = self.se(self.maxpool(x))
        from_avg = self.se(self.avgpool(x))
        return self.sigmoid(from_max + from_avg)
class SpatialAttention(nn.Module):
    """Spatial gate computed from the channel-wise max and mean maps."""

    def __init__(self, kernel_size=7):
        """
        Args:
            kernel_size: size of the convolution over the two pooled maps
        """
        super().__init__()
        self.conv = nn.Conv2d(2, 1, kernel_size=kernel_size, padding=kernel_size // 2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the single-channel gate computed from ``x``."""
        channel_max = torch.max(x, dim=1, keepdim=True)[0]
        channel_avg = torch.mean(x, dim=1, keepdim=True)
        stacked = torch.cat([channel_max, channel_avg], 1)
        return self.sigmoid(self.conv(stacked))
class CBAMBlock(nn.Module):
    """Channel attention followed by spatial attention, each applied multiplicatively."""

    def __init__(self, channel=512, reduction=16, kernel_size=7):
        """
        Args:
            channel: number of input channels
            reduction: bottleneck ratio of the channel attention
            kernel_size: convolution size of the spatial attention
        """
        super().__init__()
        self.ca = ChannelAttention(channel=channel, reduction=reduction)
        self.sa = SpatialAttention(kernel_size=kernel_size)

    def init_weights(self):
        """Re-initialise conv, batch-norm and linear sub-modules.

        Not called from ``__init__``; callers invoke it explicitly.
        """
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                init.kaiming_normal_(module.weight, mode='fan_out')
                if module.bias is not None:
                    init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.BatchNorm2d):
                init.constant_(module.weight, 1)
                init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Linear):
                init.normal_(module.weight, std=0.001)
                if module.bias is not None:
                    init.constant_(module.bias, 0)

    def forward(self, x):
        """Return ``x`` re-weighted by the channel gate, then by the spatial gate."""
        # unpacking keeps the requirement that x is a 4-D tensor
        _batch, _channels, _, _ = x.size()
        channel_refined = x * self.ca(x)
        return channel_refined * self.sa(channel_refined)
if __name__ == '__main__':
    # smoke test: the block must preserve the input shape
    feats = torch.randn(50, 512, 7, 7)
    cbam = CBAMBlock(channel=512, reduction=16, kernel_size=feats.shape[2])
    print(cbam(feats).shape)
================================================
FILE: cv-attention/CPCA.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
class CPCA_ChannelAttention(nn.Module):
    """Channel gate: sum of sigmoid gates from avg- and max-pooled descriptors."""

    def __init__(self, input_channels, internal_neurons):
        """
        Args:
            input_channels: number of input/output channels
            internal_neurons: width of the bottleneck between the two 1x1 convs
        """
        super().__init__()
        self.fc1 = nn.Conv2d(in_channels=input_channels, out_channels=internal_neurons,
                             kernel_size=1, stride=1, bias=True)
        self.fc2 = nn.Conv2d(in_channels=internal_neurons, out_channels=input_channels,
                             kernel_size=1, stride=1, bias=True)
        self.input_channels = input_channels

    def forward(self, inputs):
        """Return ``inputs`` scaled by the summed avg/max channel gates."""
        gates = []
        # both pooled descriptors go through the same fc1 -> relu -> fc2 -> sigmoid
        for pool in (F.adaptive_avg_pool2d, F.adaptive_max_pool2d):
            descriptor = pool(inputs, output_size=(1, 1))
            hidden = F.relu(self.fc1(descriptor), inplace=True)
            gates.append(torch.sigmoid(self.fc2(hidden)))
        weights = gates[0] + gates[1]
        weights = weights.view(-1, self.input_channels, 1, 1)
        return inputs * weights
class CPCA(nn.Module):
    """Channel attention followed by a multi-scale strip-convolution spatial attention."""

    def __init__(self, channels, channelAttention_reduce=4):
        """
        Args:
            channels: number of input/output channels
            channelAttention_reduce: bottleneck ratio of the channel attention
        """
        super().__init__()
        self.ca = CPCA_ChannelAttention(input_channels=channels,
                                        internal_neurons=channels // channelAttention_reduce)
        # all spatial convolutions are depth-wise (groups == channels)
        self.dconv5_5 = nn.Conv2d(channels, channels, kernel_size=5, padding=2, groups=channels)
        self.dconv1_7 = nn.Conv2d(channels, channels, kernel_size=(1, 7), padding=(0, 3), groups=channels)
        self.dconv7_1 = nn.Conv2d(channels, channels, kernel_size=(7, 1), padding=(3, 0), groups=channels)
        self.dconv1_11 = nn.Conv2d(channels, channels, kernel_size=(1, 11), padding=(0, 5), groups=channels)
        self.dconv11_1 = nn.Conv2d(channels, channels, kernel_size=(11, 1), padding=(5, 0), groups=channels)
        self.dconv1_21 = nn.Conv2d(channels, channels, kernel_size=(1, 21), padding=(0, 10), groups=channels)
        self.dconv21_1 = nn.Conv2d(channels, channels, kernel_size=(21, 1), padding=(10, 0), groups=channels)
        # the same 1x1 convolution is reused three times in forward()
        self.conv = nn.Conv2d(channels, channels, kernel_size=(1, 1), padding=0)
        self.act = nn.GELU()

    def forward(self, inputs):
        """Return the attended feature map (same shape as ``inputs``)."""
        # Global Perceptron
        feats = self.ca(self.act(self.conv(inputs)))

        base = self.dconv5_5(feats)
        strip_pairs = (
            (self.dconv1_7, self.dconv7_1),
            (self.dconv1_11, self.dconv11_1),
            (self.dconv1_21, self.dconv21_1),
        )
        branches = [vertical(horizontal(base)) for horizontal, vertical in strip_pairs]
        mixed = branches[0] + branches[1] + branches[2] + base

        spatial_att = self.conv(mixed)
        return self.conv(spatial_att * feats)
================================================
FILE: cv-attention/CloAttention.py
================================================
import torch
import torch.nn as nn
from efficientnet_pytorch.model import MemoryEfficientSwish
class AttnMap(nn.Module):
    """Two 1x1 convolutions with a swish activation in between."""

    def __init__(self, dim):
        """
        Args:
            dim: number of input/output channels
        """
        super().__init__()
        layers = [
            nn.Conv2d(dim, dim, 1, 1, 0),
            MemoryEfficientSwish(),
            nn.Conv2d(dim, dim, 1, 1, 0),
        ]
        self.act_block = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the conv-swish-conv stack to ``x``."""
        out = self.act_block(x)
        return out
class EfficientAttention(nn.Module):
    """Attention with local (convolutional) head groups and one global head group.

    Heads are partitioned by ``group_split``: every entry but the last is a
    local group using the matching ``kernel_sizes`` entry, and the last entry is
    the global group attending to average-pooled keys/values. Groups with zero
    heads are skipped.

    Args:
        dim: number of input/output channels
        num_heads: total number of heads, must equal ``sum(group_split)``
        group_split: heads per group; defaults to ``[4, 4]``
        kernel_sizes: depth-wise kernel size per local group; defaults to ``[5]``
        window_size: average-pooling window/stride for the global keys/values
        attn_drop: dropout probability on the attention maps
        proj_drop: dropout probability after the output projection
        qkv_bias: whether the 1x1 projections carry a bias
    """

    def __init__(self, dim, num_heads=8, group_split=None, kernel_sizes=None, window_size=4,
                 attn_drop=0., proj_drop=0., qkv_bias=True):
        super().__init__()
        # Build the defaults per instance so no list object is shared between
        # modules, and copy caller-provided sequences for the same reason.
        group_split = [4, 4] if group_split is None else list(group_split)
        kernel_sizes = [5] if kernel_sizes is None else list(kernel_sizes)
        assert sum(group_split) == num_heads
        assert len(kernel_sizes) + 1 == len(group_split)
        self.dim = dim
        self.num_heads = num_heads
        self.dim_head = dim // num_heads
        self.scalor = self.dim_head ** -0.5
        self.kernel_sizes = kernel_sizes
        self.window_size = window_size
        self.group_split = group_split
        convs = []
        act_blocks = []
        qkvs = []
        # zip stops at the shorter kernel_sizes, i.e. only local groups are visited
        for kernel_size, group_head in zip(kernel_sizes, group_split):
            if group_head == 0:
                continue
            group_dim = self.dim_head * group_head
            convs.append(nn.Conv2d(3 * group_dim, 3 * group_dim, kernel_size,
                                   1, kernel_size // 2, groups=3 * group_dim))
            act_blocks.append(AttnMap(group_dim))
            qkvs.append(nn.Conv2d(dim, 3 * group_dim, 1, 1, 0, bias=qkv_bias))
        if group_split[-1] != 0:
            self.global_q = nn.Conv2d(dim, group_split[-1] * self.dim_head, 1, 1, 0, bias=qkv_bias)
            self.global_kv = nn.Conv2d(dim, group_split[-1] * self.dim_head * 2, 1, 1, 0, bias=qkv_bias)
            self.avgpool = nn.AvgPool2d(window_size, window_size) if window_size != 1 else nn.Identity()

        self.convs = nn.ModuleList(convs)
        self.act_blocks = nn.ModuleList(act_blocks)
        self.qkvs = nn.ModuleList(qkvs)
        self.proj = nn.Conv2d(dim, dim, 1, 1, 0, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj_drop = nn.Dropout(proj_drop)

    def high_fre_attntion(self, x: torch.Tensor, to_qkv: nn.Module, mixer: nn.Module, attn_block: nn.Module):
        '''
        Local attention branch: element-wise gating of v by tanh(attn_block(q * k)).

        x: (b c h w)
        '''
        b, c, h, w = x.size()
        qkv = to_qkv(x)  # (b (3 m d) h w)
        qkv = mixer(qkv).reshape(b, 3, -1, h, w).transpose(0, 1).contiguous()  # (3 b (m d) h w)
        q, k, v = qkv  # (b (m d) h w)
        attn = attn_block(q.mul(k)).mul(self.scalor)
        attn = self.attn_drop(torch.tanh(attn))
        res = attn.mul(v)  # (b (m d) h w)
        return res

    def low_fre_attention(self, x: torch.Tensor, to_q: nn.Module, to_kv: nn.Module, avgpool: nn.Module):
        '''
        Global attention branch: full-resolution queries, pooled keys/values.

        x: (b c h w)
        '''
        b, c, h, w = x.size()

        q = to_q(x).reshape(b, -1, self.dim_head, h * w).transpose(-1, -2).contiguous()  # (b m (h w) d)
        kv = avgpool(x)  # (b c H W)
        # Take the token count from the pooled map itself: pooling floors each
        # spatial dim separately, so (h*w)//window_size**2 is wrong whenever h
        # or w is not a multiple of the window.
        kv_tokens = kv.shape[-2] * kv.shape[-1]
        kv = to_kv(kv).reshape(b, 2, -1, self.dim_head, kv_tokens).permute(1, 0, 2, 4, 3).contiguous()  # (2 b m (H W) d)
        k, v = kv  # (b m (H W) d)
        attn = self.scalor * q @ k.transpose(-1, -2)  # (b m (h w) (H W))
        attn = self.attn_drop(attn.softmax(dim=-1))
        res = attn @ v  # (b m (h w) d)
        res = res.transpose(2, 3).reshape(b, -1, h, w).contiguous()
        return res

    def forward(self, x: torch.Tensor):
        '''
        x: (b c h w)
        '''
        # The module lists hold only the non-empty local groups, in order, so
        # iterate them directly; indexing by group position would go out of
        # step as soon as an earlier group has zero heads.
        res = [
            self.high_fre_attntion(x, to_qkv, mixer, attn_block)
            for to_qkv, mixer, attn_block in zip(self.qkvs, self.convs, self.act_blocks)
        ]
        if self.group_split[-1] != 0:
            res.append(self.low_fre_attention(x, self.global_q, self.global_kv, self.avgpool))
        return self.proj_drop(self.proj(torch.cat(res, dim=1)))
================================================
FILE: cv-attention/CoTAttention.py
================================================
import numpy as np
import torch
from torch import flatten, nn
from torch.nn import init
from torch.nn.modules.activation import ReLU
from torch.nn.modules.batchnorm import BatchNorm2d
from torch.nn import functional as F
class CoTAttention(nn.Module):
    """Sum of a static (grouped-conv) key context and a dynamic, attention-weighted value context."""

    def __init__(self, dim=512, kernel_size=3):
        """
        Args:
            dim: number of input/output channels
            kernel_size: size of the grouped key convolution
        """
        super().__init__()
        self.dim = dim
        self.kernel_size = kernel_size

        self.key_embed = nn.Sequential(
            nn.Conv2d(dim, dim, kernel_size=kernel_size, padding=kernel_size // 2, groups=4, bias=False),
            nn.BatchNorm2d(dim),
            nn.ReLU()
        )
        self.value_embed = nn.Sequential(
            nn.Conv2d(dim, dim, 1, bias=False),
            nn.BatchNorm2d(dim)
        )

        factor = 4
        hidden = 2 * dim // factor
        self.attention_embed = nn.Sequential(
            nn.Conv2d(2 * dim, hidden, 1, bias=False),
            nn.BatchNorm2d(hidden),
            nn.ReLU(),
            nn.Conv2d(hidden, kernel_size * kernel_size * dim, 1)
        )

    def forward(self, x):
        """Return a tensor with the same shape as ``x``."""
        bs, c, h, w = x.shape
        static_ctx = self.key_embed(x)  # bs,c,h,w
        values = self.value_embed(x).view(bs, c, -1)  # bs,c,h*w

        joint = torch.cat([static_ctx, x], dim=1)  # bs,2c,h,w
        logits = self.attention_embed(joint)  # bs,c*k*k,h,w
        logits = logits.reshape(bs, c, self.kernel_size * self.kernel_size, h, w)
        # average over the k*k axis, then normalise over spatial positions
        logits = logits.mean(2, keepdim=False).view(bs, c, -1)  # bs,c,h*w
        dynamic_ctx = F.softmax(logits, dim=-1) * values
        dynamic_ctx = dynamic_ctx.view(bs, c, h, w)

        return static_ctx + dynamic_ctx
if __name__ == '__main__':
    # smoke test: the output shape should equal the input shape
    feats = torch.randn(50, 512, 7, 7)
    cot = CoTAttention(dim=512, kernel_size=3)
    print(cot(feats).shape)
================================================
FILE: cv-attention/CoordAttention.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
class h_sigmoid(nn.Module):
    """Hard sigmoid: ReLU6(x + 3) / 6."""

    def __init__(self, inplace=True):
        super().__init__()
        self.relu = nn.ReLU6(inplace=inplace)

    def forward(self, x):
        shifted = x + 3
        return self.relu(shifted) / 6
class h_swish(nn.Module):
    """Hard swish: x * h_sigmoid(x)."""

    def __init__(self, inplace=True):
        super().__init__()
        self.sigmoid = h_sigmoid(inplace=inplace)

    def forward(self, x):
        gate = self.sigmoid(x)
        return x * gate
class CoordAtt(nn.Module):
    """Coordinate attention: separate gates along height and width."""

    def __init__(self, inp, reduction=32):
        """
        Args:
            inp: number of input/output channels
            reduction: channel reduction ratio; the hidden width is at least 8
        """
        super().__init__()
        self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
        self.pool_w = nn.AdaptiveAvgPool2d((1, None))

        hidden = max(8, inp // reduction)

        self.conv1 = nn.Conv2d(inp, hidden, kernel_size=1, stride=1, padding=0)
        self.bn1 = nn.BatchNorm2d(hidden)
        self.act = h_swish()

        self.conv_h = nn.Conv2d(hidden, inp, kernel_size=1, stride=1, padding=0)
        self.conv_w = nn.Conv2d(hidden, inp, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        """Return ``x`` scaled by the width gate and the height gate."""
        identity = x
        n, c, h, w = x.size()

        along_h = self.pool_h(x)                      # n,c,h,1
        along_w = self.pool_w(x).permute(0, 1, 3, 2)  # n,c,w,1

        # encode both directions jointly, then split them apart again
        joint = torch.cat([along_h, along_w], dim=2)
        joint = self.act(self.bn1(self.conv1(joint)))
        along_h, along_w = torch.split(joint, [h, w], dim=2)
        along_w = along_w.permute(0, 1, 3, 2)

        a_h = self.conv_h(along_h).sigmoid()
        a_w = self.conv_w(along_w).sigmoid()

        return identity * a_w * a_h
if __name__ == '__main__':
    # smoke test: the output shape should equal the input shape
    feats = torch.randn(50, 512, 7, 7)
    coord_att = CoordAtt(inp=512)
    print(coord_att(feats).shape)
================================================
FILE: cv-attention/DAttention.py
================================================
import torch, einops
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from timm.models.layers import trunc_normal_
class LayerNormProxy(nn.Module):
    """LayerNorm over the channel axis of a 'b c h w' tensor."""

    def __init__(self, dim):
        super().__init__()
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        # move channels last so LayerNorm normalises over them, then restore
        channels_last = einops.rearrange(x, 'b c h w -> b h w c')
        normed = self.norm(channels_last)
        return einops.rearrange(normed, 'b h w c -> b c h w')
class DAttention(nn.Module):
    """Deformable attention (Vision Transformer with Deformable Attention, CVPR2022).

    Keys/values are sampled from the input at reference points shifted by
    offsets predicted from the queries. With ``fixed_pe=True`` (and
    ``dwc_pe=False``) the bias table has shape
    (n_heads, q_h * q_w, kv_h * kv_w), so it only fits inputs of size ``q_size``.

    Args:
        channel: number of input channels
        q_size: (height, width) of the query feature map
        n_heads: number of attention heads
        n_groups: number of offset groups
        attn_drop: dropout probability on the attention weights
        proj_drop: dropout probability after the output projection
        stride: stride of the offset network
        offset_range_factor: scale of the predicted offsets; a negative value
            disables scaling and clamps sampling positions to [-1, 1] instead
        use_pe: enable a positional term/bias
        dwc_pe: use a depth-wise conv on the queries as positional term
        no_off: disable offsets; keys/values come from average pooling
        fixed_pe: use a fixed (query, key) bias table
        ksize: kernel size of the offset network's depth-wise conv
        log_cpb: use an MLP over log-scaled displacements (borrowed from Swin-V2)
        kv_size: unused by this implementation
    """
    def __init__(
        self, channel, q_size, n_heads=8, n_groups=4,
        attn_drop=0.0, proj_drop=0.0, stride=1,
        offset_range_factor=4, use_pe=True, dwc_pe=True,
        no_off=False, fixed_pe=False, ksize=3, log_cpb=False, kv_size=None
    ):

        super().__init__()
        n_head_channels = channel // n_heads
        self.dwc_pe = dwc_pe
        self.n_head_channels = n_head_channels
        self.scale = self.n_head_channels ** -0.5
        self.n_heads = n_heads
        self.q_h, self.q_w = q_size
        # self.kv_h, self.kv_w = kv_size
        self.kv_h, self.kv_w = self.q_h // stride, self.q_w // stride
        self.nc = n_head_channels * n_heads
        self.n_groups = n_groups
        self.n_group_channels = self.nc // self.n_groups
        self.n_group_heads = self.n_heads // self.n_groups
        self.use_pe = use_pe
        self.fixed_pe = fixed_pe
        self.no_off = no_off
        self.offset_range_factor = offset_range_factor
        self.ksize = ksize
        self.log_cpb = log_cpb
        self.stride = stride
        kk = self.ksize
        pad_size = kk // 2 if kk != stride else 0

        # offset network: depth-wise conv -> LayerNorm -> GELU -> 2-channel (y, x) offsets
        self.conv_offset = nn.Sequential(
            nn.Conv2d(self.n_group_channels, self.n_group_channels, kk, stride, pad_size, groups=self.n_group_channels),
            LayerNormProxy(self.n_group_channels),
            nn.GELU(),
            nn.Conv2d(self.n_group_channels, 2, 1, 1, 0, bias=False)
        )
        if self.no_off:
            # offsets are zeroed in forward(), so the network is frozen
            for m in self.conv_offset.parameters():
                m.requires_grad_(False)

        self.proj_q = nn.Conv2d(
            self.nc, self.nc,
            kernel_size=1, stride=1, padding=0
        )

        self.proj_k = nn.Conv2d(
            self.nc, self.nc,
            kernel_size=1, stride=1, padding=0
        )

        self.proj_v = nn.Conv2d(
            self.nc, self.nc,
            kernel_size=1, stride=1, padding=0
        )

        self.proj_out = nn.Conv2d(
            self.nc, self.nc,
            kernel_size=1, stride=1, padding=0
        )

        self.proj_drop = nn.Dropout(proj_drop, inplace=True)
        self.attn_drop = nn.Dropout(attn_drop, inplace=True)

        if self.use_pe and not self.no_off:
            if self.dwc_pe:
                self.rpe_table = nn.Conv2d(
                    self.nc, self.nc, kernel_size=3, stride=1, padding=1, groups=self.nc)
            elif self.fixed_pe:
                self.rpe_table = nn.Parameter(
                    torch.zeros(self.n_heads, self.q_h * self.q_w, self.kv_h * self.kv_w)
                )
                trunc_normal_(self.rpe_table, std=0.01)
            elif self.log_cpb:
                # Borrowed from Swin-V2
                self.rpe_table = nn.Sequential(
                    nn.Linear(2, 32, bias=True),
                    nn.ReLU(inplace=True),
                    nn.Linear(32, self.n_group_heads, bias=False)
                )
            else:
                self.rpe_table = nn.Parameter(
                    torch.zeros(self.n_heads, self.q_h * 2 - 1, self.q_w * 2 - 1)
                )
                trunc_normal_(self.rpe_table, std=0.01)
        else:
            self.rpe_table = None

    @torch.no_grad()
    def _get_ref_points(self, H_key, W_key, B, dtype, device):
        """Return the (y, x) reference grid, expanded to (B * n_groups, H_key, W_key, 2).

        Cell centres 0.5 .. size - 0.5 are divided by (size - 1), then mapped
        through ``* 2 - 1``.
        """
        ref_y, ref_x = torch.meshgrid(
            torch.linspace(0.5, H_key - 0.5, H_key, dtype=dtype, device=device),
            torch.linspace(0.5, W_key - 0.5, W_key, dtype=dtype, device=device),
            indexing='ij'
        )
        ref = torch.stack((ref_y, ref_x), -1)
        ref[..., 1].div_(W_key - 1.0).mul_(2.0).sub_(1.0)
        ref[..., 0].div_(H_key - 1.0).mul_(2.0).sub_(1.0)
        ref = ref[None, ...].expand(B * self.n_groups, -1, -1, -1) # B * g H W 2

        return ref

    @torch.no_grad()
    def _get_q_grid(self, H, W, B, dtype, device):
        """Return the (y, x) query grid in [-1, 1], expanded to (B * n_groups, H, W, 2)."""
        ref_y, ref_x = torch.meshgrid(
            torch.arange(0, H, dtype=dtype, device=device),
            torch.arange(0, W, dtype=dtype, device=device),
            indexing='ij'
        )
        ref = torch.stack((ref_y, ref_x), -1)
        ref[..., 1].div_(W - 1.0).mul_(2.0).sub_(1.0)
        ref[..., 0].div_(H - 1.0).mul_(2.0).sub_(1.0)
        ref = ref[None, ...].expand(B * self.n_groups, -1, -1, -1) # B * g H W 2

        return ref

    def forward(self, x):
        """
        Args:
            x: (B, C, H, W) tensor
        Return:
            (B, C, H, W) tensor
        """
        B, C, H, W = x.size()
        dtype, device = x.dtype, x.device

        q = self.proj_q(x)
        q_off = einops.rearrange(q, 'b (g c) h w -> (b g) c h w', g=self.n_groups, c=self.n_group_channels)
        offset = self.conv_offset(q_off).contiguous()  # B * g 2 Hg Wg
        Hk, Wk = offset.size(2), offset.size(3)
        n_sample = Hk * Wk

        if self.offset_range_factor >= 0 and not self.no_off:
            # bound the offsets to offset_range_factor / (size - 1) per axis
            offset_range = torch.tensor([1.0 / (Hk - 1.0), 1.0 / (Wk - 1.0)], device=device).reshape(1, 2, 1, 1)
            offset = offset.tanh().mul(offset_range).mul(self.offset_range_factor)

        offset = einops.rearrange(offset, 'b p h w -> b h w p')
        reference = self._get_ref_points(Hk, Wk, B, dtype, device)

        if self.no_off:
            offset = offset.fill_(0.0)

        if self.offset_range_factor >= 0:
            pos = offset + reference
        else:
            pos = (offset + reference).clamp(-1., +1.)

        if self.no_off:
            x_sampled = F.avg_pool2d(x, kernel_size=self.stride, stride=self.stride)
            assert x_sampled.size(2) == Hk and x_sampled.size(3) == Wk, f"Size is {x_sampled.size()}"
        else:
            # grid_sample needs the grid in the dtype of its input
            pos = pos.type(x.dtype)
            x_sampled = F.grid_sample(
                input=x.reshape(B * self.n_groups, self.n_group_channels, H, W),
                grid=pos[..., (1, 0)], # y, x -> x, y
                mode='bilinear', align_corners=True) # B * g, Cg, Hg, Wg

        x_sampled = x_sampled.reshape(B, C, 1, n_sample)

        q = q.reshape(B * self.n_heads, self.n_head_channels, H * W)
        k = self.proj_k(x_sampled).reshape(B * self.n_heads, self.n_head_channels, n_sample)
        v = self.proj_v(x_sampled).reshape(B * self.n_heads, self.n_head_channels, n_sample)

        attn = torch.einsum('b c m, b c n -> b m n', q, k) # B * h, HW, Ns
        attn = attn.mul(self.scale)

        # The positional term is only available when it was built in __init__,
        # i.e. when use_pe is set and offsets are enabled.
        pe_enabled = self.use_pe and (not self.no_off)
        if pe_enabled:

            if self.dwc_pe:
                residual_lepe = self.rpe_table(q.reshape(B, C, H, W)).reshape(B * self.n_heads, self.n_head_channels, H * W)
            elif self.fixed_pe:
                rpe_table = self.rpe_table
                attn_bias = rpe_table[None, ...].expand(B, -1, -1, -1)
                attn = attn + attn_bias.reshape(B * self.n_heads, H * W, n_sample)
            elif self.log_cpb:
                q_grid = self._get_q_grid(H, W, B, dtype, device)
                displacement = (q_grid.reshape(B * self.n_groups, H * W, 2).unsqueeze(2) - pos.reshape(B * self.n_groups, n_sample, 2).unsqueeze(1)).mul(4.0) # d_y, d_x [-8, +8]
                displacement = torch.sign(displacement) * torch.log2(torch.abs(displacement) + 1.0) / np.log2(8.0)
                attn_bias = self.rpe_table(displacement) # B * g, H * W, n_sample, h_g
                attn = attn + einops.rearrange(attn_bias, 'b m n h -> (b h) m n', h=self.n_group_heads)
            else:
                rpe_table = self.rpe_table
                rpe_bias = rpe_table[None, ...].expand(B, -1, -1, -1)
                q_grid = self._get_q_grid(H, W, B, dtype, device)
                displacement = (q_grid.reshape(B * self.n_groups, H * W, 2).unsqueeze(2) - pos.reshape(B * self.n_groups, n_sample, 2).unsqueeze(1)).mul(0.5)
                attn_bias = F.grid_sample(
                    input=einops.rearrange(rpe_bias, 'b (g c) h w -> (b g) c h w', c=self.n_group_heads, g=self.n_groups),
                    grid=displacement[..., (1, 0)],
                    mode='bilinear', align_corners=True) # B * g, h_g, HW, Ns

                attn_bias = attn_bias.reshape(B * self.n_heads, H * W, n_sample)
                attn = attn + attn_bias

        attn = F.softmax(attn, dim=2)
        attn = self.attn_drop(attn)

        out = torch.einsum('b m n, b c n -> b c m', attn, v)

        # Guard with the same condition that computes residual_lepe: with
        # no_off=True it is never assigned (and rpe_table is None), so adding
        # it unconditionally raised UnboundLocalError.
        if pe_enabled and self.dwc_pe:
            out = out + residual_lepe
        out = out.reshape(B, C, H, W)

        y = self.proj_drop(self.proj_out(out))

        return y
================================================
FILE: cv-attention/ECA.py
================================================
import torch, math
from torch import nn
class EfficientChannelAttention(nn.Module):           # Efficient Channel Attention module
    """Channel gate from a 1-D convolution across the pooled channel descriptor."""

    def __init__(self, c, b=1, gamma=2):
        """
        Args:
            c: number of input channels; sets the adaptive kernel size
            b, gamma: constants of the kernel-size formula
                int(abs((log2(c) + b) / gamma)), rounded up to an odd number
        """
        super().__init__()
        raw_size = int(abs((math.log(c, 2) + b) / gamma))
        if raw_size % 2:
            kernel = raw_size
        else:
            kernel = raw_size + 1

        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv1d(1, 1, kernel_size=kernel, padding=int(kernel / 2), bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` scaled by the per-channel gate."""
        pooled = self.avg_pool(x)                               # b,c,1,1
        as_sequence = pooled.squeeze(-1).transpose(-1, -2)      # b,1,c
        mixed = self.conv1(as_sequence)
        gate = mixed.transpose(-1, -2).unsqueeze(-1)            # b,c,1,1
        gate = self.sigmoid(gate)
        return gate * x
if __name__ == '__main__':
    # smoke test: the output shape should equal the input shape
    feats = torch.randn(50, 512, 7, 7)
    eca = EfficientChannelAttention(c=512)
    print(eca(feats).shape)
================================================
FILE: cv-attention/ELA.py
================================================
import torch.nn as nn
class ELA(nn.Module):
    """Attention gates along height and width from pooled strips and a shared 1-D branch."""

    def __init__(self, channels, num_groups=16) -> None:
        """
        Args:
            channels: number of input/output channels
            num_groups: number of GroupNorm groups; ``channels`` must be
                divisible by it. Defaults to 16, the previously fixed value.
        """
        super().__init__()
        self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
        self.pool_w = nn.AdaptiveAvgPool2d((1, None))
        # the same conv -> norm -> sigmoid branch serves both directions
        self.conv1x1 = nn.Sequential(
            nn.Conv1d(channels, channels, 1),
            nn.GroupNorm(num_groups, channels),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return ``x`` scaled by the height gate and the width gate."""
        b, c, h, w = x.size()
        # pooled strips are flattened to (b, c, length) for the 1-D branch
        x_h = self.conv1x1(self.pool_h(x).reshape((b, c, h))).reshape((b, c, h, 1))
        x_w = self.conv1x1(self.pool_w(x).reshape((b, c, w))).reshape((b, c, 1, w))
        return x * x_h * x_w
================================================
FILE: cv-attention/EMA.py
================================================
import torch
from torch import nn
class EMA(nn.Module):
    """Grouped attention combining a 1x1 directional branch and a 3x3 branch."""

    def __init__(self, channels, factor=8):
        """
        Args:
            channels: number of input/output channels
            factor: number of channel groups folded into the batch dimension
        """
        super().__init__()
        self.groups = factor
        assert channels // self.groups > 0
        group_channels = channels // self.groups
        self.softmax = nn.Softmax(-1)
        self.agp = nn.AdaptiveAvgPool2d((1, 1))
        self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
        self.pool_w = nn.AdaptiveAvgPool2d((1, None))
        self.gn = nn.GroupNorm(group_channels, group_channels)
        self.conv1x1 = nn.Conv2d(group_channels, group_channels, kernel_size=1, stride=1, padding=0)
        self.conv3x3 = nn.Conv2d(group_channels, group_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Return a tensor with the same shape as ``x``."""
        b, c, h, w = x.size()
        folded = b * self.groups
        group_x = x.reshape(folded, -1, h, w)  # b*g,c//g,h,w

        # 1x1 branch: joint encoding of the height and width strips
        strip_h = self.pool_h(group_x)
        strip_w = self.pool_w(group_x).permute(0, 1, 3, 2)
        encoded = self.conv1x1(torch.cat([strip_h, strip_w], dim=2))
        strip_h, strip_w = torch.split(encoded, [h, w], dim=2)
        gate_h = strip_h.sigmoid()
        gate_w = strip_w.permute(0, 1, 3, 2).sigmoid()
        branch_1x1 = self.gn(group_x * gate_h * gate_w)

        # 3x3 branch
        branch_3x3 = self.conv3x3(group_x)

        # each branch's pooled channel descriptor weights the other branch's map
        desc_1x1 = self.softmax(self.agp(branch_1x1).reshape(folded, -1, 1).permute(0, 2, 1))
        flat_3x3 = branch_3x3.reshape(folded, c // self.groups, -1)  # b*g, c//g, hw
        desc_3x3 = self.softmax(self.agp(branch_3x3).reshape(folded, -1, 1).permute(0, 2, 1))
        flat_1x1 = branch_1x1.reshape(folded, c // self.groups, -1)  # b*g, c//g, hw

        weights = torch.matmul(desc_1x1, flat_3x3) + torch.matmul(desc_3x3, flat_1x1)
        weights = weights.reshape(folded, 1, h, w)
        return (group_x * weights.sigmoid()).reshape(b, c, h, w)
================================================
FILE: cv-attention/EffectiveSE.py
================================================
import torch
from torch import nn as nn
from timm.models.layers.create_act import create_act_layer
class EffectiveSEModule(nn.Module):
    """Squeeze-and-excitation with a single 1x1 convolution and a gate."""

    def __init__(self, channels, add_maxpool=False, gate_layer='hard_sigmoid'):
        """
        Args:
            channels: number of input/output channels
            add_maxpool: average the mean descriptor with the max descriptor
            gate_layer: gate activation, resolved through ``create_act_layer``
        """
        super().__init__()
        self.add_maxpool = add_maxpool
        self.fc = nn.Conv2d(channels, channels, kernel_size=1, padding=0)
        self.gate = create_act_layer(gate_layer)

    def forward(self, x):
        """Return ``x`` scaled by the gated channel descriptor."""
        descriptor = x.mean((2, 3), keepdim=True)
        if self.add_maxpool:
            # experimental codepath, may remove or change
            descriptor = 0.5 * descriptor + 0.5 * x.amax((2, 3), keepdim=True)
        excitation = self.fc(descriptor)
        return x * self.gate(excitation)
if __name__ == '__main__':
    # smoke test: the output shape should equal the input shape
    feats = torch.randn(50, 512, 7, 7)
    ese = EffectiveSEModule(512)
    print(ese(feats).shape)
================================================
FILE: cv-attention/GAM.py
================================================
import torch.nn as nn
import torch
class GAM_Attention(nn.Module):
    """Channel attention (per-position MLP) followed by spatial attention (7x7 convs)."""

    def __init__(self, in_channels, rate=4):
        """
        Args:
            in_channels: number of input/output channels
            rate: reduction ratio of the hidden width in both branches
        """
        super(GAM_Attention, self).__init__()
        hidden_channels = int(in_channels / rate)

        self.channel_attention = nn.Sequential(
            nn.Linear(in_channels, hidden_channels),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_channels, in_channels)
        )

        self.spatial_attention = nn.Sequential(
            nn.Conv2d(in_channels, hidden_channels, kernel_size=7, padding=3),
            nn.BatchNorm2d(hidden_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(hidden_channels, in_channels, kernel_size=7, padding=3),
            nn.BatchNorm2d(in_channels)
        )

    def forward(self, x):
        """Return ``x`` gated by the channel attention, then by the spatial attention."""
        b, c, h, w = x.shape
        # reshape (not view): the permuted tensor cannot always be viewed as
        # (b, h*w, c), e.g. when x is a width-sliced, non-contiguous tensor.
        x_permute = x.permute(0, 2, 3, 1).reshape(b, -1, c)
        x_att_permute = self.channel_attention(x_permute).reshape(b, h, w, c)
        x_channel_att = x_att_permute.permute(0, 3, 1, 2).sigmoid()

        x = x * x_channel_att

        x_spatial_att = self.spatial_attention(x).sigmoid()
        out = x * x_spatial_att

        return out
if __name__ == '__main__':
    # smoke test: the output size should equal the input size
    sample = torch.randn(1, 64, 20, 20)
    n_channels = sample.shape[1]
    net = GAM_Attention(in_channels=n_channels)
    print(net(sample).size())
================================================
FILE: cv-attention/GC.py
================================================
import torch
from torch import nn as nn
import torch.nn.functional as F
from timm.models.layers.create_act import create_act_layer, get_act_layer
from timm.models.layers import make_divisible
from timm.models.layers.mlp import ConvMlp
from timm.models.layers.norm import LayerNorm2d
class GlobalContext(nn.Module):
    """Global Context attention block.

    One context vector per sample is pooled from the feature map: a
    softmax-weighted sum over positions when ``use_attn`` is true, otherwise
    the plain spatial mean. The context is fused back into the input through
    a gated multiplication (``fuse_scale``) and/or an addition (``fuse_add``).
    """

    def __init__(self, channels, use_attn=True, fuse_add=False, fuse_scale=True, init_last_zero=False,
                 rd_ratio=1./8, rd_channels=None, rd_divisor=1, act_layer=nn.ReLU, gate_layer='sigmoid'):
        super().__init__()
        act_layer = get_act_layer(act_layer)

        if use_attn:
            self.conv_attn = nn.Conv2d(channels, 1, kernel_size=1, bias=True)
        else:
            self.conv_attn = None

        if rd_channels is None:
            rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.)

        def make_mlp():
            return ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=LayerNorm2d)

        # Additive branch is created before the scale branch.
        self.mlp_add = make_mlp() if fuse_add else None
        self.mlp_scale = make_mlp() if fuse_scale else None

        self.gate = create_act_layer(gate_layer)
        # Stored only; reset_parameters() does not read it.
        self.init_last_zero = init_last_zero
        self.reset_parameters()

    def reset_parameters(self):
        """Kaiming-init the attention conv and zero the additive branch's last weight."""
        attn_conv = self.conv_attn
        if attn_conv is not None:
            nn.init.kaiming_normal_(attn_conv.weight, mode='fan_in', nonlinearity='relu')
        add_branch = self.mlp_add
        if add_branch is not None:
            nn.init.zeros_(add_branch.fc2.weight)

    def forward(self, x):
        """Return ``x`` modulated by its global context (same shape as ``x``)."""
        B, C, H, W = x.shape
        if self.conv_attn is None:
            context = x.mean(dim=(2, 3), keepdim=True)
        else:
            scores = self.conv_attn(x).reshape(B, 1, H * W)  # (B, 1, H * W)
            scores = F.softmax(scores, dim=-1).unsqueeze(3)  # (B, 1, H * W, 1)
            flat = x.reshape(B, C, H * W).unsqueeze(1)  # (B, 1, C, H * W)
            context = (flat @ scores).view(B, C, 1, 1)

        if self.mlp_scale is not None:
            x = x * self.gate(self.mlp_scale(context))
        if self.mlp_add is not None:
            x = x + self.mlp_add(context)
        return x
if __name__ == '__main__':
    # Smoke test: default configuration on a random batch.
    feats = torch.randn(50, 512, 7, 7)
    block = GlobalContext(512)
    print(block(feats).shape)
================================================
FILE: cv-attention/GE.py
================================================
import math, torch
from torch import nn as nn
import torch.nn.functional as F
from timm.models.layers.create_act import create_act_layer, get_act_layer
from timm.models.layers.create_conv2d import create_conv2d
from timm.models.layers import make_divisible
from timm.models.layers.mlp import ConvMlp
class GatherExcite(nn.Module):
    """Gather-Excite attention.

    A "gather" step aggregates spatial context, either with learned depthwise
    convolutions (``extra_params=True``) or with parameter-free pooling. An
    optional MLP and a gate activation turn it into a multiplicative mask for
    the input. ``extent == 0`` means global extent.
    """

    def __init__(
            self, channels, feat_size=None, extra_params=False, extent=0, use_mlp=True,
            rd_ratio=1./16, rd_channels=None, rd_divisor=1, add_maxpool=False,
            act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, gate_layer='sigmoid'):
        super().__init__()
        self.add_maxpool = add_maxpool
        act_layer = get_act_layer(act_layer)
        self.extent = extent

        if not extra_params:
            # Parameter-free gather: only pooling kernel/stride are recorded.
            self.gather = None
            if self.extent == 0:
                self.gk = 0
                self.gs = 0
            else:
                assert extent % 2 == 0
                self.gk = self.extent * 2 - 1
                self.gs = self.extent
        else:
            self.gather = nn.Sequential()
            if extent == 0:
                assert feat_size is not None, 'spatial feature size must be specified for global extent w/ params'
                self.gather.add_module(
                    'conv1', create_conv2d(channels, channels, kernel_size=feat_size, stride=1, depthwise=True))
                if norm_layer:
                    # norm_layer acts only as an on/off switch; BatchNorm2d is always used.
                    self.gather.add_module(f'norm1', nn.BatchNorm2d(channels))
            else:
                assert extent % 2 == 0
                num_conv = int(math.log2(extent))
                for idx in range(1, num_conv + 1):
                    self.gather.add_module(
                        f'conv{idx}',
                        create_conv2d(channels, channels, kernel_size=3, stride=2, depthwise=True))
                    if norm_layer:
                        self.gather.add_module(f'norm{idx}', nn.BatchNorm2d(channels))
                    # No activation after the last conv stage.
                    if idx != num_conv:
                        self.gather.add_module(f'act{idx}', act_layer(inplace=True))

        if not rd_channels:
            rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.)
        if use_mlp:
            self.mlp = ConvMlp(channels, rd_channels, act_layer=act_layer)
        else:
            self.mlp = nn.Identity()
        self.gate = create_act_layer(gate_layer)

    def forward(self, x):
        """Return ``x`` multiplied by the gated, gathered context (same shape as ``x``)."""
        size = x.shape[-2:]
        if self.gather is not None:
            x_ge = self.gather(x)
        elif self.extent == 0:
            # global extent
            x_ge = x.mean(dim=(2, 3), keepdims=True)
            if self.add_maxpool:
                # experimental codepath, may remove or change
                x_ge = 0.5 * x_ge + 0.5 * x.amax((2, 3), keepdim=True)
        else:
            pool_args = dict(kernel_size=self.gk, stride=self.gs, padding=self.gk // 2)
            x_ge = F.avg_pool2d(x, count_include_pad=False, **pool_args)
            if self.add_maxpool:
                # experimental codepath, may remove or change
                x_ge = 0.5 * x_ge + 0.5 * F.max_pool2d(x, **pool_args)

        x_ge = self.mlp(x_ge)
        # Anything larger than 1x1 is resized to the input resolution;
        # a 1x1 map is left to broadcast.
        if x_ge.shape[-1] != 1 or x_ge.shape[-2] != 1:
            x_ge = F.interpolate(x_ge, size=size)
        return x * self.gate(x_ge)
if __name__ == '__main__':
    # Smoke test: parameter-free global-extent configuration.
    feats = torch.randn(50, 512, 7, 7)
    gather_excite = GatherExcite(512)
    print(gather_excite(feats).shape)
================================================
FILE: cv-attention/LSKA.py
================================================
import torch.nn as nn
class LSKA(nn.Module):
    """Large-Separable-Kernel-Attention.

    https://github.com/StevenLauHKHK/Large-Separable-Kernel-Attention/tree/main

    A chain of depthwise 1-D convolutions (horizontal then vertical, first
    local, then dilated) followed by a 1x1 convolution produces an attention
    map that multiplies the input element-wise.

    Args:
        dim: number of input/output channels (also the number of conv groups).
        k_size: nominal kernel size; must be a key of ``_KERNEL_CONFIGS``.

    Raises:
        ValueError: if ``k_size`` is not supported. Previously such a value
            built a module without its conv layers, which only failed later
            inside ``forward`` with an ``AttributeError``.
    """

    # k_size -> (local kernel length, dilated kernel length, dilation)
    _KERNEL_CONFIGS = {
        7: (3, 3, 2),
        11: (3, 5, 2),
        23: (5, 7, 3),
        35: (5, 11, 3),
        41: (5, 13, 3),
        53: (5, 17, 3),
    }

    def __init__(self, dim, k_size=7):
        super().__init__()
        if k_size not in self._KERNEL_CONFIGS:
            raise ValueError(
                f'Unsupported k_size {k_size!r}; expected one of {sorted(self._KERNEL_CONFIGS)}')
        self.k_size = k_size

        local_k, dilated_k, dilation = self._KERNEL_CONFIGS[k_size]
        local_pad = (local_k - 1) // 2
        # "same" padding for a dilated kernel
        dilated_pad = dilation * (dilated_k - 1) // 2

        self.conv0h = nn.Conv2d(dim, dim, kernel_size=(1, local_k), stride=(1, 1),
                                padding=(0, local_pad), groups=dim)
        self.conv0v = nn.Conv2d(dim, dim, kernel_size=(local_k, 1), stride=(1, 1),
                                padding=(local_pad, 0), groups=dim)
        self.conv_spatial_h = nn.Conv2d(dim, dim, kernel_size=(1, dilated_k), stride=(1, 1),
                                        padding=(0, dilated_pad), groups=dim, dilation=dilation)
        self.conv_spatial_v = nn.Conv2d(dim, dim, kernel_size=(dilated_k, 1), stride=(1, 1),
                                        padding=(dilated_pad, 0), groups=dim, dilation=dilation)
        self.conv1 = nn.Conv2d(dim, dim, 1)

    def forward(self, x):
        """Return ``x * attn`` where ``attn`` has the same shape as ``x``."""
        attn = self.conv0h(x)
        attn = self.conv0v(attn)
        attn = self.conv_spatial_h(attn)
        attn = self.conv_spatial_v(attn)
        attn = self.conv1(attn)
        # No clone of x is needed: none of the convolutions modify it in place.
        return x * attn
================================================
FILE: cv-attention/LSKBlock.py
================================================
import torch
import torch.nn as nn
class LSKblock(nn.Module):
    """Large selective kernel block.

    Two depthwise branches (a 5x5 conv, and a dilated 7x7 conv stacked on top
    of it) are each projected to ``dim // 2`` channels, mixed with per-pixel
    weights predicted from their channel-wise mean and max, projected back to
    ``dim`` channels and multiplied with the input.
    """

    def __init__(self, dim):
        super().__init__()
        half = dim // 2
        self.conv0 = nn.Conv2d(dim, dim, 5, padding=2, groups=dim)
        self.conv_spatial = nn.Conv2d(dim, dim, 7, stride=1, padding=9, groups=dim, dilation=3)
        self.conv1 = nn.Conv2d(dim, half, 1)
        self.conv2 = nn.Conv2d(dim, half, 1)
        self.conv_squeeze = nn.Conv2d(2, 2, 7, padding=3)
        self.conv = nn.Conv2d(half, dim, 1)

    def forward(self, x):
        """Return ``x`` multiplied by the selected-kernel attention map."""
        near = self.conv0(x)
        far = self.conv_spatial(near)
        branch_near = self.conv1(near)
        branch_far = self.conv2(far)

        both = torch.cat([branch_near, branch_far], dim=1)
        # Channel-wise mean and max give a 2-channel descriptor.
        descriptor = torch.cat(
            [torch.mean(both, dim=1, keepdim=True), torch.max(both, dim=1, keepdim=True)[0]],
            dim=1)
        weights = self.conv_squeeze(descriptor).sigmoid()

        # Channel 0 weights the near branch, channel 1 the far branch.
        mixed = branch_near * weights[:, 0:1] + branch_far * weights[:, 1:2]
        return x * self.conv(mixed)
================================================
FILE: cv-attention/MHSA.py
================================================
import torch
import torch.nn as nn
class MHSA(nn.Module):
    """Multi-head self-attention over the spatial positions of a feature map.

    Queries, keys and values come from 1x1 convolutions. With ``pos_emb=True``
    a learned position term, built from one height and one width embedding,
    is added to the attention logits.

    Args:
        n_dims: channel count; split evenly across ``heads``.
        width, height: sizes of the position embeddings (read only when
            ``pos_emb`` is true).
        heads: number of attention heads.
        pos_emb: whether to add the position term.
    """

    def __init__(self, n_dims, width=14, height=14, heads=4, pos_emb=False):
        super().__init__()
        self.heads = heads
        self.query = nn.Conv2d(n_dims, n_dims, kernel_size=1)
        self.key = nn.Conv2d(n_dims, n_dims, kernel_size=1)
        self.value = nn.Conv2d(n_dims, n_dims, kernel_size=1)
        self.pos = pos_emb
        if self.pos:
            head_dim = (n_dims) // heads
            self.rel_h_weight = nn.Parameter(
                torch.randn([1, heads, head_dim, 1, int(height)]), requires_grad=True)
            self.rel_w_weight = nn.Parameter(
                torch.randn([1, heads, head_dim, int(width), 1]), requires_grad=True)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the attended feature map, same shape as ``x``."""
        n_batch, C, width, height = x.size()
        head_dim = C // self.heads

        def per_head(projection):
            # (n, C, a, b) -> (n, heads, head_dim, a*b)
            return projection(x).view(n_batch, self.heads, head_dim, -1)

        q = per_head(self.query)
        k = per_head(self.key)
        v = per_head(self.value)

        energy = torch.matmul(q.transpose(2, 3), k)  # n, heads, tokens, tokens
        if self.pos:
            n_tokens = energy.size(2)
            rel = (self.rel_h_weight + self.rel_w_weight).view(
                1, self.heads, head_dim, -1).transpose(2, 3)
            content_position = torch.matmul(rel, q)
            # Crop the position term when the embedding grid has more entries
            # than the input has tokens.
            if energy.shape != content_position.shape:
                content_position = content_position[:, :, :n_tokens, ]
            assert (energy.shape == content_position.shape)
            energy = energy + content_position

        attention = self.softmax(energy)
        out = torch.matmul(v, attention.transpose(2, 3))
        return out.view(n_batch, C, width, height)
if __name__ == '__main__':
    # Smoke test without position embeddings (the default).
    feats = torch.randn(50, 512, 7, 7)
    attention = MHSA(n_dims=512)
    print(attention(feats).shape)
================================================
FILE: cv-attention/MLCA.py
================================================
import math, torch
from torch import nn
import torch.nn.functional as F
class MLCA(nn.Module):
    """Mixed local/global channel attention.

    A ``local_size`` x ``local_size`` pooled map and its global average are
    each passed through a 1-D convolution, squashed with a sigmoid, blended
    with ``local_weight`` and resized to the input resolution to rescale it.
    """

    def __init__(self, in_size, local_size=5, gamma = 2, b = 1,local_weight=0.5):
        super().__init__()
        self.local_size = local_size
        self.gamma = gamma
        self.b = b
        # ECA-style kernel size: derived from log2(in_size) and forced to be odd.
        t = int(abs(math.log(in_size, 2) + self.b) / self.gamma)  # eca gamma=2
        k = t if t % 2 else t + 1
        pad = (k - 1) // 2

        self.conv = nn.Conv1d(1, 1, kernel_size=k, padding=pad, bias=False)
        self.conv_local = nn.Conv1d(1, 1, kernel_size=k, padding=pad, bias=False)

        self.local_weight = local_weight
        self.local_arv_pool = nn.AdaptiveAvgPool2d(local_size)
        self.global_arv_pool = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        """Return ``x`` scaled by the blended attention map (same shape as ``x``)."""
        local_feat = self.local_arv_pool(x)
        global_feat = self.global_arv_pool(local_feat)

        b, c, m, n = x.shape
        side = self.local_size

        # (b,c,s,s) -> (b,c,s*s) -> (b,s*s,c) -> (b,1,s*s*c)
        local_seq = local_feat.view(b, local_feat.shape[1], -1).transpose(-1, -2).reshape(b, 1, -1)
        # (b,c,1,1) -> (b,c,1) -> (b,1,c)
        global_seq = global_feat.view(b, c, -1).transpose(-1, -2)

        local_out = self.conv_local(local_seq)
        global_out = self.conv(global_seq)

        # (b,1,s*s*c) -> (b,s*s,c) -> (b,c,s*s) -> (b,c,s,s)
        local_map = local_out.reshape(b, side * side, c).transpose(-1, -2).view(b, c, side, side)
        # (b,1,c) -> (b,c,1) -> (b,c,1,1)
        global_map = global_out.transpose(-1, -2).unsqueeze(-1)

        # "Un-pooling": adaptive pooling resizes the gates back up.
        att_local = local_map.sigmoid()
        att_global = F.adaptive_avg_pool2d(global_map.sigmoid(), [side, side])
        blended = att_global * (1 - self.local_weight) + (att_local * self.local_weight)
        att_all = F.adaptive_avg_pool2d(blended, [m, n])
        return x * att_all
if __name__ == '__main__':
    # Smoke test: in_size matches the channel count of the sample.
    mlca = MLCA(in_size=256)
    sample = torch.randn((2, 256, 16, 16))
    print(mlca(sample).size())
================================================
FILE: cv-attention/MobileViTAttention.py
================================================
from torch import nn
import torch
from einops import rearrange
class PreNorm(nn.Module):
    """Apply ``LayerNorm(dim)`` to the input and then call the wrapped ``fn``."""

    def __init__(self, dim, fn):
        super().__init__()
        self.ln = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        """Normalise ``x`` and forward it, with any keyword arguments, to ``fn``."""
        normed = self.ln(x)
        return self.fn(normed, **kwargs)
class FeedForward(nn.Module):
    """Two-layer MLP: Linear -> SiLU -> Dropout -> Linear -> Dropout."""

    def __init__(self, dim, mlp_dim, dropout):
        super().__init__()
        stages = [
            nn.Linear(dim, mlp_dim),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the MLP; the last dimension stays ``dim``."""
        return self.net(x)
class Attention(nn.Module):
    """Scaled dot-product multi-head attention applied within each patch group.

    ``forward`` expects the last dimension of its input to be ``dim``; the
    ``rearrange`` patterns name the leading axes ``b p n``.
    """

    def __init__(self, dim, heads, head_dim, dropout):
        super().__init__()
        inner_dim = heads * head_dim
        self.heads = heads
        self.scale = head_dim ** -0.5
        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        # The output projection is skipped only when a single head already
        # spans the full model width.
        if heads == 1 and head_dim == dim:
            self.to_out = nn.Identity()
        else:
            self.to_out = nn.Sequential(
                nn.Linear(inner_dim, dim),
                nn.Dropout(dropout)
            )

    def forward(self, x):
        """Return the attended tokens with the same shape as ``x``."""
        q, k, v = [
            rearrange(part, 'b p n (h d) -> b p h n d', h=self.heads)
            for part in self.to_qkv(x).chunk(3, dim=-1)
        ]
        scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        mixed = torch.matmul(self.attend(scores), v)
        merged = rearrange(mixed, 'b p h n d -> b p n (h d)')
        return self.to_out(merged)
class Transformer(nn.Module):
    """Stack of ``depth`` pre-norm blocks: residual attention, then residual feed-forward."""

    def __init__(self, dim, depth, heads, head_dim, mlp_dim, dropout=0.):
        super().__init__()
        blocks = []
        for _ in range(depth):
            attention = PreNorm(dim, Attention(dim, heads, head_dim, dropout))
            feed_forward = PreNorm(dim, FeedForward(dim, mlp_dim, dropout))
            blocks.append(nn.ModuleList([attention, feed_forward]))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        """Apply every block in order and return a tensor shaped like ``x``."""
        hidden = x
        for attention, feed_forward in self.layers:
            hidden = hidden + attention(hidden)
            hidden = hidden + feed_forward(hidden)
        return hidden
class MobileViTAttention(nn.Module):
    """MobileViT-style block: local convs, a patch-wise transformer, and conv fusion.

    Args:
        in_channel: channels of the input (and output) feature map.
        dim: transformer width.
        kernel_size: kernel size of the first and last convolutions.
        patch_size: side of the square patches; the ``rearrange`` patterns
            require the input height and width to be multiples of it.
    """

    def __init__(self, in_channel=3, dim=512, kernel_size=3, patch_size=7):
        super().__init__()
        self.ph, self.pw = patch_size, patch_size
        self.conv1 = nn.Conv2d(in_channel, in_channel, kernel_size=kernel_size, padding=kernel_size // 2)
        self.conv2 = nn.Conv2d(in_channel, dim, kernel_size=1)
        self.trans = Transformer(dim=dim, depth=3, heads=8, head_dim=64, mlp_dim=1024)
        self.conv3 = nn.Conv2d(dim, in_channel, kernel_size=1)
        self.conv4 = nn.Conv2d(2 * in_channel, in_channel, kernel_size=kernel_size, padding=kernel_size // 2)

    def forward(self, x):
        """Return a tensor with the same shape as ``x`` (bs, c, h, w)."""
        # The former `y = x.clone()` was overwritten immediately and only cost
        # a full copy of the input, so it is gone.

        ## Local Representation
        y = self.conv2(self.conv1(x))  # bs,dim,h,w

        ## Global Representation
        _, _, h, w = y.shape
        # Group pixels by their position inside each patch.
        y = rearrange(y, 'bs dim (nh ph) (nw pw) -> bs (ph pw) (nh nw) dim',
                      ph=self.ph, pw=self.pw)  # bs,ph*pw,nh*nw,dim
        y = self.trans(y)
        y = rearrange(y, 'bs (ph pw) (nh nw) dim -> bs dim (nh ph) (nw pw)',
                      ph=self.ph, pw=self.pw, nh=h // self.ph, nw=w // self.pw)  # bs,dim,h,w

        ## Fusion
        y = self.conv3(y)  # bs,c,h,w
        y = torch.cat([x, y], 1)  # bs,2*c,h,w
        y = self.conv4(y)  # bs,c,h,w
        return y
if __name__ == '__main__':
    # Smoke test: 49 is a multiple of the default patch size (7).
    block = MobileViTAttention(in_channel=512)
    sample = torch.randn(1, 512, 49, 49)
    print(block(sample).shape)
================================================
FILE: cv-attention/ParNetAttention.py
================================================
import numpy as np
import torch
from torch import nn
from torch.nn import init
class ParNetAttention(nn.Module):
    """Sum of three parallel branches followed by SiLU.

    The branches are a 1x1 conv + BN, a 3x3 conv + BN, and the input gated by
    a squeeze-excitation style sigmoid (global pool -> 1x1 conv -> sigmoid).
    """

    def __init__(self, channel=512):
        super().__init__()
        self.sse = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channel, channel, kernel_size=1),
            nn.Sigmoid()
        )
        self.conv1x1 = nn.Sequential(
            nn.Conv2d(channel, channel, kernel_size=1),
            nn.BatchNorm2d(channel)
        )
        self.conv3x3 = nn.Sequential(
            nn.Conv2d(channel, channel, kernel_size=3, padding=1),
            nn.BatchNorm2d(channel)
        )
        self.silu = nn.SiLU()

    def forward(self, x):
        """Return ``silu(conv1x1(x) + conv3x3(x) + sse(x) * x)``."""
        # The values are unused; the unpacking rejects inputs that are not 4-D.
        _batch, _channels, _, _ = x.size()
        pointwise = self.conv1x1(x)
        spatial = self.conv3x3(x)
        gated = self.sse(x) * x
        return self.silu(pointwise + spatial + gated)
if __name__ == '__main__':
    # Smoke test on a random batch.
    feats = torch.randn(50, 512, 7, 7)
    block = ParNetAttention(channel=512)
    print(block(feats).shape)
================================================
FILE: cv-attention/PolarizedSelfAttention.py
================================================
import numpy as np
import torch
from torch import nn
from torch.nn import init
class ParallelPolarizedSelfAttention(nn.Module):
    """Polarized self-attention, parallel layout.

    A channel-only gate and a spatial-only gate are both computed from the
    input and applied to it independently; the two gated tensors are summed.
    """

    def __init__(self, channel=512):
        super().__init__()
        half = channel // 2
        self.ch_wv = nn.Conv2d(channel, half, kernel_size=(1, 1))
        self.ch_wq = nn.Conv2d(channel, 1, kernel_size=(1, 1))
        self.softmax_channel = nn.Softmax(1)
        self.softmax_spatial = nn.Softmax(-1)
        self.ch_wz = nn.Conv2d(half, channel, kernel_size=(1, 1))
        self.ln = nn.LayerNorm(channel)
        self.sigmoid = nn.Sigmoid()
        self.sp_wv = nn.Conv2d(channel, half, kernel_size=(1, 1))
        self.sp_wq = nn.Conv2d(channel, half, kernel_size=(1, 1))
        self.agp = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        """Return ``spatial_gate * x + channel_gate * x`` (same shape as ``x``)."""
        b, c, h, w = x.size()
        half = c // 2

        # Channel-only self-attention
        ch_values = self.ch_wv(x).reshape(b, half, -1)  # bs,c//2,h*w
        ch_query = self.ch_wq(x).reshape(b, -1, 1)  # bs,h*w,1
        ch_query = self.softmax_channel(ch_query)  # softmax over the h*w axis
        ch_context = torch.matmul(ch_values, ch_query).unsqueeze(-1)  # bs,c//2,1,1
        ch_logits = self.ch_wz(ch_context).reshape(b, c, 1).permute(0, 2, 1)  # bs,1,c
        channel_weight = self.sigmoid(self.ln(ch_logits)).permute(0, 2, 1).reshape(b, c, 1, 1)
        channel_out = channel_weight * x

        # Spatial-only self-attention
        sp_values = self.sp_wv(x)  # bs,c//2,h,w
        sp_query = self.agp(self.sp_wq(x))  # bs,c//2,1,1
        sp_values = sp_values.reshape(b, half, -1)  # bs,c//2,h*w
        sp_query = sp_query.permute(0, 2, 3, 1).reshape(b, 1, half)  # bs,1,c//2
        sp_query = self.softmax_spatial(sp_query)
        sp_context = torch.matmul(sp_query, sp_values)  # bs,1,h*w
        spatial_weight = self.sigmoid(sp_context.reshape(b, 1, h, w))  # bs,1,h,w
        spatial_out = spatial_weight * x

        return spatial_out + channel_out
if __name__ == '__main__':
    # Smoke test on a single random sample.
    feats = torch.randn(1, 512, 7, 7)
    attention = ParallelPolarizedSelfAttention(channel=512)
    print(attention(feats).shape)
================================================
FILE: cv-attention/S2Attention.py
================================================
import numpy as np
import torch
from torch import nn
from torch.nn import init
def spatial_shift1(x):
    """Shift the four channel quarters of ``x`` by one step, in place.

    ``x`` is indexed as ``(b, w, h, c)``. In order, the four channel groups
    move +1 along axis 1, -1 along axis 1, +1 along axis 2 and -1 along
    axis 2. The border row/column that has no source keeps its old values.

    Each source slice is cloned before the assignment: source and destination
    are overlapping views of the same storage, and PyTorch does not define the
    result of such an in-place copy (a forward shift can end up repeating the
    first row instead of shifting).

    Returns:
        The same tensor object ``x``, modified in place.
    """
    b, w, h, c = x.size()
    q1, q2, q3 = c // 4, c // 2, c * 3 // 4
    x[:, 1:, :, :q1] = x[:, :w - 1, :, :q1].clone()
    x[:, :w - 1, :, q1:q2] = x[:, 1:, :, q1:q2].clone()
    x[:, :, 1:, q2:q3] = x[:, :, :h - 1, q2:q3].clone()
    x[:, :, :h - 1, 3 * c // 4:] = x[:, :, 1:, 3 * c // 4:].clone()
    return x
def spatial_shift2(x):
    """Shift the four channel quarters of ``x`` by one step, in place.

    ``x`` is indexed as ``(b, w, h, c)``. In order, the four channel groups
    move +1 along axis 2, -1 along axis 2, +1 along axis 1 and -1 along
    axis 1. The border row/column that has no source keeps its old values.

    Each source slice is cloned before the assignment: source and destination
    are overlapping views of the same storage, and PyTorch does not define the
    result of such an in-place copy (a forward shift can end up repeating the
    first column instead of shifting).

    Returns:
        The same tensor object ``x``, modified in place.
    """
    b, w, h, c = x.size()
    q1, q2, q3 = c // 4, c // 2, c * 3 // 4
    x[:, :, 1:, :q1] = x[:, :, :h - 1, :q1].clone()
    x[:, :, :h - 1, q1:q2] = x[:, :, 1:, q1:q2].clone()
    x[:, 1:, :, q2:q3] = x[:, :w - 1, :, q2:q3].clone()
    x[:, :w - 1, :, 3 * c // 4:] = x[:, 1:, :, 3 * c // 4:].clone()
    return x
class SplitAttention(nn.Module):
    """Fuse ``k`` feature maps with per-channel softmax weights.

    The weights are predicted by a two-layer MLP from the sum of all maps
    over the split and spatial axes.
    """

    def __init__(self, channel=512, k=3):
        super().__init__()
        self.channel = channel
        self.k = k
        self.mlp1 = nn.Linear(channel, channel, bias=False)
        self.gelu = nn.GELU()
        self.mlp2 = nn.Linear(channel, channel * k, bias=False)
        self.softmax = nn.Softmax(1)

    def forward(self, x_all):
        """Reduce ``(b, k, h, w, c)`` to ``(b, h, w, c)`` by a weighted sum over ``k``."""
        b, k, h, w, c = x_all.shape
        tokens = x_all.reshape(b, k, -1, c)  # bs,k,n,c
        summary = tokens.sum(1).sum(1)  # bs,c
        logits = self.mlp2(self.gelu(self.mlp1(summary)))  # bs,kc
        logits = logits.reshape(b, self.k, c)  # bs,k,c
        weights = self.softmax(logits).unsqueeze(-2)  # bs,k,1,c (softmax over k)
        fused = (weights * tokens).sum(1)  # bs,n,c
        return fused.reshape(b, h, w, c)
class S2Attention(nn.Module):
    """S2 attention: channel expansion, two spatial shifts, split attention, projection.

    Args:
        channels: channel count of the input and output feature maps.
    """

    def __init__(self, channels=512):
        super().__init__()
        self.mlp1 = nn.Linear(channels, channels * 3)
        self.mlp2 = nn.Linear(channels, channels)
        # Size the split attention from `channels`; relying on its default
        # width (512) made every other channel count fail in forward().
        self.split_attention = SplitAttention(channel=channels, k=3)

    def forward(self, x):
        """Map a ``(b, c, w, h)`` tensor to a tensor of the same shape."""
        b, c, w, h = x.size()
        x = x.permute(0, 2, 3, 1)  # channels last
        x = self.mlp1(x)  # last dim becomes 3*c
        # The three c-wide slices are views of the same tensor; the shift
        # helpers modify the first two in place, the third stays unshifted.
        x1 = spatial_shift1(x[:, :, :, :c])
        x2 = spatial_shift2(x[:, :, :, c:c * 2])
        x3 = x[:, :, :, c * 2:]
        x_all = torch.stack([x1, x2, x3], 1)
        a = self.split_attention(x_all)
        x = self.mlp2(a)
        x = x.permute(0, 3, 1, 2)  # back to channels first
        return x
if __name__ == '__main__':
    # Smoke test on a random batch.
    feats = torch.randn(50, 512, 7, 7)
    attention = S2Attention(channels=512)
    print(attention(feats).shape)
================================================
FILE: cv-attention/SE.py
================================================
import numpy as np
import torch
from torch import nn
from torch.nn import init
class SEAttention(nn.Module):
    """Squeeze-and-Excitation: global average pool -> bottleneck MLP -> sigmoid gate."""

    def __init__(self, channel=512,reduction=16):
        super().__init__()
        squeezed = channel // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, squeezed, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(squeezed, channel, bias=False),
            nn.Sigmoid()
        )

    def init_weights(self):
        """Re-initialise conv, batch-norm and linear submodules (not called by ``__init__``)."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                init.kaiming_normal_(module.weight, mode='fan_out')
                if module.bias is not None:
                    init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.BatchNorm2d):
                init.constant_(module.weight, 1)
                init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Linear):
                init.normal_(module.weight, std=0.001)
                if module.bias is not None:
                    init.constant_(module.bias, 0)

    def forward(self, x):
        """Scale every channel of ``x`` by its learned gate."""
        b, c, _, _ = x.size()
        squeezed = self.avg_pool(x).view(b, c)
        gate = self.fc(squeezed).view(b, c, 1, 1)
        return x * gate.expand_as(x)
if __name__ == '__main__':
    # Smoke test on a random batch.
    feats = torch.randn(50, 512, 7, 7)
    attention = SEAttention(channel=512, reduction=8)
    print(attention(feats).shape)
================================================
FILE: cv-attention/SGE.py
================================================
import numpy as np
import torch
from torch import nn
from torch.nn import init
class SpatialGroupEnhance(nn.Module):
    """Spatial group-wise enhancement.

    Channels are split into ``groups``. Within each group, the similarity
    between every position and the group's pooled descriptor is standardised
    over space, affinely transformed per group and turned into a sigmoid
    gate. The channel count must be divisible by ``groups``.
    """

    def __init__(self, groups=8):
        super().__init__()
        self.groups = groups
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.weight = nn.Parameter(torch.zeros(1, groups, 1, 1))
        self.bias = nn.Parameter(torch.zeros(1, groups, 1, 1))
        self.sig = nn.Sigmoid()
        self.init_weights()

    def init_weights(self):
        """Re-initialise any conv, batch-norm or linear submodules."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                init.kaiming_normal_(module.weight, mode='fan_out')
                if module.bias is not None:
                    init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.BatchNorm2d):
                init.constant_(module.weight, 1)
                init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Linear):
                init.normal_(module.weight, std=0.001)
                if module.bias is not None:
                    init.constant_(module.bias, 0)

    def forward(self, x):
        """Return the gated feature map, same shape as ``x``."""
        b, c, h, w = x.shape
        g = self.groups
        grouped = x.view(b * g, -1, h, w)  # bs*g,dim//g,h,w
        # Similarity of every position to the group's average descriptor.
        similarity = (grouped * self.avg_pool(grouped)).sum(dim=1, keepdim=True)  # bs*g,1,h,w

        stats = similarity.view(b * g, -1)  # bs*g,h*w
        stats = stats - stats.mean(dim=1, keepdim=True)
        spread = stats.std(dim=1, keepdim=True) + 1e-5
        stats = stats / spread

        gate = stats.view(b, g, h, w) * self.weight + self.bias  # bs,g,h,w
        gate = gate.view(b * g, 1, h, w)
        enhanced = grouped * self.sig(gate)
        return enhanced.view(b, c, h, w)
if __name__ == '__main__':
    # Smoke test on a random batch.
    feats = torch.randn(50, 512, 7, 7)
    enhancer = SpatialGroupEnhance(groups=8)
    print(enhancer(feats).shape)
================================================
FILE: cv-attention/SK.py
================================================
import numpy as np
import torch
from torch import nn
from torch.nn import init
from collections import OrderedDict
class SKAttention(nn.Module):
    """Selective-kernel attention.

    One conv-BN-ReLU branch is built per kernel size. Their outputs are fused
    with per-channel weights obtained by a softmax across the branches.
    """

    def __init__(self, channel=512, kernels=[1, 3, 5, 7], reduction=16, group=1, L=32):
        super().__init__()
        self.d = max(L, channel // reduction)

        self.convs = nn.ModuleList([])
        for size in kernels:
            branch = nn.Sequential(OrderedDict([
                ('conv', nn.Conv2d(channel, channel, kernel_size=size, padding=size // 2, groups=group)),
                ('bn', nn.BatchNorm2d(channel)),
                ('relu', nn.ReLU())
            ]))
            self.convs.append(branch)

        self.fc = nn.Linear(channel, self.d)
        self.fcs = nn.ModuleList([])
        for _ in range(len(kernels)):
            self.fcs.append(nn.Linear(self.d, channel))
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):
        """Return the branch-weighted fusion, same shape as ``x``."""
        bs, c, _, _ = x.size()

        ### split
        branch_outs = [branch(x) for branch in self.convs]
        feats = torch.stack(branch_outs, 0)  # k,bs,channel,h,w

        ### fuse
        fused = sum(branch_outs)  # bs,c,h,w

        ### reduction channel
        squeezed = fused.mean(-1).mean(-1)  # bs,c
        compact = self.fc(squeezed)  # bs,d

        ### calculate attention weight
        logits = [head(compact).view(bs, c, 1, 1) for head in self.fcs]
        attention_weights = self.softmax(torch.stack(logits, 0))  # k,bs,channel,1,1

        ### fuse
        return (attention_weights * feats).sum(0)
if __name__ == '__main__':
    # Smoke test on a random batch.
    feats = torch.randn(50, 512, 7, 7)
    attention = SKAttention(channel=512, reduction=8)
    print(attention(feats).shape)
================================================
FILE: cv-attention/SequentialSelfAttention.py
================================================
import numpy as np
import torch
from torch import nn
from torch.nn import init
class SequentialPolarizedSelfAttention(nn.Module):
    """Polarized self-attention, sequential layout.

    The channel-only gate is applied to the input first; the spatial-only
    gate is then computed from, and applied to, that channel-gated tensor.
    """

    def __init__(self, channel=512):
        super().__init__()
        half = channel // 2
        self.ch_wv = nn.Conv2d(channel, half, kernel_size=(1, 1))
        self.ch_wq = nn.Conv2d(channel, 1, kernel_size=(1, 1))
        self.softmax_channel = nn.Softmax(1)
        self.softmax_spatial = nn.Softmax(-1)
        self.ch_wz = nn.Conv2d(half, channel, kernel_size=(1, 1))
        self.ln = nn.LayerNorm(channel)
        self.sigmoid = nn.Sigmoid()
        self.sp_wv = nn.Conv2d(channel, half, kernel_size=(1, 1))
        self.sp_wq = nn.Conv2d(channel, half, kernel_size=(1, 1))
        self.agp = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        """Return ``spatial_gate * (channel_gate * x)`` (same shape as ``x``)."""
        b, c, h, w = x.size()
        half = c // 2

        # Channel-only self-attention
        ch_values = self.ch_wv(x).reshape(b, half, -1)  # bs,c//2,h*w
        ch_query = self.ch_wq(x).reshape(b, -1, 1)  # bs,h*w,1
        ch_query = self.softmax_channel(ch_query)  # softmax over the h*w axis
        ch_context = torch.matmul(ch_values, ch_query).unsqueeze(-1)  # bs,c//2,1,1
        ch_logits = self.ch_wz(ch_context).reshape(b, c, 1).permute(0, 2, 1)  # bs,1,c
        channel_weight = self.sigmoid(self.ln(ch_logits)).permute(0, 2, 1).reshape(b, c, 1, 1)
        channel_out = channel_weight * x

        # Spatial-only self-attention, driven by the channel-gated features
        sp_values = self.sp_wv(channel_out)  # bs,c//2,h,w
        sp_query = self.agp(self.sp_wq(channel_out))  # bs,c//2,1,1
        sp_values = sp_values.reshape(b, half, -1)  # bs,c//2,h*w
        sp_query = sp_query.permute(0, 2, 3, 1).reshape(b, 1, half)  # bs,1,c//2
        sp_query = self.softmax_spatial(sp_query)
        sp_context = torch.matmul(sp_query, sp_values)  # bs,1,h*w
        spatial_weight = self.sigmoid(sp_context.reshape(b, 1, h, w))  # bs,1,h,w
        return spatial_weight * channel_out
if __name__ == '__main__':
    # Smoke test on a single random sample.
    feats = torch.randn(1, 512, 7, 7)
    attention = SequentialPolarizedSelfAttention(channel=512)
    print(attention(feats).shape)
================================================
FILE: cv-attention/ShuffleAttention.py
================================================
import numpy as np
import torch
from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
class ShuffleAttention(nn.Module):
    """Shuffle attention.

    Channels are split into ``G`` groups. Half of each group gets a channel
    gate (from its spatial mean), the other half a spatial gate (from its
    group-normalised values). The halves are concatenated and the channels
    are shuffled across two groups. ``reduction`` is accepted but not used.
    """

    def __init__(self, channel=512, reduction=16, G=8):
        super().__init__()
        self.G = G
        self.channel = channel
        half_group = channel // (2 * G)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.gn = nn.GroupNorm(half_group, half_group)
        self.cweight = Parameter(torch.zeros(1, half_group, 1, 1))
        self.cbias = Parameter(torch.ones(1, half_group, 1, 1))
        self.sweight = Parameter(torch.zeros(1, half_group, 1, 1))
        self.sbias = Parameter(torch.ones(1, half_group, 1, 1))
        self.sigmoid = nn.Sigmoid()

    def init_weights(self):
        """Re-initialise conv, batch-norm and linear submodules (not called by ``__init__``)."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                init.kaiming_normal_(module.weight, mode='fan_out')
                if module.bias is not None:
                    init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.BatchNorm2d):
                init.constant_(module.weight, 1)
                init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Linear):
                init.normal_(module.weight, std=0.001)
                if module.bias is not None:
                    init.constant_(module.bias, 0)

    @staticmethod
    def channel_shuffle(x, groups):
        """Interleave the channels of ``x`` across ``groups`` groups."""
        b, c, h, w = x.shape
        grouped = x.reshape(b, groups, -1, h, w)
        swapped = grouped.permute(0, 2, 1, 3, 4)
        # flatten
        return swapped.reshape(b, -1, h, w)

    def forward(self, x):
        """Return the attended and shuffled feature map, same shape as ``x``."""
        b, c, h, w = x.size()
        # group into subfeatures
        grouped = x.view(b * self.G, -1, h, w)  # bs*G,c//G,h,w

        # channel_split
        first_half, second_half = grouped.chunk(2, dim=1)  # bs*G,c//(2*G),h,w

        # channel attention
        channel_gate = self.cweight * self.avg_pool(first_half) + self.cbias  # bs*G,c//(2*G),1,1
        channel_branch = first_half * self.sigmoid(channel_gate)

        # spatial attention
        spatial_gate = self.sweight * self.gn(second_half) + self.sbias  # bs*G,c//(2*G),h,w
        spatial_branch = second_half * self.sigmoid(spatial_gate)

        # concatenate along channel axis
        merged = torch.cat([channel_branch, spatial_branch], dim=1)  # bs*G,c//G,h,w
        merged = merged.contiguous().view(b, -1, h, w)

        # channel shuffle
        return self.channel_shuffle(merged, 2)
if __name__ == '__main__':
    # Smoke test on a random batch.
    feats = torch.randn(50, 512, 7, 7)
    attention = ShuffleAttention(channel=512, G=8)
    print(attention(feats).shape)
================================================
FILE: cv-attention/SimAM.py
================================================
import torch
import torch.nn as nn
class SimAM(torch.nn.Module):
    """Parameter-free attention.

    Each activation is gated by a sigmoid of its squared deviation from the
    per-channel spatial mean, normalised by the channel's variance estimate
    plus ``e_lambda``.
    """

    def __init__(self, e_lambda=1e-4):
        super().__init__()
        self.activaton = nn.Sigmoid()
        self.e_lambda = e_lambda

    def __repr__(self):
        return '%s(lambda=%f)' % (self.__class__.__name__, self.e_lambda)

    @staticmethod
    def get_module_name():
        """Return the short identifier of this module."""
        return "simam"

    def forward(self, x):
        """Return ``x`` weighted by the per-element gate (same shape as ``x``)."""
        b, c, h, w = x.size()
        n = w * h - 1  # denominator of the variance estimate
        deviation_sq = (x - x.mean(dim=[2, 3], keepdim=True)).pow(2)
        variance = deviation_sq.sum(dim=[2, 3], keepdim=True) / n
        energy = deviation_sq / (4 * (variance + self.e_lambda)) + 0.5
        return x * self.activaton(energy)
if __name__ == '__main__':
    # Smoke test on a random batch.
    feats = torch.randn(3, 64, 7, 7)
    simam = SimAM()
    print(simam(feats).shape)
================================================
FILE: cv-attention/TripletAttention.py
================================================
import torch
import torch.nn as nn
class BasicConv(nn.Module):
    """Conv2d optionally followed by BatchNorm2d and ReLU.

    ``bn`` and ``relu`` switch the respective stage on or off; a disabled
    stage is stored as ``None``.
    """

    def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True,
                 bn=True, bias=False):
        super().__init__()
        self.out_channels = out_planes
        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding,
                              dilation=dilation, groups=groups, bias=bias)
        if bn:
            self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True)
        else:
            self.bn = None
        if relu:
            self.relu = nn.ReLU()
        else:
            self.relu = None

    def forward(self, x):
        """Apply the convolution and whichever optional stages are enabled."""
        out = self.conv(x)
        for stage in (self.bn, self.relu):
            if stage is not None:
                out = stage(out)
        return out
class ZPool(nn.Module):
    """Reduce dim 1 to two channels: its max first, then its mean."""

    def forward(self, x):
        peak = torch.max(x, 1)[0].unsqueeze(1)
        average = torch.mean(x, 1).unsqueeze(1)
        return torch.cat((peak, average), dim=1)
class AttentionGate(nn.Module):
    """Gate the input with a sigmoid map predicted from its pooled dim-1 statistics.

    The input is compressed to two channels by ``ZPool``, mapped to a single
    channel by a 7x7 conv with batch norm (no ReLU), and squashed by a sigmoid.
    """

    def __init__(self):
        super().__init__()
        kernel_size = 7
        same_padding = (kernel_size - 1) // 2
        self.compress = ZPool()
        self.conv = BasicConv(2, 1, kernel_size, stride=1, padding=same_padding, relu=False)

    def forward(self, x):
        """Return ``x`` multiplied by the single-channel gate."""
        gate = self.conv(self.compress(x))
        gate = torch.sigmoid_(gate)  # in place on the conv output
        return x * gate
class TripletAttention(nn.Module):
    """Average of attention gates applied along different axis pairings.

    Two gates act on permuted copies of the input (dim 1 swapped with dim 2,
    and dim 1 swapped with dim 3). Unless ``no_spatial`` is set, a third gate
    acts on the input as is.
    """

    def __init__(self, no_spatial=False):
        super().__init__()
        self.cw = AttentionGate()
        self.hc = AttentionGate()
        self.no_spatial = no_spatial
        if not no_spatial:
            self.hw = AttentionGate()

    def forward(self, x):
        """Return the mean of the branch outputs, same shape as ``x``."""
        # Branch 1: swap dims 1 and 2, gate, swap back.
        swapped_12 = x.permute(0, 2, 1, 3).contiguous()
        x_out11 = self.cw(swapped_12).permute(0, 2, 1, 3).contiguous()

        # Branch 2: swap dims 1 and 3, gate, swap back.
        swapped_13 = x.permute(0, 3, 2, 1).contiguous()
        x_out21 = self.hc(swapped_13).permute(0, 3, 2, 1).contiguous()

        if self.no_spatial:
            return 1 / 2 * (x_out11 + x_out21)

        x_out = self.hw(x)
        return 1 / 3 * (x_out + x_out11 + x_out21)
if __name__ == '__main__':
    # Smoke test: run a random batch through the module and print the output shape.
    dummy_batch = torch.randn(50, 512, 7, 7)
    module = TripletAttention()
    print(module(dummy_batch).shape)
================================================
FILE: cv-attention/readme.md
================================================
# CV-Attention
关于CV的一些经典注意力机制代码。
目前代码格式主要用于yolov3,yolov5,yolov7,yolov8.
# Supports
| name | need_channel | paper |
| :----:| :----: | :----: |
| BAM | True | https://arxiv.org/pdf/1807.06514.pdf |
| CBAM | True | https://openaccess.thecvf.com/content_ECCV_2018/papers/Sanghyun_Woo_Convolutional_Block_Attention_ECCV_2018_paper.pdf |
| SE | True | https://arxiv.org/abs/1709.01507 |
| CoTAttention | True | https://arxiv.org/abs/2107.12292 |
| MobileViTAttention | True | https://arxiv.org/abs/2110.02178 |
| SimAM | False | http://proceedings.mlr.press/v139/yang21o/yang21o.pdf |
| SK | True | https://arxiv.org/pdf/1903.06586.pdf |
| ShuffleAttention | True | https://arxiv.org/pdf/2102.00240.pdf |
| S2Attention | True | https://arxiv.org/abs/2108.01072 |
| TripletAttention | False | https://arxiv.org/abs/2010.03045 |
| ECA | True | https://arxiv.org/pdf/1910.03151.pdf |
| ParNetAttention | True | https://arxiv.org/abs/2110.07641 |
| CoordAttention | True | https://arxiv.org/abs/2103.02907 |
| MHSA<br>Multi-Head-Self-Attention | True | https://wuch15.github.io/paper/EMNLP2019-NRMS.pdf |
| SGE | False | https://arxiv.org/pdf/1905.09646.pdf |
| A2Attention | True | https://arxiv.org/pdf/1810.11579.pdf |
| GC<br>Global Context Attention | True | https://arxiv.org/abs/1904.11492 |
| EffectiveSE<br>Effective Squeeze-Excitation | True | https://arxiv.org/abs/1911.06667 |
| GE<br>Gather-Excite Attention | True | https://arxiv.org/abs/1810.12348 |
| CrissCrossAttention | True | https://arxiv.org/abs/1811.11721 |
| Polarized Self-Attention | True | https://arxiv.org/abs/2107.00782 |
| Sequential Self-Attention | True | https://arxiv.org/abs/2107.00782 |
| GAM | True | https://arxiv.org/pdf/2112.05561v1.pdf |
| Biformer | True | https://arxiv.org/abs/2303.08810 |
| EMA | True | https://arxiv.org/abs/2305.13563v2 |
| CloAttention | True | https://arxiv.org/abs/2303.17803 |
| LSKBlock | True | https://arxiv.org/pdf/2303.09030.pdf |
| MLCA | True | https://www.sciencedirect.com/science/article/pii/S0952197623006267 |
| LSKA | True | https://arxiv.org/abs/2309.01439 |
| DAttention | True | https://openaccess.thecvf.com/content/CVPR2022/html/Xia_Vision_Transformer_With_Deformable_Attention_CVPR_2022_paper.html |
| ELA | True | https://arxiv.org/abs/2403.01123 |
| CAA | True | https://arxiv.org/pdf/2403.06258 |
| CPCA | True | https://arxiv.org/abs/2306.05196 |
# Install
安装命令:pip install timm einops efficientnet_pytorch -i https://pypi.tuna.tsinghua.edu.cn/simple
# Course
1. [yolov5添加注意力哔哩哔哩视频教学链接](https://www.bilibili.com/video/BV1s84y1775U) [yolov5添加注意力-补充事项-哔哩哔哩视频教学链接](https://www.bilibili.com/video/BV1hG4y1M71X)
2. [yolov7添加注意力哔哩哔哩视频教学链接](https://www.bilibili.com/video/BV1pd4y1H7BK)
3. [yolov8添加注意力哔哩哔哩视频教学链接](https://www.bilibili.com/video/BV1ZQ4y1J7oC/) [yolov8添加注意力进阶版哔哩哔哩视频教学链接](https://www.bilibili.com/video/BV1ZQ4y1J7oC/)
# Reference
https://github.com/xmu-xiaoma666/External-Attention-pytorch
https://github.com/rwightman/pytorch-image-models
https://github.com/rayleizhu/BiFormer
https://github.com/XiaLiPKU/EMANet
https://github.com/qhfan/CloFormer/tree/main
https://github.com/zcablii/LSKNet
https://github.com/wandahangFY/MLCA
https://github.com/StevenLauHKHK/Large-Separable-Kernel-Attention
https://github.com/LeapLabTHU/DAT
https://github.com/NUST-Machine-Intelligence-Laboratory/PKINet
https://github.com/Cuthbert-Huang/CPCANet
================================================
FILE: cvpr2025-deim-project.md
================================================
# 2025-SOTA目标检测模型项目(2026发论文必备项目)
鉴于目前YOLO系列模型反映的拒稿率越来越高且YOLO模型确实非常泛滥,无论是不是计算机专业、是不是小白都基本可以快速上手YOLO模型,导致计算机专业和有期刊级别要求的小伙伴日益难受,简单来说就是YOLO在学术界的红利已经基本吃透,目前开始越来越多人转CVPR2024-RTDETR,而且目前研究生毕业一年比一年难,不像以前随便结合点深度学习就可以毕业,就像越来越多人反馈,导师已经明确禁止使用YOLO,再加上这么多年来YOLO对学术的灌水已经让审稿人出现视觉疲劳,戴上了“有色”眼镜看待YOLO,结合以上众多原因,我们需要一个有一定上手难度且是顶会的模型来支撑我们后续的大小论文的工作。
PS:20250614版本更新后,本项目的dfine和cvpr2025-deimv1已经支持Ultralytics同款的配置文件形式,大大降低上手难度
### 1. 这个项目包含什么模型?
这个项目的源代码来自:[DEIM](https://github.com/ShihuaHuang95/DEIM)
其内部可以跑以下模型(以下模型支持目标检测,DFine、DEIM支持实例分割,不支持姿态检测、旋转目标检测):
1. CVPR2025-DEIM
2. ICLR2025-DFine
3. RTDETRV2
4. DEIMV2
选择这个课程,这些模型都可以改进,不限于DEIM,这些都是顶会的模型,不要说2025,就算是2026、2027都不落后!还有一个重点就是像CVPR2024-RTDETR,最小的模型也有50GFLOPs,但是现在的DEIM和DFine都有像YOLO一样的Nano大小版本的模型,变相降低了训练成本和设备要求!(建议最低12G显存的显卡起步)
### 2. 这个项目会以什么形式开展?
1. 这个项目跟以往区别比较大,我们其他改进项目都是直接提供好修改好的代码,用户不需要懂代码的情况下也可以开始做实验,甚至可以做完实验,但是这样也有一个不好的点,就是会大幅度降低上手门槛,这特别对计算机专业的同学来说是非常不利的,因此在代码工程方面,这个项目会有教程教大家怎么去调试程序、修改代码、添加模块。
2. 这个项目会**不定时(直播时间到时候会群里进行通知,没有硬性规定多久一次,不方便看的会有录播)**有**直播**,详细直播内容请看第三大点。
3. 这个项目会持续更新创新点,如果创新点是来源于现有的模型,还会提供对应的论文及其中文翻译版本(假设像FasterNet中的FasterBlock,会提供好对应的py文件、原论文及其中文翻译版本),用户可以根据从本课程学习到的缝合模块(代指第一点)去定制或者创新自己的网络。
4. 附带答疑群,答疑群主要答疑的内容是实验、代码操作、代码报错等相关问题(经过YOLO、RTDETR大量的经验,我没法保证每一个问题都能回复到大家,只能保证遇到过的问题会给大家提供建议和方向,当然群内的一些高频问题,我也会收集起来挑出部分出视频或者直播给大家进行解答)。
5. 如果后续有剪枝、蒸馏,不需要额外付费,本项目会包含在内,所以性价比真的非常高,YOLO改进剪枝蒸馏三件套也要200多了。
### 3. 直播内容
1. 解答群内一些高频疑问,比如很多人都会遇到的报错、或者注意点。
2. 教大家如何去做二次创新(PS:这个不是口头给大家说怎么二次创新,而是从代码的层面带大家去实践二次创新。可能这里会有同学问,那自研创新呢?你会自研模块的前提是必须要懂如何二次创新,首先这是一个过程,然后我有很多自研模块是突然有的想法或者看论文看到某些结构与之前看到的论文联合后有新的想法,所以也很难描述我为什么就想到这个结构,大多数情况下,只需要会有一定复杂度的二次创新就足够,当然自研模块有机会我也会去讲)
3. 给大家从浅到深解说一些我认为比较经典的模块,提高自己能创新新模块的能力和基础,因为很多模块都是相通的,本质没有变,只是模块上的组合体替换。(有不少人私聊我说,能不能出些你是如何结合一些现有的模块去创新的,虽然现在B站上也有不少讲创新点的,但是他们的感觉就是从头到尾读一篇代码,我看了几次之后觉得我把代码扔给GPT给我打上注释的感觉是一样的,看的时候感觉哦哦哦这样,看完后就不知所然)
### 3. 入手本项目需要注意些什么?
1. 因为本项目完全不是像之前YOLO项目这样傻瓜式操作,所以本项目有一定难度,具有以下特征的小伙伴不建议入手。(看到这里可能有人会问,为什么不考虑把DEIM、DFine、RTDETRV2都移植到Ultralytics?因为这个不确定性太大,DETR类型的模型对参数非常敏感,可能有一点参数不合适,效果就会大打折扣,但是对于这种较为复杂的模型移植过程中又很难保证一比一全过程移植)
- 未入门、100%纯小白(如果你有心学,这个不是问题)
- 不太想花太多时间去学,搞这个只是想为了水个无要求的论文就行
- 没有任何解决问题的能力(如果你有心学,这个不是问题)
- 从来不看使用文档、说明之类的(强烈不建议入手)
- 此项目上手需要时间,如果想无脑直接跑就不合适购入
最后补充!如果你具有以上特征,但又要求期刊不能太水或者不能做yolo的问题,尽早入手CVPR2024-RTDETR吧,去年没抓上,今年不能再等了,模型红利可不等人。
2. 入手前可以先去B站看一下[CVPR2025-DEIM合集里面的教程](https://space.bilibili.com/286900343/lists/4909499),最起码先跑通过DEIM原始模型,能跟着视频训练和测试,然后也把合集里面的基础课程都先看一下,为后面打好基础。
3. 我认为这个不是什么不可达到的事,就看你想不想毕业了,有志者事竟成。
PS:20250614版本更新后,本项目的dfine和deim已经支持Ultralytics同款的配置文件形式,大大降低上手难度
### 4. 价格
1. 本项目价格为288,没有时效限制。(与其150、200买个YOLO纯模型改进专栏,不如288买个2025-SOTA专栏,最起码不用怕花了钱,最后做的YOLO还投不出去,还毕不了业)
2. 虚拟项目一经售出不退不换,需要入手前考虑清楚,如果你是初次入手我的项目,怕我不靠谱,可以先考虑入手个YOLO和RTDETR看下。
### 5. 项目使用问题
1. 购买本项目的使用者都会得到一个独一无二的用于解压7z的密码,到时候用于解压对应的压缩包,此密码自己妥善保管,请勿告诉他人。
2. 本项目的视频和直播回放统一都是加密视频,每个购买者都可以得到一个激活码,激活码在每个人专属的7z压缩文件内。
### 6. 项目更新公告
- 20250330
1. 初版项目发布.
- 20250413
1. 新增多个改进模块并新增模块简介,位置在engine/extre_module/module_images内。
2. 新增训练和测试阶段的进度条显示。
3. 优化tensorboard中的精度名称显示。
4. 优化输出,把重要信息换颜色显示。
5. 新增plot_train_batch_freq参数,用于控制间隔多少epoch保存第一个batch中的数据增强后的图像,默认为12。
6. 新增保存当前参数信息,会自动保存到output_dir中的args.json文件内。
7. 优化output_dir保存逻辑,当判断output_dir路径存在的时候,会自动在后缀加1,避免覆盖原先代码。
- 20250419
1. 新增verbose_type参数,用于控制使用默认还是进度条输出,默认为官方默认输出形式。
2. 新增thop计算模型计算量方式,避免calflops对于部分算子出现不支持报错的操作。
3. 完善每个模块的py文件,增加输出计算量和参数量等数值,方便用户后续调试。
4. 给DataLoader中添加pin_memory参数为True,可以在训练时候如果是数据加载成为瓶颈,可以提高速度。
5. 修复用户反馈的已知问题。
6. 新增多个改进模块。
- 20250429
1. 修复engine/extre_module/custom_nn/attention/SEAM.py模块,应该是MutilSEAM。
2. 新增一些进阶课程的视频。
3. 新增多个改进模块。
4. 修复用户反馈的已知问题。
5. 修复续训时候会新增一个保存路径的问题。
6. 修复多卡训练Stage2的时候会出现部分进程找不到权重文件的问题。
- 20250514
1. 新增一些进阶课程的视频。
2. 新增多个改进模块。
3. 修复用户反馈的已知问题。
- 20250526
1. 新增一些进阶课程的视频。
2. 新增多个改进模块。
3. 新增cache_ram参数,详细可以看userguide。
4. 修复在torch2.7.0下出现的NotImplementedError问题。
- 20250609
1. 修复新增了cache_ram功能后训练COCO数据集精度不正常的问题。
2. 修复在训练COCO数据集中数据增强的绘制BUG。
3. 新增多个改进模块。
4. 新增一些进阶课程的视频。
5. 修复用户反馈的已知问题。
- 20250614
1. 新增Ultralytics的配置文件方式,大大降低改进难度。
2. 新增一些进阶课程的视频。
3. 新增多个改进模块。
- 20250617
1. 修复配置文件中层序号有误的问题。
- 20250619
1. 修复配置文件中层序号有误的问题。
2. 新增多个改进模块。
3. 新增一些进阶课程的视频。
- 20250625
1. 修复best_stg2保存异常的问题。
2. 新增YOLOV13中的HyperACE模块。
3. 新增多个关于进阶课程的视频。
- 20250705
1. 新增多个改进模块。
2. 新增多个关于进阶课程的视频。
3. 新增20250704基础疑问解答直播回放链接。
- 20250714
1. 新增多个改进模块。
2. 新增多个关于进阶课程的视频。
3. 新增小目标检测网络架构专题一群课题直播回放。
- 20250726
1. 新增在test-only的状态下输出每个类别的'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'。
2. 新增多个改进模块。
3. 修复用户反馈的已知问题。
4. 新增一个JSON格式数据集脚本。(输出类别数和类别id、输出每个类别的实例数量)
- 20250817
1. 新增支持蒸馏学习,蒸馏学习支持断点续训使用方法跟正常训练一样。
2. 蒸馏学习支持特征蒸馏、逻辑蒸馏、特征+逻辑蒸馏 这三种方式。
3. 无论是Ultralytics配置文件方式、还是原始的代码方式都支持相互蒸馏。
4. 蒸馏学习支持控制epoch,例如只有前50epoch进行蒸馏学习,后50epoch关闭蒸馏学习。
5. 更多细节请看关于<知识蒸馏教学视频>的进阶课程。
6. 支持输出YOLO指标(Precision、Recall、F1-Score、mAP50、mAP75、mAP50-95),详细请看userguide。
7. 新增多个改进模块。
8. 新增小目标检测网络架构专题二链接。
- 20250823
1. 修复YOLO指标在一些图片没真实标签的时候报错的bug。
2. 开放逻辑蒸馏,在项目内有对应的课程。
3. 新增多个改进模块。
4. 新增<知识蒸馏教学视频>的进阶课程。
- 20250907
1. 新增多个改进模块。
2. 修复蒸馏学习中教师信息输出错误的问题。
- 20250921
1. 新增导出脚本(export.py),支持导出onnx、tensorrt模型。
2. 重构大部分输出,增加输出对应的时间、文件、函数、行数,以便用户快速定位。
3. 新增20250910直播回放链接。
4. 修复一些已知BUG。
5. 完善onnx、tensorrt模型推理脚本。
6. 支持在train.py test-only状态下中使用onnx、tensorrt模型进行验证。
7. 新增<模型导出>相关教程视频。
8. 新增多个改进模块。
9. 支持DINOV3(ConvNext、ViT)作为主干进行微调。<教程在百度云创新课题的第五点>
- 20251012
1. 移植DEIMV2到本项目,暂只支持原始的代码修改方式。
2. 更新UserGuide。
3. 新增。
4. 修复一些已知问题。
- 20251025
1. 新增DQ-DETR的模块。
2. 新增多个改进模块。
3. 新增的相关教程视频。
4. 修复一些已知问题。
- 20251102
1. 新增的相关教程视频。
2. 修复一些已知问题。
- 20251115
1. 新增以DensityMap为主导的创新课程[DFINE with Density-aware Query Selection]。
2. 修复一些已知问题。
- 20251207
1. 新增在test-only状态下,yolo-metrice支持保存混淆矩阵。
2. 新增DFine、DEIM实例分割的实现,使用相关请看进阶教程实例分割部分。
3. 更新dataset/coco_analyzer.py脚本,支持输出数据集中更多的内容,以便分析数据集的特点。
4. 新增tools/visualization/tp_fp_fn_analysis.py脚本,用于分析检测结果中的tp、fp、fn。
5. 新增多个改进模块。
6. 修复一些已知问题。
7. 新增。
8. 新增基于ByteTrack的目标跟踪,教程请看进阶教程内的<目标跟踪ByteTrack的使用教程>。
- 20251213
1. 参考CVPR2022-MaskDINO重构实例分割检测头代码。
2. 修复在ram_cache状态下实例分割数据集部分存在的BUG。
3. 重新录制实例分割部分的进阶视频。
- 20251224
1. 新增多个改进模块。
2. 修复实例分割部分已知的问题。
3. 新增以DensityMap为主导的实例分割检测头内容[DFINESeg with Density-aware Query Selection]。
4. 新增[DFINESeg with Density-aware Query Selection]的使用视频教程。
5. 更新实例分割实现讲解。
- 20251226
1. 修复一些已知问题。
2. 新增基于COCO-Tiny指标,并支持输出每类COCO-Tiny指标,详细请看UserGuide.md中的<项目内yml一些额外参数说明>。
- 20260109
1. 修复一些已知问题。
2. 新增动态路由网络模块。
3. 更新视频链接。
- 20260128
1. 修复一些已知问题。
2. 新增多个改进模块。
3. 新增动态路由网络教程视频。
4. 新增的MSBlock和GQL的教程视频。
- 20260224
1. 修复一些已知问题。
2. 新增多个改进模块。
3. compile_module的编译模块支持50系显卡。
4. 为了兼容50系用户,新版的环境统一修改成torch2.8.0,旧版本的用户不影响。
- 20260310
1. 新增diou, ciou, eiou, siou, shapeiou, piou, piou2。
2. 支持TIMM中的主干进行训练。
3. DINOV3版本支持Ultralytics版本训练。
4. 新增AAAI2026-SPJFB模块。
5. 新增TGRS2025-GLSS2D模块。
6. 新增TIP2025-CAFM模块。
7. 新增TIP2025-DWM_MSA模块。
8. 新增DynamicERF模块。
9. 新增如何使用其他IOU的操作视频。
10. 新增TIMM主干的操作视频。
11. yolo_metrice参数从默认为False改为True,代表训练过程中YOLO和COCO指标都会一并输出。
### 7. 目前已有的模块
- engine/extre_module/custom_nn/attention
1. engine/extre_module/custom_nn/attention/SEAM.py
2. CVPR2021|engine/extre_module/custom_nn/attention/ca.py
3. ICASSP2023|engine/extre_module/custom_nn/attention/ema.py
4. ICML2021|engine/extre_module/custom_nn/attention/simam.py
5. ICCV2023|engine/extre_module/custom_nn/attention/lsk.py
6. WACV2024|engine/extre_module/custom_nn/attention/DeformableLKA.py
7. engine/extre_module/custom_nn/attention/mlca.py
8. BIBM2024|engine/extre_module/custom_nn/attention/FSA.py
9. AAAI2025|engine/extre_module/custom_nn/attention/CDFA.py
10. engine/extre_module/custom_nn/attention/GLSA.py
11. TGRS2025|engine/extre_module/custom_nn/attention/MCA.py
12. CVPR2025|engine/extre_module/custom_nn/attention/CASAB.py
13. NN2025|engine/extre_module/custom_nn/attention/KSFA.py
14. TPAMI2025|engine/extre_module/custom_nn/attention/GQL.py
15. TGRS2025|engine/extre_module/custom_nn/attention/ACA.py
16. TGRS2025|engine/extre_module/custom_nn/attention/DHPF.py
17. TGRS2025|engine/extre_module/custom_nn/attention/ACAB.py
- engine/extre_module/custom_nn/block
1. engine/extre_module/custom_nn/block/RepHMS.py
2. 自研模块|engine/extre_module/custom_nn/block/rgcspelan.py
3. TPAMI2025|engine/extre_module/custom_nn/block/MANet.py
- engine/extre_module/custom_nn/conv_module
1. CVPR2021|engine/extre_module/custom_nn/conv_module/dbb.py
2. IEEETIP2024|engine/extre_module/custom_nn/conv_module/deconv.py
3. ICCV2023|engine/extre_module/custom_nn/conv_module/dynamic_snake_conv.py
4. CVPR2023|engine/extre_module/custom_nn/conv_module/pconv.py
5. AAAI2025|engine/extre_module/custom_nn/conv_module/psconv.py
6. CVPR2025|engine/extre_module/custom_nn/conv_module/ShiftwiseConv.py
7. engine/extre_module/custom_nn/conv_module/wdbb.py
8. engine/extre_module/custom_nn/conv_module/deepdbb.py
9. ECCV2024|engine/extre_module/custom_nn/conv_module/wtconv2d.py
10. CVPR2023|engine/extre_module/custom_nn/conv_module/ScConv.py
11. engine/extre_module/custom_nn/conv_module/dcnv2.py
12. CVPR2024|engine/extre_module/custom_nn/conv_module/DilatedReparamConv.py
13. engine/extre_module/custom_nn/conv_module/gConv.py
14. CVPR2024|engine/extre_module/custom_nn/conv_module/IDWC.py
15. engine/extre_module/custom_nn/conv_module/DSA.py
16. CVPR2025|engine/extre_module/custom_nn/conv_module/FDConv.py
17. CVPR2023|engine/extre_module/custom_nn/conv_module/dcnv3.py
18. CVPR2024|engine/extre_module/custom_nn/conv_module/dcnv4.py
19. CVPR2024|engine/extre_module/custom_nn/conv_module/DynamicConv.py
20. CVPR2024|engine/extre_module/custom_nn/conv_module/FADC.py
21. CVPR2023|engine/extre_module/custom_nn/conv_module/SMPConv.py
22. MIA2025|engine/extre_module/custom_nn/conv_module/FourierConv.py
23. CVPR2024|engine/extre_module/custom_nn/conv_module/SFSConv.py
24. ICCV2025|engine/extre_module/custom_nn/conv_module/MBRConv.py
25. ICCV2025|engine/extre_module/custom_nn/conv_module/ConvAttn.py
26. ICCV2025|engine/extre_module/custom_nn/conv_module/Converse2D.py
27. CVPR2025|engine/extre_module/custom_nn/conv_module/gcconv.py
28. ACCV2024|engine/extre_module/custom_nn/conv_module/RMBC.py
- engine/extre_module/custom_nn/upsample
1. CVPR2024|engine/extre_module/custom_nn/upsample/eucb.py
2. CVPR2024|engine/extre_module/custom_nn/upsample/eucb_sc.py
3. engine/extre_module/custom_nn/upsample/WaveletUnPool.py
4. ICCV2019|engine/extre_module/custom_nn/upsample/CARAFE.py
5. ICCV2023|engine/extre_module/custom_nn/upsample/DySample.py
6. ICCV2025|engine/extre_module/custom_nn/upsample/Converse2D_Up.py
7. CVPR2025|engine/extre_module/custom_nn/upsample/DSUB.py
- engine/extre_module/custom_nn/downsample
1. IEEETIP2020|engine/extre_module/custom_nn/downsample/gcnet.py
2. 自研模块|engine/extre_module/custom_nn/downsample/lawds.py
3. engine/extre_module/custom_nn/downsample/WaveletPool.py
4. engine/extre_module/custom_nn/downsample/ADown.py
5. engine/extre_module/custom_nn/downsample/YOLOV7Down.py
6. engine/extre_module/custom_nn/downsample/SPDConv.py
7. engine/extre_module/custom_nn/downsample/HWD.py
8. engine/extre_module/custom_nn/downsample/DRFD.py
9. TGRS2025|engine/extre_module/custom_nn/conv_module/FSConv.py
- engine/extre_module/custom_nn/stem
1. engine/extre_module/custom_nn/stem/SRFD.py
2. engine/extre_module/custom_nn/stem/LoG.py
3. ICCV2023|engine/extre_module/custom_nn/stem/RepStem.py
- engine/extre_module/custom_nn/featurefusion
1. 自研模块|engine/extre_module/custom_nn/featurefusion/cgfm.py
2. BMVC2024|engine/extre_module/custom_nn/featurefusion/msga.py
3. CVPR2024|engine/extre_module/custom_nn/featurefusion/mfm.py
4. IEEETIP2023|engine/extre_module/custom_nn/featurefusion/CSFCN.py
5. BIBM2024|engine/extre_module/custom_nn/featurefusion/mpca.py
6. ACMMM2024|engine/extre_module/custom_nn/featurefusion/wfu.py
7. CVPR2025|engine/extre_module/custom_nn/featurefusion/GDSAFusion.py
8. engine/extre_module/custom_nn/featurefusion/PST.py
9. TGRS2025|engine/extre_module/custom_nn/featurefusion/MSAM.py
10. INFFUS2025|engine/extre_module/custom_nn/featurefusion/DPCF.py
11. CVPR2025|engine/extre_module/custom_nn/featurefusion/LCA.py
12. TGRS2025|engine/extre_module/custom_nn/featurefusion/HFFE.py
13. TGRS2025|engine/extre_module/custom_nn/featurefusion/MFPM.py
14. TGRS2025|engine/extre_module/custom_nn/featurefusion/ERM.py
15. TIP2025|engine/extre_module/custom_nn/featurefusion/CAFM.py
- engine/extre_module/custom_nn/module
1. AAAI2025|engine/extre_module/custom_nn/module/APBottleneck.py
2. CVPR2025|engine/extre_module/custom_nn/module/efficientVIM.py
3. CVPR2023|engine/extre_module/custom_nn/module/fasterblock.py
4. CVPR2024|engine/extre_module/custom_nn/module/starblock.py
5. engine/extre_module/custom_nn/module/DWR.py
6. CVPR2024|engine/extre_module/custom_nn/module/UniRepLKBlock.py
7. CVPR2025|engine/extre_module/custom_nn/module/mambaout.py
8. AAAI2024|engine/extre_module/custom_nn/module/DynamicFilter.py
9. engine/extre_module/custom_nn/module/StripBlock.py
10. TGRS2024|engine/extre_module/custom_nn/module/elgca.py
11. CVPR2024|engine/extre_module/custom_nn/module/LEGM.py
12. ICCV2023|engine/extre_module/custom_nn/module/iRMB.py
13. TPAMI2025|engine/extre_module/custom_nn/module/MSBlock.py
14. ICLR2024|engine/extre_module/custom_nn/module/FATBlock.py
15. CVPR2024|engine/extre_module/custom_nn/module/MSCB.py
16. engine/extre_module/custom_nn/module/LEGBlock.py
17. CVPR2025|engine/extre_module/custom_nn/module/RCB.py
18. ECCV2024|engine/extre_module/custom_nn/module/JDPM.py
19. CVPR2025|engine/extre_module/custom_nn/module/vHeat.py
20. CVPR2025|engine/extre_module/custom_nn/module/EBlock.py
21. CVPR2025|engine/extre_module/custom_nn/module/DBlock.py
22. ECCV2024|engine/extre_module/custom_nn/module/FMB.py
23. CVPR2024|engine/extre_module/custom_nn/module/IDWB.py
24. ECCV2022|engine/extre_module/custom_nn/module/LFE.py
25. AAAI2025|engine/extre_module/custom_nn/module/FCM.py
26. CVPR2024|engine/extre_module/custom_nn/module/RepViTBlock.py
27. CVPR2024|engine/extre_module/custom_nn/module/PKIModule.py
28. CVPR2024|engine/extre_module/custom_nn/module/camixer.py
29. ICCV2025|engine/extre_module/custom_nn/module/ESC.py
30. CVPR2025|engine/extre_module/custom_nn/module/nnWNet.py
31. TGRS2025|engine/extre_module/custom_nn/module/ARF.py
32. AAAI2024|engine/extre_module/custom_nn/module/CFBlock.py
33. IJCV2024|engine/extre_module/custom_nn/module/FMA.py
34. engine/extre_module/custom_nn/module/LWGA.py
35. TGRS2025|engine/extre_module/custom_nn/module/CSSC.py
36. TGRS2025|engine/extre_module/custom_nn/module/CNCM.py
37. ICCV2025|engine/extre_module/custom_nn/module/HFRB.py
38. ICIP2025|engine/extre_module/custom_nn/module/EVA.py
39. CVPR2025|engine/extre_module/custom_nn/module/IEL.py
40. MICCAI2023|engine/extre_module/custom_nn/module/MFEBlock.py
41. AAAI2026|engine/extre_module/custom_nn/module/PartialNetBlock.py
42. TGRS2025|engine/extre_module/custom_nn/module/DRG.py
43. engine/extre_module/custom_nn/module/Wave2D.py
44. TGRS2025|engine/extre_module/custom_nn/module/GLGM.py
45. TGRS2025|engine/extre_module/custom_nn/module/MAC.py
46. AAAI2026|engine/extre_module/custom_nn/module/SPJFB.py
- engine/extre_module/custom_nn/neck
1. 自研模块|engine/extre_module/custom_nn/neck/FDPN.py
- engine/extre_module/custom_nn/neck_module
1. TPAMI2025|engine/extre_module/custom_nn/neck_module/HyperCompute.py
2. engine/extre_module/custom_nn/neck_module/HyperACE.py
3. engine/extre_module/custom_nn/neck_module/GoldYOLO.py
4. AAAI2025|engine/extre_module/custom_nn/neck_module/HS_FPN.py
- engine/extre_module/custom_nn/norm
1. ICML2024|engine/extre_module/custom_nn/transformer/repbn.py
2. CVPR2025|engine/extre_module/custom_nn/transformer/dyt.py
3. engine/extre_module/custom_nn/norm/derf.py
- engine/extre_module/custom_nn/transformer
1. ICLR2025|engine/extre_module/custom_nn/transformer/PolaLinearAttention.py
2. CVPR2023|engine/extre_module/custom_nn/transformer/biformer.py
3. CVPR2023|engine/extre_module/custom_nn/transformer/CascadedGroupAttention.py
4. CVPR2022|engine/extre_module/custom_nn/transformer/DAttention.py
5. ICLR2022|engine/extre_module/custom_nn/transformer/DPBAttention.py
6. CVPR2024|engine/extre_module/custom_nn/transformer/AdaptiveSparseSA.py
7. engine/extre_module/custom_nn/transformer/GSA.py
8. engine/extre_module/custom_nn/transformer/RSA.py
9. ECCV2024|engine/extre_module/custom_nn/transformer/FSSA.py
10. AAAI2025|engine/extre_module/custom_nn/transformer/DilatedGCSA.py
11. AAAI2025|engine/extre_module/custom_nn/transformer/DilatedMWSA.py
12. CVPR2024|engine/extre_module/custom_nn/transformer/SHSA.py
13. IJCAI2024|engine/extre_module/custom_nn/transformer/CTA.py
14. IJCAI2024|engine/extre_module/custom_nn/transformer/SFA.py
15. engine/extre_module/custom_nn/transformer/MSLA.py
16. ACMMM2025|engine/extre_module/custom_nn/transformer/CPIA_SA.py
17. NN2025|engine/extre_module/custom_nn/transformer/TokenSelectAttention.py
18. CVPR2025|engine/extre_module/custom_nn/transformer/TAB.py
19. TPAMI2025|engine/extre_module/custom_nn/transformer/LRSA.py
20. ICCV2025|engine/extre_module/custom_nn/transformer/MALA.py
21. ICML2023|engine/extre_module/custom_nn/transformer/MUA.py
22. ACMMM2025|engine/extre_module/custom_nn/transformer/EGSA.py
23. ACMMM2025|engine/extre_module/custom_nn/transformer/SWSA.py
24. AAAI2026|engine/extre_module/custom_nn/transformer/DHOGSA.py
25. NeurIPS2025|engine/extre_module/custom_nn/transformer/CBSA.py
26. TGRS2025|engine/extre_module/custom_nn/transformer/DPWA.py
27. TIP2025|engine/extre_module/custom_nn/transformer/DWM_MSA.py
- engine/extre_module/custom_nn/mlp
1. CVPR2024|engine/extre_module/custom_nn/mlp/ConvolutionalGLU.py
2. IJCAI2024|engine/extre_module/custom_nn/mlp/DFFN.py
3. ICLR2024|engine/extre_module/custom_nn/mlp/FMFFN.py
4. CVPR2024|engine/extre_module/custom_nn/mlp/FRFN.py
5. ECCV2024|engine/extre_module/custom_nn/mlp/EFFN.py
6. WACV2025|engine/extre_module/custom_nn/mlp/SEFN.py
7. ICLR2025|engine/extre_module/custom_nn/mlp/KAN.py
8. CVPR2025|engine/extre_module/custom_nn/mlp/EDFFN.py
9. IJCV2024|engine/extre_module/custom_nn/mlp/DML.py
10. AAAI2026|engine/extre_module/custom_nn/mlp/DIFF.py
- engine/extre_module/custom_nn/mamba
1. AAAI2025|engine/extre_module/custom_nn/mamba/SS2D.py
2. CVPR2025|engine/extre_module/custom_nn/mamba/ASSM.py
3. CVPR2025|engine/extre_module/custom_nn/mamba/SAVSS.py
4. CVPR2025|engine/extre_module/custom_nn/mamba/MobileMamba/mobilemamba.py
5. CVPR2025|engine/extre_module/custom_nn/mamba/MaIR.py
6. TGRS2025|engine/extre_module/custom_nn/mamba/GLVSS.py
7. ICCV2025|engine/extre_module/custom_nn/mamba/VSSD.py
8. ICCV2025|engine/extre_module/custom_nn/mamba/TinyViM.py
9. INFFUS2025|engine/extre_module/custom_nn/mamba/CSI.py
10. TIP2025|engine/extre_module/custom_nn/mamba/SFMB.py
11. TGRS2025|engine/extre_module/custom_nn/mamba/GLSS.py
12. TGRS2025|engine/extre_module/custom_nn/mamba/GLSS2D.py
- engine/extre_module/custom_nn/moe
1. engine/extre_module/custom_nn/moe/moe_module.py
- engine/extre_module/custom_nn/featurepreprocess
1. TGRS2025|engine/extre_module/custom_nn/featurepreprocess/FAENet.py
- 积木模块,示例教程engine/extre_module/custom_nn/module/example.py
1. YOLOV5|C3
2. YOLOV8|C2f
3. YOLO11|C3k2
4. TPAMI2025|MANet
5. TPAMI2024|MetaFormer_Block
6. TPAMI2024+CVPR2025|MetaFormer_Mona
7. TPAMI2024+CVPR2025+WACV2025|MetaFormer_SEFN
8. TPAMI2024+CVPR2025+WACV2025|MetaFormer_Mona_SEFN
- 创新课程代码<标识着是那个课程中的代码,详细可以去看对应的课程视频>
1. 顶会中的Partial创新思想课程|engine/extre_module/innovate/CVPR2020_GhostConv.py
2. 顶会中的Partial创新思想课程|engine/extre_module/innovate/CVPR2023_PartialConv.py
3. CVPR2025-MobileMamba中的Long-Range WTB-Mamba二次创新|engine/extre_module/innovate/CVPR2025_MobileMamba.py
4. TGRS2025-HighFrequencyDirectionInjection创新思想课程|engine/extre_module/innovate/TGRS2025_HFDI.py
================================================
FILE: damo-yolo/Annotations/ReadMe.md
================================================
# 存放VOC标注格式的文件夹
================================================
FILE: damo-yolo/JPEGImages/ReadMe.md
================================================
# 存放图像的文件夹
================================================
FILE: damo-yolo/readme.md
================================================
# DAMO-YOLO的数据集处理文件
本目录下的脚本是针对DAMO-YOLO的数据集处理脚本,支持如下:
1. VOC标注格式转换为COCO标注格式,并生成train.json,val.json,test.json.
# 使用方法
1. 把图片存放在JPEGImages中,图片后缀需要一致,比如都是jpg或者png等等,不支持混合的图片后缀格式,比如一些是jpg,一些是png。
2. 把VOC标注格式的XML文件存放在Annotations中。
3. 运行voc2coco.py,其中postfix参数是JPEGImages的图片后缀,train_ratio是训练集的比例,val_ratio是验证集的比例,剩下的就是测试集的比例。
================================================
FILE: damo-yolo/voc2coco.py
================================================
import os
import glob
import json
import shutil
import numpy as np
import xml.etree.ElementTree as ET
START_BOUNDING_BOX_ID = 1
def find_classes(path):
    """Collect the distinct object class names found in the annotation files under ``path``.

    Every directory entry is opened as UTF-8 text and parsed as XML; the text
    of the ``name`` child of each ``object`` element is recorded in order of
    first appearance.

    Args:
        path: Directory containing the XML annotation files.

    Returns:
        List of unique class names. The order across files follows
        ``os.listdir`` and is therefore not guaranteed.
    """
    classes = []
    for entry in os.listdir(path):
        xml_path = os.path.join(path, entry)
        try:
            # Context manager so the handle is closed even when parsing fails.
            with open(xml_path, encoding='utf-8') as in_file:
                root = ET.parse(in_file).getroot()
            for obj in root.iter('object'):
                cls = obj.find('name').text
                if cls not in classes:
                    classes.append(cls)
        except Exception as e:
            # Deliberately best-effort: report the unreadable/malformed file and keep scanning.
            print(xml_path, e)
    return classes
def get(root, name):
    """Return the list produced by ``root.findall(name)`` (possibly empty)."""
    matches = root.findall(name)
    return matches
def get_and_check(root, name, length):
    """Find ``name`` under ``root`` and validate how many matches there are.

    Args:
        root: Element to search with ``findall``.
        name: Tag/path to look for.
        length: Expected number of matches; values <= 0 disable the count check.

    Returns:
        The single matching element when ``length == 1``, otherwise the list
        of matches.

    Raises:
        NotImplementedError: If nothing matches, or if ``length`` is positive
            and the number of matches differs from it.
    """
    found = root.findall(name)
    count = len(found)
    if count == 0:
        raise NotImplementedError('Can not find %s in %s.'%(name, root.tag))
    if length > 0 and count != length:
        raise NotImplementedError('The size of %s is supposed to be %d, but is %d.'%(name, length, count))
    return found[0] if length == 1 else found
def convert(xml_list, json_file):
    """Convert a list of VOC-style XML annotation files into one COCO-style JSON file.

    Relies on module-level settings that must exist before the call:
    ``pre_define_categories`` (name -> id mapping, copied, never mutated),
    ``postfix`` (image file extension without the dot),
    ``only_care_pre_define_categories`` (skip objects of unknown classes
    instead of assigning them a new id) and ``START_BOUNDING_BOX_ID``.

    Args:
        xml_list: Paths of the XML files; a file's position in this sequence
            becomes its image id.
        json_file: Path of the JSON file to write.

    Raises:
        NotImplementedError: Propagated from ``get_and_check`` when a required
            element (``size``, ``width``, ``height``, ``name``, ``bndbox`` or a
            box coordinate) is missing or occurs more than once.
    """
    json_dict = {"info":['none'], "license":['none'], "images": [], "annotations": [], "categories": []}
    categories = pre_define_categories.copy()
    bnd_id = START_BOUNDING_BOX_ID
    all_categories = {}  # class name -> number of objects seen (including skipped ones)

    for image_id, xml_f in enumerate(xml_list):
        root = ET.parse(xml_f).getroot()

        # Image file name = annotation file name with its extension replaced by `postfix`.
        stem = os.path.splitext(os.path.basename(xml_f))[0]
        filename = stem + f".{postfix}"

        size = get_and_check(root, 'size', 1)
        width = int(get_and_check(size, 'width', 1).text)
        height = int(get_and_check(size, 'height', 1).text)
        json_dict['images'].append({'file_name': filename, 'height': height, 'width': width, 'id': image_id})

        # Segmentation is not supported; every annotation gets an empty polygon list.
        for obj in get(root, 'object'):
            category = get_and_check(obj, 'name', 1).text
            all_categories[category] = all_categories.get(category, 0) + 1

            if category not in categories:
                if only_care_pre_define_categories:
                    continue
                new_id = len(categories) + 1
                print("[warning] category '{}' not in 'pre_define_categories'({}), create new id: {} automatically".format(category, pre_define_categories, new_id))
                categories[category] = new_id
            category_id = categories[category]

            bndbox = get_and_check(obj, 'bndbox', 1)
            # Coordinates may be written as floats in the XML; truncate to int.
            xmin = int(float(get_and_check(bndbox, 'xmin', 1).text))
            ymin = int(float(get_and_check(bndbox, 'ymin', 1).text))
            xmax = int(float(get_and_check(bndbox, 'xmax', 1).text))
            ymax = int(float(get_and_check(bndbox, 'ymax', 1).text))
            # abs() keeps width/height non-negative even if min/max are swapped in the XML.
            o_width = abs(xmax - xmin)
            o_height = abs(ymax - ymin)

            ann = {'area': o_width*o_height, 'iscrowd': 0, 'image_id':
                   image_id, 'bbox':[xmin, ymin, o_width, o_height],
                   'category_id': category_id, 'id': bnd_id, 'ignore': 0,
                   'segmentation': []}
            json_dict['annotations'].append(ann)
            bnd_id = bnd_id + 1

    for cate, cid in categories.items():
        json_dict['categories'].append({'supercategory': 'none', 'id': cid, 'name': cate})

    # Context manager guarantees the output file is closed even if writing fails.
    with open(json_file, 'w') as json_fp:
        json_fp.write(json.dumps(json_dict))

    print("------------create {} done--------------".format(json_file))
    print("find {} categories: {} -->>> your pre_define_categories {}: {}".format(len(all_categories), all_categories.keys(), len(pre_define_categories), pre_define_categories.keys()))
    print("category: id --> {}".format(categories))
    print(categories.keys())
    print(categories.values())
if __name__ == '__main__':
    # Image file extension (without the dot) used to build the COCO file names.
    postfix = 'jpg'
    # Directory holding the XML annotation files.
    xml_dir = './datasets/Annotations'
    # Output JSON for the training split.
    save_json_train = './datasets/train.json'
    # Output JSON for the validation split.
    save_json_val = './datasets/val.json'
    # Output JSON for the test split.
    save_json_test = './datasets/test.json'

    # Class names; for several classes list them all, e.g. ['dog', 'person', 'cat'].
    classes = []
    # Whether to scan every XML file first to discover the classes.
    get_data_classes = True
    # Whether to keep only the classes listed in `classes`.
    only_care_pre_define_categories = False

    if get_data_classes:
        classes = find_classes(xml_dir)
        only_care_pre_define_categories = False

    # Category ids start at 1 and follow the order of `classes`.
    pre_define_categories = {cls: i + 1 for i, cls in enumerate(classes)}
    print(pre_define_categories)

    # Split ratios; whatever is left after train + val becomes the test split.
    train_ratio = 0.7
    val_ratio = 0.1

    print('xml_dir is {}'.format(xml_dir))
    # Sort first so the seeded shuffle below is reproducible across runs.
    xml_list = np.sort(glob.glob(xml_dir + "/*.xml"))
    np.random.seed(100)
    np.random.shuffle(xml_list)

    total = len(xml_list)
    train_num = int(total*train_ratio)
    val_num = int(total*val_ratio)
    print('训练样本数目是 {}'.format(train_num))
    print('验证样本数目是 {}'.format(val_num))
    print('测试样本数目是 {}'.format(total - train_num - val_num))

    # Layout of the shuffled list: [validation | training | test].
    split_end = train_num + val_num
    xml_list_val = xml_list[:val_num]
    xml_list_train = xml_list[val_num:split_end]
    xml_list_test = xml_list[split_end:]

    # Convert each split to its own COCO JSON file (train, then val, then test).
    for subset, target in (
        (xml_list_train, save_json_train),
        (xml_list_val, save_json_val),
        (xml_list_test, save_json_test),
    ):
        convert(subset, target)
================================================
FILE: data-offline-aug/object_detection_data_aug.py
================================================
import warnings
warnings.filterwarnings('ignore')
import os, shutil, cv2, tqdm
import numpy as np
import albumentations as A
from PIL import Image
from multiprocessing import Pool
from typing import Callable, Dict, List, Union
# https://github.com/albumentations-team/albumentations
# https://albumentations.ai/docs/api_reference/augmentations/geometric/transforms/#geometric-transforms-augmentationsgeometrictransforms:~:text=Contributing%20to%20Albumentations-,Geometric%20transforms%20(augmentations.geometric.transforms),-%C2%B6
# ---------------------------------------------------------------------------
# Script configuration.
# ---------------------------------------------------------------------------
# Source images and labels.
IMAGE_PATH = 'dataset/object_detection/images'
LABEL_PATH = 'dataset/object_detection/labels'
# Destination directories for augmented data (presumably written by code outside this view).
AUG_IMAGE_PATH = 'dataset/object_detection/images_aug'
AUG_LABEL_PATH = 'dataset/object_detection/labels_aug'
# Directory where show_labels() saves images with the boxes drawn on them.
SHOW_SAVE_PATH = 'results'
# Class names, indexed by the integer class id stored in each label row.
CLASSES = ['head', 'person']

# NOTE(review): presumably the number of augmentation passes per image - not used in the visible code; confirm.
ENHANCEMENT_LOOP = 1
# Augmentation pipeline: a group of geometric transforms followed by a group of
# pixel-level transforms. Both groups always run (p=1.0); each transform inside
# them fires independently with its own probability p.
# Bounding boxes are expected in YOLO format with class ids passed through the
# `class_labels` field; min_visibility=0.1 is handed to albumentations' BboxParams.
ENHANCEMENT_STRATEGY = A.Compose([
    A.Compose([
        A.Affine(scale=[0.5, 1.5], translate_percent=[0.0, 0.3], rotate=[-360, 360], shear=[-45, 45], keep_ratio=True, p=0.5), # Augmentation to apply affine transformations to images.
        A.BBoxSafeRandomCrop(erosion_rate=0.2, p=0.1), # Crop a random part of the input without loss of bboxes.
        A.D4(p=0.1), # Applies one of the eight possible D4 dihedral group transformations to a square-shaped input, maintaining the square shape. These transformations correspond to the symmetries of a square, including rotations and reflections.
        A.ElasticTransform(p=0.1), # Elastic deformation of images as described in [Simard2003]_ (with modifications).
        A.Flip(p=0.1), # Flip the input either horizontally, vertically or both horizontally and vertically.
        A.GridDistortion(p=0.1), # Applies grid distortion augmentation to images, masks, and bounding boxes. This technique involves dividing the image into a grid of cells and randomly displacing the intersection points of the grid, resulting in localized distortions.
        A.Perspective(p=0.1), # Perform a random four point perspective transform of the input.
    ], p=1.0),
    A.Compose([
        A.GaussNoise(p=0.1), # Apply Gaussian noise to the input image.
        A.ISONoise(p=0.1), # Apply camera sensor noise.
        A.ImageCompression(quality_lower=50, quality_upper=100, p=0.1), # Decreases image quality by Jpeg, WebP compression of an image.
        A.RandomBrightnessContrast(p=0.1), # Randomly change brightness and contrast of the input image.
        A.RandomFog(p=0.1), # Simulates fog for the image.
        A.RandomRain(p=0.1), # Adds rain effects to an image.
        A.RandomSnow(p=0.1), # Bleach out some pixel values imitating snow.
        A.RandomShadow(p=0.1), # Simulates shadows for the image
        A.RandomSunFlare(p=0.1), # Simulates Sun Flare for the image
        A.ToGray(p=0.1), # Convert the input RGB image to grayscale
    ], p=1.0)
    # Disabled alternative to the pixel-level group above, wrapped in A.OneOf instead of A.Compose:
    # A.OneOf([
    #     A.GaussNoise(p=1.0), # Apply Gaussian noise to the input image.
    #     A.ISONoise(p=1.0), # Apply camera sensor noise.
    #     A.ImageCompression(quality_lower=50, quality_upper=100, p=1.0), # Decreases image quality by Jpeg, WebP compression of an image.
    #     A.RandomBrightnessContrast(p=1.0), # Randomly change brightness and contrast of the input image.
    #     A.RandomFog(p=1.0), # Simulates fog for the image.
    #     A.RandomRain(p=1.0), # Adds rain effects to an image.
    #     A.RandomSnow(p=1.0), # Bleach out some pixel values imitating snow.
    #     A.RandomShadow(p=1.0), # Simulates shadows for the image
    #     A.RandomSunFlare(p=1.0), # Simulates Sun Flare for the image
    #     A.ToGray(p=1.0), # Convert the input RGB image to grayscale
    # ], p=1.0),
], bbox_params=A.BboxParams(format='yolo', min_visibility=0.1, label_fields=['class_labels']))
def parallelise(function: Callable, data: List, chunksize=100, verbose=True, num_workers=os.cpu_count()) -> List:
    """Apply ``function`` to every element of ``data`` using a process pool.

    Args:
        function: Callable applied to each element. It is dispatched to worker
            processes, so it has to be importable/picklable there.
        data: Elements to process; ``len(data)`` sizes the progress bar.
        chunksize: Number of elements handed to a worker per task.
        verbose: When True a tqdm progress bar is shown.
        num_workers: Number of worker processes. ``None`` (which
            ``os.cpu_count()`` may return) and values below 1 fall back to 1.

    Returns:
        A list with one result per element, in the order of ``data``.
    """
    # os.cpu_count() can be None; comparing None with an int would raise.
    if num_workers is None or num_workers < 1:
        num_workers = 1  # Pool needs to have at least 1 worker.
    pool = Pool(processes=num_workers)
    try:
        results = list(
            tqdm.tqdm(pool.imap(function, data, chunksize), total=len(data), disable=not verbose)
        )
    except BaseException:
        # A failing task (or Ctrl+C) must not leave worker processes behind.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # join() is valid after either close() or terminate().
        pool.join()
    return results
def draw_detections(box, name, img):
    """Draw one labelled rectangle on ``img`` (modified in place) and return it.

    Args:
        box: Four values (xmin, ymin, xmax, ymax); each is truncated with ``int``.
        name: Label to print next to the box; rendered through ``str(name)``.
        img: Three-dimensional image array whose first two axes are height and width.

    Returns:
        The same ``img`` object that was passed in.
    """
    img_h, img_w, _channels = img.shape
    left, top, right, bottom = (int(value) for value in box)
    # Scale the rectangle line width and the text size with the shorter image side.
    short_side = min(img_h, img_w)
    box_stroke = max(1, int(short_side / 200))
    text_scale = short_side / 500
    text_stroke = max(1, int(short_side / 200))
    # Shift the text vertically (above ymin) proportionally to the image size.
    label_origin = (left, top - int(short_side / 50))
    cv2.rectangle(img, (left, top), (right, bottom), (0, 0, 255), box_stroke)
    cv2.putText(img, str(name), label_origin, cv2.FONT_HERSHEY_SIMPLEX, text_scale, (0, 255, 0), text_stroke, lineType=cv2.LINE_AA)
    return img
def show_labels(images_base_path, labels_base_path):
    """Draw YOLO-format labels onto their images and save them to SHOW_SAVE_PATH.

    SHOW_SAVE_PATH is deleted and recreated first. For every file in
    ``images_base_path`` the label file with the same stem and a ``.txt``
    extension is read from ``labels_base_path``; each row is expected to be
    ``class x_center y_center width height`` with coordinates normalised to
    the image size. Files without a label file, or that OpenCV cannot decode,
    are reported and skipped.

    Args:
        images_base_path: Directory containing the images.
        labels_base_path: Directory containing the ``.txt`` label files.
    """
    if os.path.exists(SHOW_SAVE_PATH):
        shutil.rmtree(SHOW_SAVE_PATH)
    os.makedirs(SHOW_SAVE_PATH, exist_ok=True)

    for images_name in tqdm.tqdm(os.listdir(images_base_path)):
        file_heads, _ = os.path.splitext(images_name)
        images_path = os.path.join(images_base_path, images_name)
        labels_path = os.path.join(labels_base_path, f'{file_heads}.txt')
        if not os.path.exists(labels_path):
            print(f'{labels_path} label file not found...')
            continue

        images = cv2.imread(images_path)
        # cv2.imread signals an unreadable / non-image file by returning None.
        if images is None:
            print(f'{images_path} could not be read as an image, skipped...')
            continue

        with open(labels_path) as f:
            # Ignore blank lines so a trailing newline does not break parsing.
            rows = [line.split() for line in f if line.strip()]
        labels = np.array(rows, dtype=np.float64)

        height, width, _ = images.shape
        for cls, x_center, y_center, w, h in labels:
            # Convert normalised centre/size to pixel corner coordinates.
            x_center *= width
            y_center *= height
            half_w = w * width / 2  # true division: `//` would floor the half extent
            half_h = h * height / 2
            draw_detections([x_center - half_w, y_center - half_h, x_center + half_w, y_center + half_h], CLASSES[int(cls)], images)
        cv2.imwrite(os.path.join(SHOW_SAVE_PATH, images_name), images)
        print(f'{SHOW_SAVE_PATH}/{images_name} save success...')
def data_aug_single(images_name):
    """Create ENHANCEMENT_LOOP augmented copies of one image and its YOLO labels.

    The image is read from IMAGE_PATH and its label file (same stem, ``.txt``)
    from LABEL_PATH. Each successful pass through ENHANCEMENT_STRATEGY writes
    ``<stem>_<iii><ext>`` to AUG_IMAGE_PATH and ``<stem>_<iii>.txt`` to
    AUG_LABEL_PATH. A pass that raises is reported and skipped.

    Args:
        images_name: File name (not path) of the image inside IMAGE_PATH.
    """
    file_heads, postfix = os.path.splitext(images_name)
    images_path = os.path.join(IMAGE_PATH, images_name)
    labels_path = os.path.join(LABEL_PATH, f'{file_heads}.txt')
    if not os.path.exists(labels_path):
        print(f'{labels_path} label file not found...')
        return

    with open(labels_path) as f:
        # Ignore blank lines so a trailing newline does not break parsing.
        rows = [line.split() for line in f if line.strip()]
    labels = np.array(rows, dtype=np.float64)
    if labels.ndim != 2 or labels.shape[1] != 5:
        print(f'{labels_path} has no valid "class x y w h" rows, skipped...')
        return

    try:
        with Image.open(images_path) as img:
            # Force 3-channel RGB so the RGB->BGR conversion below also works
            # for grayscale, palette and RGBA inputs.
            image_array = np.array(img.convert('RGB'))
    except OSError as err:
        print(f'{images_path} could not be read as an image ({err!r}), skipped...')
        return

    # Clamp normalised box coordinates into [0, 1] before augmenting.
    bboxes = np.clip(labels[:, 1:], 0, 1)
    class_labels = labels[:, 0]

    for i in range(ENHANCEMENT_LOOP):
        new_images_name = os.path.join(AUG_IMAGE_PATH, f'{file_heads}_{i:0>3}{postfix}')
        new_labels_name = os.path.join(AUG_LABEL_PATH, f'{file_heads}_{i:0>3}.txt')
        try:
            # Pass a fresh copy each round, as the original per-iteration np.array() did.
            transformed = ENHANCEMENT_STRATEGY(image=image_array.copy(), bboxes=bboxes, class_labels=class_labels)
        except Exception as err:
            # Best effort: report the failed pass instead of hiding it, then go on.
            print(f'{images_path} augmentation {i} failed ({err!r}), skipped...')
            continue
        transformed_image = transformed['image']
        transformed_bboxes = transformed['bboxes']
        transformed_class_labels = transformed['class_labels']

        cv2.imwrite(new_images_name, cv2.cvtColor(transformed_image, cv2.COLOR_RGB2BGR))
        with open(new_labels_name, 'w') as f:
            for bbox, cls in zip(transformed_bboxes, transformed_class_labels):
                # Class ids were parsed as floats; write them back as integers.
                f.write(f'{int(cls)} {bbox[0]} {bbox[1]} {bbox[2]} {bbox[3]}\n')
        print(f'{new_images_name} and {new_labels_name} save success...')
def data_aug():
    """Reset both augmentation output folders, then augment every file in IMAGE_PATH."""
    output_dirs = (AUG_IMAGE_PATH, AUG_LABEL_PATH)
    # Remove results of a previous run before recreating the folders.
    for stale_dir in output_dirs:
        if os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)
    for fresh_dir in output_dirs:
        os.makedirs(fresh_dir, exist_ok=True)

    file_names = os.listdir(IMAGE_PATH)
    for images_name in tqdm.tqdm(file_names):
        data_aug_single(images_name)
if __name__ == '__main__':
    # Enable exactly the stage you want to run by (un)commenting the calls below.
    # data_aug()
    # show_labels(IMAGE_PATH, LABEL_PATH)
    show_labels(images_base_path=AUG_IMAGE_PATH, labels_base_path=AUG_LABEL_PATH)
================================================
FILE: data-offline-aug/readme.md
================================================
# data-offline-aug
### 环境
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple albumentations
### 1. object_detection_data_aug.py
目标检测数据集yolo格式离线数据增强脚本.
视频教程链接:https://www.bilibili.com/video/BV1bT421k7iq/
### 2. segment_data_aug.py
语义分割离线数据增强脚本.
视频教程链接:https://www.bilibili.com/video/BV1xi421a7Gb/
# Reference
https://github.com/albumentations-team/albumentations
================================================
FILE: data-offline-aug/segment_data_aug.py
================================================
import warnings
warnings.filterwarnings('ignore')
import os, shutil, cv2, tqdm
import numpy as np
np.random.seed(0)
import albumentations as A
from PIL import Image
from multiprocessing import Pool
from typing import Callable, Dict, List, Union
# https://github.com/albumentations-team/albumentations
def generate_color_map(num_classes):
    """Build a colour lookup table with ``num_classes + 1`` rows.

    Row 0 is ``[0, 0, 0]``. The remaining rows come from hues spaced as
    ``i * 180 // num_classes`` at full saturation/value, converted with
    OpenCV's ``COLOR_HSV2BGR``.

    Args:
        num_classes: Number of generated colours (in addition to row 0).

    Returns:
        ``np.uint8`` array of shape ``(num_classes + 1, 3)``.
    """
    palette = [[0, 0, 0]]
    for index in range(num_classes):
        hue = index * 180 // num_classes
        # cvtColor needs an image, so wrap the single HSV triple as a 1x1 picture.
        hsv_pixel = np.uint8([[(hue, 255, 255)]])
        palette.append(cv2.cvtColor(hsv_pixel, cv2.COLOR_HSV2BGR)[0][0])
    return np.array(palette, dtype=np.uint8)
# Input dataset: images and their segmentation masks (paired by file stem).
IMAGE_PATH = 'dataset/segment/images'
LABEL_PATH = 'dataset/segment/labels'
# Output folders for augmented images / masks; data_aug() deletes and recreates them.
AUG_IMAGE_PATH = 'dataset/segment/images_aug'
AUG_LABEL_PATH = 'dataset/segment/labels_aug'
# Folder for overlay visualisations; show_labels() deletes and recreates it.
SHOW_SAVE_PATH = 'results'
# Colour lookup table indexed by mask value in draw_segments(): row 0 plus 20 generated colours.
COLORS = generate_color_map(20)
# Number of augmented copies attempted per input image.
ENHANCEMENT_LOOP = 1
# Augmentation pipeline applied to each image / mask pair by data_aug_single().
# It chains two inner Compose groups: geometric transforms first, then pixel-level
# effects. Every transform carries its own probability `p`, so any number of them
# (including none) may fire for a given sample.
# NOTE(review): BBoxSafeRandomCrop is built around bounding boxes, but this script only
# passes `image` and `masks` — confirm it behaves as intended without bboxes; a failure
# here would be hidden by the bare `except` in data_aug_single().
# is_check_shapes=False disables albumentations' image/mask shape consistency check.
ENHANCEMENT_STRATEGY = A.Compose([
A.Compose([
A.Affine(scale=[0.5, 1.5], translate_percent=[0.0, 0.3], rotate=[-360, 360], shear=[-45, 45], keep_ratio=True, cval_mask=0, p=0.5), # Augmentation to apply affine transformations to images.
A.BBoxSafeRandomCrop(erosion_rate=0.2, p=0.1), # Crop a random part of the input without loss of bboxes.
A.D4(p=0.1), # Applies one of the eight possible D4 dihedral group transformations to a square-shaped input, maintaining the square shape. These transformations correspond to the symmetries of a square, including rotations and reflections.
A.ElasticTransform(p=0.1), # Elastic deformation of images as described in [Simard2003]_ (with modifications).
A.Flip(p=0.1), # Flip the input either horizontally, vertically or both horizontally and vertically.
A.GridDistortion(p=0.1), # Applies grid distortion augmentation to images, masks, and bounding boxes. This technique involves dividing the image into a grid of cells and randomly displacing the intersection points of the grid, resulting in localized distortions.
A.Perspective(p=0.1), # Perform a random four point perspective transform of the input.
], p=1.0),
A.Compose([
A.GaussNoise(p=0.1), # Apply Gaussian noise to the input image.
A.ISONoise(p=0.1), # Apply camera sensor noise.
A.ImageCompression(quality_lower=50, quality_upper=100, p=0.1), # Decreases image quality by Jpeg, WebP compression of an image.
A.RandomBrightnessContrast(p=0.1), # Randomly change brightness and contrast of the input image.
A.RandomFog(p=0.1), # Simulates fog for the image.
A.RandomRain(p=0.1), # Adds rain effects to an image.
A.RandomSnow(p=0.1), # Bleach out some pixel values imitating snow.
A.RandomShadow(p=0.1), # Simulates shadows for the image
A.RandomSunFlare(p=0.1), # Simulates Sun Flare for the image
A.ToGray(p=0.1), # Convert the input RGB image to grayscale
], p=1.0)
# Alternative (disabled): apply exactly one pixel-level effect per sample via OneOf.
# A.OneOf([
# A.GaussNoise(p=1.0), # Apply Gaussian noise to the input image.
# A.ISONoise(p=1.0), # Apply camera sensor noise.
# A.ImageCompression(quality_lower=50, quality_upper=100, p=1.0), # Decreases image quality by Jpeg, WebP compression of an image.
# A.RandomBrightnessContrast(p=1.0), # Randomly change brightness and contrast of the input image.
# A.RandomFog(p=1.0), # Simulates fog for the image.
# A.RandomRain(p=1.0), # Adds rain effects to an image.
# A.RandomSnow(p=1.0), # Bleach out some pixel values imitating snow.
# A.RandomShadow(p=1.0), # Simulates shadows for the image
# A.RandomSunFlare(p=1.0), # Simulates Sun Flare for the image
# A.ToGray(p=1.0), # Convert the input RGB image to grayscale
# ], p=1.0),
], is_check_shapes=False)
def draw_segments(image, mask):
    """Blend the per-pixel class colours of ``mask`` over ``image``.

    Args:
        image: Image array passed to ``cv2.addWeighted`` with weight 0.7.
        mask: Integer array used to index the COLORS table; the resulting
            colour layer is blended with weight 0.3.

    Returns:
        The blended image produced by ``cv2.addWeighted``.
    """
    color_layer = COLORS[mask]
    return cv2.addWeighted(image, 0.7, color_layer, 0.3, 0)
def show_labels(images_base_path, labels_base_path):
    """Overlay segmentation masks on their images and save them to SHOW_SAVE_PATH.

    SHOW_SAVE_PATH is deleted and recreated first. For every file in
    ``images_base_path`` the mask with the same stem and a ``.png`` extension
    is read from ``labels_base_path``. Files without a mask, or that OpenCV
    cannot decode, are reported and skipped.

    Args:
        images_base_path: Directory containing the images.
        labels_base_path: Directory containing the ``.png`` masks.
    """
    if os.path.exists(SHOW_SAVE_PATH):
        shutil.rmtree(SHOW_SAVE_PATH)
    os.makedirs(SHOW_SAVE_PATH, exist_ok=True)

    for images_name in tqdm.tqdm(os.listdir(images_base_path)):
        file_heads, _ = os.path.splitext(images_name)
        images_path = os.path.join(images_base_path, images_name)
        labels_path = os.path.join(labels_base_path, f'{file_heads}.png')
        if not os.path.exists(labels_path):
            print(f'{labels_path} label file not found...')
            continue

        images = cv2.imread(images_path)
        # cv2.imread signals an unreadable / non-image file by returning None.
        if images is None:
            print(f'{images_path} could not be read as an image, skipped...')
            continue

        masks = np.array(Image.open(labels_path))
        # Show which class ids occur in this mask.
        print(np.unique(masks))
        images = draw_segments(images, masks)
        cv2.imwrite(os.path.join(SHOW_SAVE_PATH, images_name), images)
        print(f'{SHOW_SAVE_PATH}/{images_name} save success...')
def data_aug_single(images_name):
    """Create ENHANCEMENT_LOOP augmented copies of one image and its mask.

    The image is read from IMAGE_PATH. Its mask is looked up in LABEL_PATH by
    file stem, trying ``.jpg`` first (the historical lookup) and then ``.png``
    (the extension show_labels() reads and this function writes). Each
    successful pass through ENHANCEMENT_STRATEGY writes ``<stem>_<iii><ext>``
    to AUG_IMAGE_PATH and ``<stem>_<iii>.png`` to AUG_LABEL_PATH. A pass that
    raises is reported and skipped.

    Args:
        images_name: File name (not path) of the image inside IMAGE_PATH.
    """
    file_heads, postfix = os.path.splitext(images_name)
    images_path = os.path.join(IMAGE_PATH, images_name)
    candidates = [os.path.join(LABEL_PATH, f'{file_heads}{ext}') for ext in ('.jpg', '.png')]
    labels_path = next((path for path in candidates if os.path.exists(path)), None)
    if labels_path is None:
        print(f'{" or ".join(candidates)} label file not found...')
        return

    try:
        with Image.open(images_path) as img:
            # Force 3-channel RGB so the RGB->BGR conversion below also works
            # for grayscale, palette and RGBA inputs.
            image_array = np.array(img.convert('RGB'))
        with Image.open(labels_path) as mask_img:
            masks = np.array(mask_img)
    except OSError as err:
        print(f'{images_path} / {labels_path} could not be read ({err!r}), skipped...')
        return

    for i in range(ENHANCEMENT_LOOP):
        new_images_name = os.path.join(AUG_IMAGE_PATH, f'{file_heads}_{i:0>3}{postfix}')
        new_labels_name = os.path.join(AUG_LABEL_PATH, f'{file_heads}_{i:0>3}.png')
        try:
            # Pass a fresh copy each round, as the original per-iteration np.array() did.
            transformed = ENHANCEMENT_STRATEGY(image=image_array.copy(), masks=[masks])
        except Exception as err:
            # Best effort: report the failed pass instead of hiding it, then go on.
            print(f'{images_path} augmentation {i} failed ({err!r}), skipped...')
            continue
        transformed_image = transformed['image']
        transformed_masks = transformed['masks'][0]

        cv2.imwrite(new_images_name, cv2.cvtColor(transformed_image, cv2.COLOR_RGB2BGR))
        Image.fromarray(np.array(transformed_masks)).save(new_labels_name)
        print(f'{new_images_name} and {new_labels_name} save success...')
def data_aug():
    """Reset both augmentation output folders, then augment every file in IMAGE_PATH."""
    output_dirs = (AUG_IMAGE_PATH, AUG_LABEL_PATH)
    # Remove results of a previous run before recreating the folders.
    for stale_dir in output_dirs:
        if os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)
    for fresh_dir in output_dirs:
        os.makedirs(fresh_dir, exist_ok=True)

    file_names = os.listdir(IMAGE_PATH)
    for images_name in tqdm.tqdm(file_names):
        data_aug_single(images_name)
if __name__ == '__main__':
    # Enable exactly the stage you want to run by (un)commenting the calls below.
    show_labels(images_base_path=IMAGE_PATH, labels_base_path=LABEL_PATH)
    # show_labels(AUG_IMAGE_PATH, AUG_LABEL_PATH)
    # data_aug()
================================================
FILE: mmdet-course/config/atss_r50_fpn_dyhead_1x_visdrone.py
================================================
# VisDrone variant of the ATSS + DyHead R50-FPN 1x config: everything not set here
# comes from the `_base_` COCO config named below.
_base_ = 'atss_r50_fpn_dyhead_1x_coco.py'
# The head is resized to the 10 categories listed in `metainfo['classes']`.
model = dict(
bbox_head=dict(
num_classes=10
)
)
# Dataset-related configuration overrides
# NOTE: absolute path on the author's machine — point it at your own dataset root.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
metainfo = {
'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
# 'palette': [
# (220, 20, 60),
# ]
}
# The three dataloaders share data_root/metainfo; ann_file and data_prefix are given relative to data_root.
train_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-train/annotations/train.json',
data_prefix=dict(img='VisDrone2019-DET-train/images/')))
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-val/annotations/val.json',
data_prefix=dict(img='VisDrone2019-DET-val/images/')))
test_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
data_prefix=dict(img='VisDrone2019-DET-test-dev/images/')))
# Evaluation metric configuration overrides
# Unlike the dataloaders, the evaluators get the full annotation path (data_root is prepended here).
val_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json')
test_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json')
# Uncomment to switch the optimizer wrapper to AmpOptimWrapper.
# optim_wrapper = dict(type='AmpOptimWrapper')
# Logger hook interval set to 200.
default_hooks = dict(logger=dict(type='LoggerHook', interval=200))
# Checkpoint to initialise from; a bare file name, so presumably resolved against the working directory — TODO confirm.
load_from='atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314-eaa620c6.pth'
# nohup python tools/train.py configs/dyhead/atss_r50_fpn_dyhead_1x_visdrone.py > atss-dyhead-visdrone.log 2>&1 & tail -f atss-dyhead-visdrone.log
# python tools/test.py configs/dyhead/atss_r50_fpn_dyhead_1x_visdrone.py work_dirs/atss_r50_fpn_dyhead_1x_visdrone/epoch_12.pth --show --show-dir test_save
# python tools/test.py configs/dyhead/atss_r50_fpn_dyhead_1x_visdrone.py work_dirs/atss_r50_fpn_dyhead_1x_visdrone/epoch_12.pth --tta
================================================
FILE: mmdet-course/config/cascade-rcnn_r50_fpn_1x_visdrone.py
================================================
# VisDrone variant of the Cascade R-CNN R50-FPN 1x config: everything not set here
# comes from the `_base_` COCO config named below.
_base_ = './cascade-rcnn_r50_fpn_1x_coco.py'
# We also need to change num_classes in the heads to match the number of classes in the dataset
# (one entry per bbox head in the list; all three are set to 10).
model = dict(
roi_head=dict(
bbox_head=[
dict(
type='Shared2FCBBoxHead',
num_classes=10
),
dict(
type='Shared2FCBBoxHead',
num_classes=10
),
dict(
type='Shared2FCBBoxHead',
num_classes=10
),
]
)
)
# Dataset-related configuration overrides
# NOTE: absolute path on the author's machine — point it at your own dataset root.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
metainfo = {
'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
# 'palette': [
# (220, 20, 60),
# ]
}
# The three dataloaders share data_root/metainfo; ann_file and data_prefix are given relative to data_root.
train_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-train/annotations/train.json',
data_prefix=dict(img='VisDrone2019-DET-train/images/')))
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-val/annotations/val.json',
data_prefix=dict(img='VisDrone2019-DET-val/images/')))
test_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
data_prefix=dict(img='VisDrone2019-DET-test-dev/images/')))
# Evaluation metric configuration overrides
# Unlike the dataloaders, the evaluators get the full annotation path (data_root is prepended here).
val_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json')
test_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json')
# Uncomment to switch the optimizer wrapper to AmpOptimWrapper.
# optim_wrapper = dict(type='AmpOptimWrapper')
# Logger hook interval set to 200.
default_hooks = dict(logger=dict(type='LoggerHook', interval=200))
# Checkpoint to initialise from; a bare file name, so presumably resolved against the working directory — TODO confirm.
load_from='cascade_rcnn_r50_fpn_1x_coco_20200316-3dc56deb.pth'
# nohup python tools/train.py configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_visdrone.py > cascade-rcnn-visdrone.log 2>&1 & tail -f cascade-rcnn-visdrone.log
# python tools/test.py configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_visdrone.py work_dirs/cascade-rcnn_r50_fpn_1x_visdrone/epoch_12.pth --show --show-dir test_save
# python tools/test.py configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_visdrone.py work_dirs/cascade-rcnn_r50_fpn_1x_visdrone/epoch_12.pth --tta
================================================
FILE: mmdet-course/config/ddq-detr-4scale_r50_8xb2-12e_visdrone.py
================================================
# VisDrone variant of the DDQ-DETR 4-scale R50 config: everything not set here
# comes from the `_base_` COCO config named below.
_base_ = 'ddq-detr-4scale_r50_8xb2-12e_coco.py'
# The head is resized to the 10 categories listed in `metainfo['classes']`.
model = dict(
bbox_head=dict(
type='DDQDETRHead',
num_classes=10
)
)
# Dataset-related configuration overrides
# NOTE: absolute path on the author's machine — point it at your own dataset root.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
metainfo = {
'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
# 'palette': [
# (220, 20, 60),
# ]
}
# The three dataloaders share data_root/metainfo; ann_file and data_prefix are given relative to data_root.
train_dataloader = dict(
batch_size=2,
num_workers=2,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-train/annotations/train.json',
data_prefix=dict(img='VisDrone2019-DET-train/images/')))
val_dataloader = dict(
batch_size=2,
num_workers=2,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-val/annotations/val.json',
data_prefix=dict(img='VisDrone2019-DET-val/images/')))
test_dataloader = dict(
batch_size=2,
num_workers=2,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
data_prefix=dict(img='VisDrone2019-DET-test-dev/images/')))
# Evaluation metric configuration overrides
# Unlike the dataloaders, the evaluators get the full annotation path (data_root is prepended here).
val_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json')
test_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json')
# Uncomment to switch the optimizer wrapper to AmpOptimWrapper.
# optim_wrapper = dict(type='AmpOptimWrapper')
# Logger hook interval set to 1000.
default_hooks = dict(logger=dict(type='LoggerHook', interval=1000))
# Checkpoint to initialise from; a bare file name, so presumably resolved against the working directory — TODO confirm.
load_from='ddq-detr-4scale_r50_8xb2-12e_coco_20230809_170711-42528127.pth'
# nohup python tools/train.py configs/ddq/ddq-detr-4scale_r50_8xb2-12e_visdrone.py > ddq-visdrone.log 2>&1 & tail -f ddq-visdrone.log
# python tools/test.py configs/ddq/ddq-detr-4scale_r50_8xb2-12e_visdrone.py work_dirs/ddq-detr-4scale_r50_8xb2-12e_visdrone/epoch_12.pth --show --show-dir test_save
# python tools/test.py configs/ddq/ddq-detr-4scale_r50_8xb2-12e_visdrone.py work_dirs/ddq-detr-4scale_r50_8xb2-12e_visdrone/epoch_12.pth --tta
================================================
FILE: mmdet-course/config/dino-4scale_r50_8xb2-12e_visdrone.py
================================================
# VisDrone variant of the DINO 4-scale R50 config: everything not set here
# comes from the `_base_` COCO config named below.
_base_ = 'dino-4scale_r50_8xb2-12e_coco.py'
# The head is resized to the 10 categories listed in `metainfo['classes']`.
model = dict(
bbox_head=dict(
type='DINOHead',
num_classes=10,
)
)
# Dataset-related configuration overrides
# NOTE: absolute path on the author's machine — point it at your own dataset root.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
metainfo = {
'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
# 'palette': [
# (220, 20, 60),
# ]
}
# The three dataloaders share data_root/metainfo; ann_file and data_prefix are given relative to data_root.
train_dataloader = dict(
batch_size=4,
num_workers=4,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-train/annotations/train.json',
data_prefix=dict(img='VisDrone2019-DET-train/images/')))
val_dataloader = dict(
batch_size=4,
num_workers=4,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-val/annotations/val.json',
data_prefix=dict(img='VisDrone2019-DET-val/images/')))
test_dataloader = dict(
batch_size=4,
num_workers=4,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
data_prefix=dict(img='VisDrone2019-DET-test-dev/images/')))
# Evaluation metric configuration overrides
# Unlike the dataloaders, the evaluators get the full annotation path (data_root is prepended here).
val_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json')
test_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json')
# Uncomment to switch the optimizer wrapper to AmpOptimWrapper.
# optim_wrapper = dict(type='AmpOptimWrapper')
# Logger hook interval set to 500.
default_hooks = dict(logger=dict(type='LoggerHook', interval=500))
# Checkpoint to initialise from; a bare file name, so presumably resolved against the working directory — TODO confirm.
load_from='dino-4scale_r50_8xb2-12e_coco_20221202_182705-55b2bba2.pth'
# nohup python tools/train.py configs/dino/dino-4scale_r50_8xb2-12e_visdrone.py > dino-visdrone.log 2>&1 & tail -f dino-visdrone.log
# python tools/test.py configs/dino/dino-4scale_r50_8xb2-12e_visdrone.py work_dirs/dino-4scale_r50_8xb2-12e_visdrone/epoch_12.pth --show --show-dir test_save
# python tools/test.py configs/dino/dino-4scale_r50_8xb2-12e_visdrone.py work_dirs/dino-4scale_r50_8xb2-12e_visdrone/epoch_12.pth --tta
================================================
FILE: mmdet-course/config/faster-rcnn_r50_fpn_ciou_1x_visdrone.py
================================================
# VisDrone variant of the Faster R-CNN R50-FPN (CIoU) 1x config: everything not set here
# comes from the `_base_` COCO config named below.
_base_ = 'faster-rcnn_r50_fpn_ciou_1x_coco.py'
# We also need to change num_classes in the head to match the number of classes in the dataset
model = dict(
roi_head=dict(
bbox_head=dict(
type='Shared2FCBBoxHead',
num_classes=10
)
)
)
# Dataset-related configuration overrides
# NOTE: absolute path on the author's machine — point it at your own dataset root.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
metainfo = {
'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
# 'palette': [
# (220, 20, 60),
# ]
}
# The three dataloaders share data_root/metainfo; ann_file and data_prefix are given relative to data_root.
train_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-train/annotations/train.json',
data_prefix=dict(img='VisDrone2019-DET-train/images/')))
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-val/annotations/val.json',
data_prefix=dict(img='VisDrone2019-DET-val/images/')))
test_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
data_prefix=dict(img='VisDrone2019-DET-test-dev/images/')))
# Evaluation metric configuration overrides
# Unlike the dataloaders, the evaluators get the full annotation path (data_root is prepended here).
val_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json')
test_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json')
# Uncomment to switch the optimizer wrapper to AmpOptimWrapper.
# optim_wrapper = dict(type='AmpOptimWrapper')
# Logger hook interval set to 200.
default_hooks = dict(logger=dict(type='LoggerHook', interval=200))
# Checkpoint to initialise from; a bare file name, so presumably resolved against the working directory — TODO confirm.
# NOTE(review): the checkpoint name says "giou" while the base config is the "ciou" variant — confirm this is intentional.
load_from='faster_rcnn_r50_fpn_giou_1x_coco-0eada910.pth'
# nohup python tools/train.py configs/faster_rcnn/faster-rcnn_r50_fpn_ciou_1x_visdrone.py > faster-rcnn-visdrone.log 2>&1 & tail -f faster-rcnn-visdrone.log
# python tools/test.py configs/faster_rcnn/faster-rcnn_r50_fpn_ciou_1x_visdrone.py work_dirs/faster-rcnn_r50_fpn_ciou_1x_visdrone/epoch_12.pth --show --show-dir test_save
# python tools/test.py configs/faster_rcnn/faster-rcnn_r50_fpn_ciou_1x_visdrone.py work_dirs/faster-rcnn_r50_fpn_ciou_1x_visdrone/epoch_12.pth --tta
================================================
FILE: mmdet-course/config/gfl_r50_fpn_1x_visdrone.py
================================================
# VisDrone variant of the GFL R50-FPN 1x config: everything not set here
# comes from the `_base_` COCO config named below.
_base_ = 'gfl_r50_fpn_1x_coco.py'
# We also need to change num_classes in the head to match the number of classes in the dataset
model = dict(
bbox_head=dict(
num_classes=10
)
)
# Dataset-related configuration overrides
# NOTE: absolute path on the author's machine — point it at your own dataset root.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
metainfo = {
'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
# 'palette': [
# (220, 20, 60),
# ]
}
# The three dataloaders share data_root/metainfo; ann_file and data_prefix are given relative to data_root.
train_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-train/annotations/train.json',
data_prefix=dict(img='VisDrone2019-DET-train/images/')))
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-val/annotations/val.json',
data_prefix=dict(img='VisDrone2019-DET-val/images/')))
test_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
data_prefix=dict(img='VisDrone2019-DET-test-dev/images/')))
# Evaluation metric configuration overrides
# Unlike the dataloaders, the evaluators get the full annotation path (data_root is prepended here).
val_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json')
test_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json')
# Uncomment to switch the optimizer wrapper to AmpOptimWrapper.
# optim_wrapper = dict(type='AmpOptimWrapper')
# Logger hook interval set to 200.
default_hooks = dict(logger=dict(type='LoggerHook', interval=200))
# Checkpoint to initialise from; a bare file name, so presumably resolved against the working directory — TODO confirm.
load_from='gfl_r50_fpn_1x_coco_20200629_121244-25944287.pth'
# nohup python tools/train.py configs/gfl/gfl_r50_fpn_1x_visdrone.py > gfl-visdrone.log 2>&1 & tail -f gfl-visdrone.log
# python tools/test.py configs/gfl/gfl_r50_fpn_1x_visdrone.py work_dirs/gfl_r50_fpn_1x_visdrone/epoch_12.pth --show --show-dir test_save
# python tools/test.py configs/gfl/gfl_r50_fpn_1x_visdrone.py work_dirs/gfl_r50_fpn_1x_visdrone/epoch_12.pth --tta
# python tools/analysis_tools/get_flops.py configs/gfl/gfl_r50_fpn_1x_visdrone.py
================================================
FILE: mmdet-course/config/retinanet_r50_fpn_1x_visdrone.py
================================================
# VisDrone variant of the RetinaNet R50-FPN 1x config: everything not set here
# comes from the `_base_` COCO config named below.
_base_ = 'retinanet_r50_fpn_1x_coco.py'
# We also need to change num_classes in the head to match the number of classes in the dataset
model = dict(
bbox_head=dict(
num_classes=10
)
)
# Dataset-related configuration overrides
# NOTE: absolute path on the author's machine — point it at your own dataset root.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
metainfo = {
'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
# 'palette': [
# (220, 20, 60),
# ]
}
# The three dataloaders share data_root/metainfo; ann_file and data_prefix are given relative to data_root.
train_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-train/annotations/train.json',
data_prefix=dict(img='VisDrone2019-DET-train/images/')))
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-val/annotations/val.json',
data_prefix=dict(img='VisDrone2019-DET-val/images/')))
test_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
data_prefix=dict(img='VisDrone2019-DET-test-dev/images/')))
# Evaluation metric configuration overrides
# Unlike the dataloaders, the evaluators get the full annotation path (data_root is prepended here).
val_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json')
test_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json')
# Uncomment to switch the optimizer wrapper to AmpOptimWrapper.
# optim_wrapper = dict(type='AmpOptimWrapper')
# Logger hook interval set to 200.
default_hooks = dict(logger=dict(type='LoggerHook', interval=200))
# Checkpoint to initialise from; a bare file name, so presumably resolved against the working directory — TODO confirm.
load_from='retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth'
# nohup python tools/train.py configs/retinanet/retinanet_r50_fpn_1x_visdrone.py > retinanet-visdrone.log 2>&1 & tail -f retinanet-visdrone.log
# python tools/test.py configs/retinanet/retinanet_r50_fpn_1x_visdrone.py work_dirs/retinanet_r50_fpn_1x_visdrone/epoch_12.pth --show --show-dir test_save
# python tools/test.py configs/retinanet/retinanet_r50_fpn_1x_visdrone.py work_dirs/retinanet_r50_fpn_1x_visdrone/epoch_12.pth --tta
# python tools/analysis_tools/get_flops.py configs/retinanet/retinanet_r50_fpn_1x_visdrone.py
================================================
FILE: mmdet-course/config/rtmdet_tiny_8xb32-300e_visdrone.py
================================================
# VisDrone variant of the RTMDet-tiny 300-epoch config: everything not set here
# comes from the `_base_` COCO config named below.
_base_ = 'rtmdet_tiny_8xb32-300e_coco.py'
# The head is resized to the 10 categories listed in `metainfo['classes']`.
model = dict(
bbox_head=dict(
num_classes=10
)
)
# Dataset-related configuration overrides
# NOTE: absolute path on the author's machine — point it at your own dataset root.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
metainfo = {
'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
# 'palette': [
# (220, 20, 60),
# ]
}
# The three dataloaders share data_root/metainfo; ann_file and data_prefix are given relative to data_root.
train_dataloader = dict(
batch_size=16,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-train/annotations/train.json',
data_prefix=dict(img='VisDrone2019-DET-train/images/')))
val_dataloader = dict(
batch_size=16,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-val/annotations/val.json',
data_prefix=dict(img='VisDrone2019-DET-val/images/')))
test_dataloader = dict(
batch_size=16,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
data_prefix=dict(img='VisDrone2019-DET-test-dev/images/')))
# Evaluation metric configuration overrides
# Unlike the dataloaders, the evaluators get the full annotation path (data_root is prepended here).
val_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json')
test_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json')
# Uncomment to switch the optimizer wrapper to AmpOptimWrapper.
# optim_wrapper = dict(type='AmpOptimWrapper')
# Logger hook interval set to 200.
default_hooks = dict(logger=dict(type='LoggerHook', interval=200))
# Checkpoint to initialise from; a bare file name, so presumably resolved against the working directory — TODO confirm.
load_from='rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth'
# nohup python tools/train.py configs/rtmdet/rtmdet_tiny_8xb32-300e_visdrone.py > rtmdet-tiny-visdrone.log 2>&1 & tail -f rtmdet-tiny-visdrone.log
# python tools/test.py configs/rtmdet/rtmdet_tiny_8xb32-300e_visdrone.py work_dirs/rtmdet_tiny_8xb32-300e_visdrone/epoch_300.pth --show --show-dir test_save
# python tools/test.py configs/rtmdet/rtmdet_tiny_8xb32-300e_visdrone.py work_dirs/rtmdet_tiny_8xb32-300e_visdrone/epoch_300.pth --tta
# python tools/analysis_tools/get_flops.py configs/rtmdet/rtmdet_tiny_8xb32-300e_visdrone.py
================================================
FILE: mmdet-course/config/tood_r50_fpn_1x_visdrone.py
================================================
# VisDrone variant of the TOOD R50-FPN 1x config: everything not set here
# comes from the `_base_` COCO config named below.
_base_ = './tood_r50_fpn_1x_coco.py'
# We also need to change num_classes in the head to match the number of classes in the dataset
model = dict(
bbox_head=dict(
num_classes=10
)
)
# Dataset-related configuration overrides
# NOTE: absolute path on the author's machine — point it at your own dataset root.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
metainfo = {
'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
# 'palette': [
# (220, 20, 60),
# ]
}
# The three dataloaders share data_root/metainfo; ann_file and data_prefix are given relative to data_root.
train_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-train/annotations/train.json',
data_prefix=dict(img='VisDrone2019-DET-train/images/')))
val_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-val/annotations/val.json',
data_prefix=dict(img='VisDrone2019-DET-val/images/')))
test_dataloader = dict(
batch_size=8,
num_workers=8,
dataset=dict(
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
data_prefix=dict(img='VisDrone2019-DET-test-dev/images/')))
# Evaluation metric configuration overrides
# Unlike the dataloaders, the evaluators get the full annotation path (data_root is prepended here).
val_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json')
test_evaluator = dict(ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json')
# Uncomment to switch the optimizer wrapper to AmpOptimWrapper.
# optim_wrapper = dict(type='AmpOptimWrapper')
# Logger hook interval set to 200.
default_hooks = dict(logger=dict(type='LoggerHook', interval=200))
# Checkpoint to initialise from; a bare file name, so presumably resolved against the working directory — TODO confirm.
load_from='tood_r50_fpn_1x_coco_20211210_103425-20e20746.pth'
# nohup python tools/train.py configs/tood/tood_r50_fpn_1x_visdrone.py > tood-visdrone.log 2>&1 & tail -f tood-visdrone.log
# python tools/test.py configs/tood/tood_r50_fpn_1x_visdrone.py work_dirs/tood_r50_fpn_1x_visdrone/epoch_12.pth --show --show-dir test_save
# python tools/test.py configs/tood/tood_r50_fpn_1x_visdrone.py work_dirs/tood_r50_fpn_1x_visdrone/epoch_12.pth --tta
================================================
FILE: mmdet-course/config/yolox_tiny_8xb8-300e_visdrone.py
================================================
# VisDrone variant of the YOLOX-tiny 300-epoch config. Unlike the sibling VisDrone
# configs, the pipelines, datasets, dataloaders and evaluators are spelled out in
# full below instead of only overriding a few keys of the `_base_` COCO config.
_base_ = './yolox_tiny_8xb8-300e_coco.py'
# We also need to change num_classes in the head to match the number of classes in the dataset
model = dict(
bbox_head=dict(
num_classes=10
)
)
# Dataset-related configuration overrides
# dataset settings
# NOTE: absolute path on the author's machine — point it at your own dataset root.
data_root = '/home/hjj/Desktop/dataset/dataset_visdrone/'
dataset_type = 'CocoDataset'
metainfo = {
'classes': ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor'),
# 'palette': [
# (220, 20, 60),
# ]
}
# Example to use different file client
# Method 1: simply set the data root and let the file I/O module
# automatically infer from prefix (not support LMDB and Memcache yet)
# data_root = 's3://openmmlab/datasets/detection/coco/'
# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
# backend_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/': 's3://openmmlab/datasets/detection/',
# 'data/': 's3://openmmlab/datasets/detection/'
# }))
# None: no explicit storage backend is configured; the value is passed to the loaders/evaluators below.
backend_args = None
# Shared by the Mosaic/MixUp/Resize steps of both pipelines below.
img_scale = (640, 640) # width, height
# Training-time transform chain applied by the MultiImageMixDataset wrapper
# (image/annotation loading happens in the wrapped dataset's own pipeline).
train_pipeline = [
dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
dict(
type='RandomAffine',
scaling_ratio_range=(0.1, 2),
# img_scale is (width, height)
border=(-img_scale[0] // 2, -img_scale[1] // 2)),
dict(
type='MixUp',
img_scale=img_scale,
ratio_range=(0.8, 1.6),
pad_val=114.0),
dict(type='YOLOXHSVRandomAug'),
dict(type='RandomFlip', prob=0.5),
# According to the official implementation, multi-scale
# training is not considered here but in the
# 'mmdet/models/detectors/yolox.py'.
# Resize and Pad are for the last 15 epochs when Mosaic,
# RandomAffine, and MixUp are closed by YOLOXModeSwitchHook.
dict(type='Resize', scale=img_scale, keep_ratio=True),
dict(
type='Pad',
pad_to_square=True,
# If the image is three-channel, the pad value needs
# to be set separately for each channel.
pad_val=dict(img=(114.0, 114.0, 114.0))),
dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
dict(type='PackDetInputs')
]
# Training dataset: the VisDrone train split wrapped so that multi-image
# transforms in `train_pipeline` can draw additional samples.
train_dataset = dict(
# use MultiImageMixDataset wrapper to support mosaic and mixup
type='MultiImageMixDataset',
dataset=dict(
type=dataset_type,
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-train/annotations/train.json',
data_prefix=dict(img='VisDrone2019-DET-train/images/'),
# Inner pipeline only loads the image and its boxes; augmentation is done by the wrapper.
pipeline=[
dict(type='LoadImageFromFile', backend_args=backend_args),
dict(type='LoadAnnotations', with_bbox=True)
],
filter_cfg=dict(filter_empty_gt=False, min_size=32),
backend_args=backend_args),
pipeline=train_pipeline)
# Validation/test transform chain: load, aspect-preserving resize to img_scale,
# pad to a square with value 114, load boxes, then pack with the listed meta keys.
test_pipeline = [
dict(type='LoadImageFromFile', backend_args=backend_args),
dict(type='Resize', scale=img_scale, keep_ratio=True),
dict(
type='Pad',
pad_to_square=True,
pad_val=dict(img=(114.0, 114.0, 114.0))),
dict(type='LoadAnnotations', with_bbox=True),
dict(
type='PackDetInputs',
meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
'scale_factor'))
]
# Dataloaders. Training shuffles the wrapped train dataset; validation and test
# read their splits in order with `test_pipeline` and keep the last partial batch.
train_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=train_dataset)
val_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-val/annotations/val.json',
data_prefix=dict(img='VisDrone2019-DET-val/images/'),
test_mode=True,
pipeline=test_pipeline,
backend_args=backend_args))
test_dataloader = dict(
batch_size=16,
num_workers=8,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
metainfo=metainfo,
ann_file='VisDrone2019-DET-test-dev/annotations/test.json',
data_prefix=dict(img='VisDrone2019-DET-test-dev/images/'),
test_mode=True,
pipeline=test_pipeline,
backend_args=backend_args))
# COCO-style bbox evaluation. The evaluators get the full annotation path
# (data_root is prepended here), unlike the dataset entries above.
val_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'VisDrone2019-DET-val/annotations/val.json',
metric='bbox',
backend_args=backend_args)
test_evaluator = dict(
type='CocoMetric',
ann_file=data_root + 'VisDrone2019-DET-test-dev/annotations/test.json',
metric='bbox',
backend_args=backend_args)
# Logger hook interval set to 200.
default_hooks = dict(logger=dict(type='LoggerHook', interval=200))
# Pretrained COCO checkpoint to initialise from. The key has to be spelled
# `load_from` (as in the sibling VisDrone configs); a misspelled key such as
# `load_form` is not the checkpoint setting and leaves the weights unloaded.
load_from='yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth'
# nohup python tools/train.py configs/yolox/yolox_tiny_8xb8-300e_visdrone.py > yolox-tiny-visdrone.log 2>&1 & tail -f yolox-tiny-visdrone.log
# python tools/test.py configs/yolox/yolox_tiny_8xb8-300e_visdrone.py work_dirs/yolox_tiny_8xb8-300e_visdrone/epoch_300.pth --show --show-dir test_save
# python tools/test.py configs/yolox/yolox_tiny_8xb8-300e_visdrone.py work_dirs/yolox_tiny_8xb8-300e_visdrone/epoch_300.pth --tta
# python tools/analysis_tools/get_flops.py configs/yolox/yolox_tiny_8xb8-300e_visdrone.py
================================================
FILE: mmdet-course/mmdet2yolo.py
================================================
import os, torch, cv2, math, tqdm, time, shutil, argparse, json, pickle
import numpy as np
from prettytable import PrettyTable
def clip_boxes(boxes, shape):
    """Clamp xyxy boxes in place so they lie inside an image.

    Args:
        boxes: torch.Tensor or np.ndarray whose last axis holds
            x1, y1, x2, y2 in its first four entries.
        shape: Image shape as (height, width).

    Returns:
        None. ``boxes`` is modified in place.
    """
    height, width = shape[0], shape[1]

    if not isinstance(boxes, torch.Tensor):
        # NumPy path: clip both x columns together, then both y columns.
        for columns, upper in (([0, 2], width), ([1, 3], height)):
            boxes[..., columns] = boxes[..., columns].clip(0, upper)
        return

    # Torch path: clamp each coordinate column in place (x1, y1, x2, y2).
    for column, upper in enumerate((width, height, width, height)):
        boxes[..., column].clamp_(0, upper)
def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None):
    """Map xyxy boxes from the img1_shape frame back to the img0_shape frame.

    Args:
        img1_shape: (height, width) of the image the boxes currently refer to.
        boxes: Array/tensor whose last axis starts with x1, y1, x2, y2;
            modified in place.
        img0_shape: (height, width) of the target image.
        ratio_pad: Optional pair whose ``[0][0]`` is the gain and whose ``[1]``
            is the (x, y) padding. When omitted both are derived from the shapes.

    Returns:
        The same ``boxes`` object, shifted, rescaled and clipped to img0_shape.
    """
    if ratio_pad is not None:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]
    else:
        # gain = old / new, taking the tighter of the two axes.
        height_ratio = img1_shape[0] / img0_shape[0]
        width_ratio = img1_shape[1] / img0_shape[1]
        gain = min(height_ratio, width_ratio)
        # Half of the leftover space on each axis, ordered (width, height).
        pad_x = (img1_shape[1] - img0_shape[1] * gain) / 2
        pad_y = (img1_shape[0] - img0_shape[0] * gain) / 2
        pad = pad_x, pad_y

    boxes[..., [0, 2]] -= pad[0]  # remove x padding
    boxes[..., [1, 3]] -= pad[1]  # remove y padding
    boxes[..., :4] /= gain
    clip_boxes(boxes, img0_shape)
    return boxes
def box_iou(box1, box2, eps=1e-7):
    """
    Pairwise intersection-over-union between two sets of (x1, y1, x2, y2) boxes.

    Based on https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py

    Args:
        box1 (torch.Tensor): Tensor of shape (N, 4).
        box2 (torch.Tensor): Tensor of shape (M, 4).
        eps (float, optional): Added to the denominator to avoid division by zero.
            Defaults to 1e-7.

    Returns:
        (torch.Tensor): (N, M) tensor where entry [i, j] is IoU(box1[i], box2[j]).
    """
    # Cast to float for accurate IoU values, then add broadcast axes.
    first = box1.float().unsqueeze(1)   # (N, 1, 4)
    second = box2.float().unsqueeze(0)  # (1, M, 4)

    # Split each box into its top-left and bottom-right corner.
    a1, a2 = first.chunk(2, 2)
    b1, b2 = second.chunk(2, 2)

    # Overlap width/height, floored at zero, multiplied into an area.
    inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp_(0).prod(2)

    # IoU = inter / (area1 + area2 - inter)
    union = (a2 - a1).prod(2) + (b2 - b1).prod(2) - inter + eps
    return inter / union
def process_batch(detections, labels, iouv):
    """
    Build the per-IoU-threshold "correct detection" matrix for one image.

    Arguments:
        detections (array[N, 6]), x1, y1, x2, y2, conf, class
        labels (array[M, 5]), class, x1, y1, x2, y2
        iouv: 1-D tensor of IoU thresholds
    Returns:
        correct (bool tensor[N, len(iouv)]) on the device of ``iouv``; entry
        [n, k] is True when detection n is matched to a label at threshold k.
    """
    hits = np.zeros((detections.shape[0], iouv.shape[0])).astype(bool)
    overlaps = box_iou(labels[:, 1:], detections[:, :4])  # (M labels, N detections)
    same_class = labels[:, 0:1] == detections[:, 5]

    for level in range(len(iouv)):
        # Candidate (label, detection) pairs: IoU at or above threshold, classes equal.
        pairs = torch.where((overlaps >= iouv[level]) & same_class)
        n_pairs = pairs[0].shape[0]
        if not n_pairs:
            continue

        pair_iou = overlaps[pairs[0], pairs[1]][:, None]
        # Columns: [label index, detection index, iou]
        matches = torch.cat((torch.stack(pairs, 1), pair_iou), 1).cpu().numpy()

        if n_pairs > 1:
            # Highest IoU first, so the first occurrence kept per detection is its best pair.
            matches = matches[matches[:, 2].argsort()[::-1]]
            matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
            # Then keep a single row per label (first occurrence in the current order).
            matches = matches[np.unique(matches[:, 0], return_index=True)[1]]

        hits[matches[:, 1].astype(int), level] = True

    return torch.tensor(hits, dtype=torch.bool, device=iouv.device)
def smooth(y, f=0.05):
    """Smooth a 1-D sequence with a box (moving-average) filter.

    Args:
        y: 1-D array-like of values; must be non-empty.
        f: Fraction of ``len(y)`` that determines the filter width.

    Returns:
        np.ndarray with the same length as ``y``. The ends are padded with the
        first/last value before filtering so the output is not shortened.
    """
    nf = round(len(y) * f * 2) // 2 + 1  # number of filter elements
    if nf % 2 == 0:
        # The window must be odd: an even window would make the 'valid'
        # convolution below return len(y) + 1 samples instead of len(y).
        nf += 1
    p = np.ones(nf // 2)  # ones padding, scaled by the edge values below
    yp = np.concatenate((p * y[0], y, p * y[-1]), 0)  # y padded
    return np.convolve(yp, np.ones(nf) / nf, mode='valid')  # y-smoothed
def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='.', names=(), eps=1e-16, prefix=''):
    """ Per-class precision, recall, F1 and average precision from matched detections.
    Source: https://github.com/rafaelpadilla/Object-Detection-Metrics.
    # Arguments
        tp: True-positive flags per detection (nparray, n x number of IoU levels).
        conf: Confidence of each detection (nparray).
        pred_cls: Predicted class of each detection (nparray).
        target_cls: Class of every ground-truth object (nparray).
        plot: When True, the precision curve of the first IoU level is also
            interpolated per class (collected locally; nothing is drawn here).
        save_dir, names, prefix: Accepted for signature compatibility; not read here.
        eps: Small constant guarding divisions.
    # Returns
        (tp, fp, p, r, f1, ap, classes): tp/fp/p/r/f1 are per-class values taken at
        the confidence that maximises the smoothed mean F1; ap has shape
        (number of classes, number of IoU levels); classes are the unique target
        classes cast to int.
    """
    # Rank detections by descending confidence.
    order = np.argsort(-conf)
    tp, conf, pred_cls = tp[order], conf[order], pred_cls[order]

    # Classes present in the ground truth and their object counts.
    unique_classes, nt = np.unique(target_cls, return_counts=True)
    nc = unique_classes.shape[0]

    px = np.linspace(0, 1, 1000)  # confidence / recall sampling grid
    py = []  # interpolated precision curves (only filled when plot is set)
    ap = np.zeros((nc, tp.shape[1]))
    p = np.zeros((nc, 1000))
    r = np.zeros((nc, 1000))

    for ci, c in enumerate(unique_classes):
        selected = pred_cls == c
        n_l = nt[ci]  # number of labels
        n_p = selected.sum()  # number of predictions
        if n_p == 0 or n_l == 0:
            continue

        cls_tp = tp[selected]
        cls_conf = conf[selected]

        # Running false-positive / true-positive counts down the ranking.
        fpc = (1 - cls_tp).cumsum(0)
        tpc = cls_tp.cumsum(0)

        # Recall curve, sampled over confidence (negated because xp must increase).
        recall = tpc / (n_l + eps)
        r[ci] = np.interp(-px, -cls_conf, recall[:, 0], left=0)

        # Precision curve, sampled the same way.
        precision = tpc / (tpc + fpc)
        p[ci] = np.interp(-px, -cls_conf, precision[:, 0], left=1)

        # AP for every IoU level.
        for j in range(tp.shape[1]):
            ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j])
            if plot and j == 0:
                py.append(np.interp(px, mrec, mpre))  # precision at mAP@0.5

    # F1 is the harmonic mean of precision and recall.
    f1 = 2 * p * r / (p + r + eps)
    best = smooth(f1.mean(0), 0.1).argmax()  # index of the max smoothed mean F1
    p, r, f1 = p[:, best], r[:, best], f1[:, best]
    tp = (r * nt).round()  # true positives
    fp = (tp / (p + eps) - tp).round()  # false positives
    return tp, fp, p, r, f1, ap, unique_classes.astype(int)
def compute_ap(recall, precision, method='interp'):
    """ Compute the average precision, given the recall and precision curves
    # Arguments
        recall:    The recall curve (list or 1-D array)
        precision: The precision curve (list or 1-D array)
        method:    'interp' (default) integrates a 101-point interpolation of the
                   curve (COCO style); 'continuous' sums the rectangles between
                   the points where recall changes.
    # Returns
        Average precision, precision envelope, recall curve (the last two
        include the sentinel values added at both ends)
    # Raises
        ValueError: if method is not 'interp' or 'continuous'.
    """
    # Append sentinel values to beginning and end
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([1.0], precision, [0.0]))

    # Precision envelope: running maximum taken from the right-hand side,
    # which makes precision non-increasing along recall.
    mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))

    # Integrate area under curve
    if method == 'interp':
        x = np.linspace(0, 1, 101)  # 101-point interp (COCO)
        # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
        # use the new name when it exists and fall back on older NumPy.
        trapezoid = getattr(np, 'trapezoid', None) or np.trapz
        ap = trapezoid(np.interp(x, mrec, mpre), x)  # integrate
    elif method == 'continuous':
        i = np.where(mrec[1:] != mrec[:-1])[0]  # points where x axis (recall) changes
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])  # area under curve
    else:
        raise ValueError(f"method must be 'interp' or 'continuous', got {method!r}")
    return ap, mpre, mrec
def parse_opt():
    """Parse the command-line options of this script.

    Unrecognised arguments are ignored (``parse_known_args``), so the function
    can also be called from environments that inject their own argv entries.

    Returns:
        argparse.Namespace with ``label_coco``, ``pred_coco``, ``iou`` and ``conf``.
    """
    # (flag, type, default, help) for every supported option.
    # An alternative --pred_coco value is a prediction.pickle produced by an
    # mmdetection test run instead of a COCO-format predictions JSON.
    option_table = (
        ('--label_coco', str, '/home/hjj/Desktop/dataset/dataset_visdrone/test_coco.json', 'label coco path'),
        ('--pred_coco', str, 'runs/val/exp/predictions.json', 'pred coco path'),
        ('--iou', float, 0.7, 'iou threshold'),
        ('--conf', float, 0.001, 'conf threshold'),
    )

    parser = argparse.ArgumentParser()
    for flag, value_type, default_value, help_text in option_table:
        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)

    known_args, _unknown_args = parser.parse_known_args()
    return known_args
def _coco_xywh_to_xyxy(bbox):
    """Convert a COCO [x, y, w, h] box into (x_min, y_min, x_max, y_max)."""
    x_min, y_min, w, h = bbox[0], bbox[1], bbox[2], bbox[3]
    return x_min, y_min, x_min + w, y_min + h


def _load_ground_truth(label_coco_json_path):
    """Read a COCO annotation file.

    Returns:
        names_by_id: dict mapping category id -> category name.
        image_ids: image ids in file order, without duplicates.
        labels_by_image: dict mapping image id -> list of
            np.array([class, x_min, y_min, x_max, y_max]).
    """
    with open(label_coco_json_path) as f:
        label = json.load(f)

    # Fall back to the list position when a category entry carries no 'id'.
    names_by_id = {int(cat.get('id', idx)): cat['name'] for idx, cat in enumerate(label['categories'])}
    image_ids = list(dict.fromkeys(img['id'] for img in label['images']))

    labels_by_image = {}
    for data in tqdm.tqdm(label['annotations'], desc='Process label...'):
        x_min, y_min, x_max, y_max = _coco_xywh_to_xyxy(data['bbox'])
        labels_by_image.setdefault(data['image_id'], []).append(
            np.array([int(data['category_id']), x_min, y_min, x_max, y_max]))
    return names_by_id, image_ids, labels_by_image


def _load_predictions(pred_coco_json_path):
    """Read predictions from a COCO results JSON or an mmdetection result pickle.

    Returns:
        dict mapping image id -> list of
        np.array([x_min, y_min, x_max, y_max, score, class]).
    """
    preds_by_image = {}
    if pred_coco_json_path.endswith('json'):
        with open(pred_coco_json_path) as f:
            pred = json.load(f)
        for data in tqdm.tqdm(pred, desc='Process pred...'):
            x_min, y_min, x_max, y_max = _coco_xywh_to_xyxy(data['bbox'])
            preds_by_image.setdefault(data['image_id'], []).append(
                np.array([x_min, y_min, x_max, y_max, float(data['score']), int(data['category_id'])]))
        return preds_by_image

    # NOTE(security): pickle.load can execute arbitrary code; only pass files
    # produced by your own, trusted test runs.
    with open(pred_coco_json_path, 'rb') as f:
        pred = pickle.load(f)
    for data in tqdm.tqdm(pred, desc='Process pred...'):
        # Images are keyed by file name without extension in this branch.
        image_id = os.path.splitext(os.path.basename(data['img_path']))[0]
        rows = preds_by_image.setdefault(image_id, [])
        instances = data['pred_instances']
        for i in range(instances['labels'].size(0)):
            # Boxes are used exactly as stored (already xyxy); no
            # scale_factor correction is applied.
            x_min, y_min, x_max, y_max = instances['bboxes'][i].cpu().detach().numpy()
            rows.append(np.array([x_min, y_min, x_max, y_max,
                                  float(instances['scores'][i]), int(instances['labels'][i])]))
    return preds_by_image


def main(opt):
    """Compute YOLO-style precision / recall / mAP from COCO labels and predictions.

    Args:
        opt: Namespace providing ``label_coco`` and ``pred_coco`` paths.
            NOTE: ``opt.iou`` and ``opt.conf`` are parsed but not used by the
            evaluation below.
    """
    iouv = torch.linspace(0.5, 0.95, 10)  # iou vector for mAP@0.5:0.95
    niou = iouv.numel()

    names_by_id, image_ids, labels_by_image = _load_ground_truth(opt.label_coco)
    preds_by_image = _load_predictions(opt.pred_coco)

    stats = []
    for image_id in tqdm.tqdm(image_ids, desc="Cal mAP..."):
        # Images without annotations (or predictions) yield empty (0, k)
        # arrays instead of raising KeyError.
        label = np.array(labels_by_image.get(image_id, []), dtype=np.float64).reshape(-1, 5)
        pred = torch.from_numpy(
            np.array(preds_by_image.get(image_id, []), dtype=np.float64).reshape(-1, 6))
        nl, npr = label.shape[0], pred.shape[0]
        target_cls = torch.from_numpy(label[:, 0])

        correct = torch.zeros(npr, niou, dtype=torch.bool)
        if npr == 0:
            if nl:
                # Labels without any prediction still count towards recall.
                stats.append((correct, *torch.zeros((2, 0)), target_cls))
            continue
        if nl:
            correct = process_batch(pred, torch.from_numpy(label), iouv)
        # With no labels every prediction stays a false positive.
        stats.append((correct, pred[:, 4], pred[:, 5], target_cls))

    if not stats:
        print('No labels or predictions found; nothing to evaluate.')
        return

    stats = [torch.cat(x, 0).cpu().numpy() for x in zip(*stats)]
    tp, fp, p, r, f1, ap, ap_class = ap_per_class(*stats)
    print(f'precision:{p}')
    print(f'recall:{r}')
    print(f'mAP@0.5:{ap[:, 0]}')

    table = PrettyTable()
    table.title = "Metrice"
    table.field_names = ["Classes", 'Precision', 'Recall', 'mAP50', 'mAP50-95']
    table.add_row(['all', f'{np.mean(p):.3f}', f'{np.mean(r):.3f}', f'{np.mean(ap[:, 0]):.3f}', f'{np.mean(ap):.3f}'])
    # Rows of p / r / ap follow ap_class (the classes present in the labels),
    # so look names up by category id rather than by list position.
    for row, cls_id in enumerate(ap_class):
        cls_name = names_by_id.get(int(cls_id), str(int(cls_id)))
        table.add_row([cls_name, f'{p[row]:.3f}', f'{r[row]:.3f}', f'{ap[row, 0]:.3f}', f'{ap[row, :].mean():.3f}'])
    print(table)


if __name__ == '__main__':
    main(parse_opt())
================================================
FILE: mmdet-course/readme.md
================================================
# mmdet使用教程
### mmdet教程命令
1. conda create -n mmdet_py39 python=3.9 anaconda
2. https://mmdetection.readthedocs.io/en/latest/get_started.html
3. https://pytorch.org/get-started/previous-versions/
pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121
4. https://mmdetection.readthedocs.io/zh-cn/latest/user_guides/train.html#id7
### mmdet运行命令
1. 训练
python tools/train.py
2. 测试
python tools/test.py --out
3. 计算量、参数量计算脚本
python tools/analysis_tools/get_flops.py
4. 推理时间、fps、gpu memory计算脚本
python tools/analysis_tools/benchmark.py --checkpoint --task inference --fuse-conv-bn
5. 绘制曲线图脚本
python tools/analysis_tools/analyze_logs.py plot_curve --keys --legend